In [None]:
# Setup steps:
# 1. Go to https://www.kaggle.com/datasets/berkeleyearth/climate-change-earth-surface-temperature-data?select=GlobalLandTemperaturesByCity.csv
# 2. Download the dataset file GlobalLandTemperaturesByCity.csv.
# 3. Click the file icon in the left sidebar and upload GlobalLandTemperaturesByCity.csv
#    so that it sits in the notebook's working directory (the next cell reads it by its bare file name).

In [None]:
import pandas as pd
# Load the raw per-city temperature table, discard the columns this analysis
# does not use, and drop every row that still has a missing value.
df = (
    pd.read_csv("GlobalLandTemperaturesByCity.csv")
    .drop(columns=["AverageTemperatureUncertainty", "Latitude", "Longitude"])
    .dropna()
)
df

Unnamed: 0,dt,AverageTemperature,City,Country
0,1743-11-01,6.068,Århus,Denmark
5,1744-04-01,5.788,Århus,Denmark
6,1744-05-01,10.644,Århus,Denmark
7,1744-06-01,14.051,Århus,Denmark
8,1744-07-01,16.082,Århus,Denmark
...,...,...,...,...
8599206,2013-04-01,7.710,Zwolle,Netherlands
8599207,2013-05-01,11.464,Zwolle,Netherlands
8599208,2013-06-01,15.043,Zwolle,Netherlands
8599209,2013-07-01,18.775,Zwolle,Netherlands


In [None]:
# Break the "YYYY-MM-DD" string in `dt` into three separate columns.
date_parts = ["Year", "Month", "Day"]
df[date_parts] = df["dt"].str.split("-", expand=True)

# The original date string is no longer needed once it has been split.
df = df.drop(columns="dt")

# str.split yields strings, so cast each date part to a numeric dtype.
df[date_parts] = df[date_parts].apply(pd.to_numeric)
df

Unnamed: 0,AverageTemperature,City,Country,Year,Month,Day
0,6.068,Århus,Denmark,1743,11,1
5,5.788,Århus,Denmark,1744,4,1
6,10.644,Århus,Denmark,1744,5,1
7,14.051,Århus,Denmark,1744,6,1
8,16.082,Århus,Denmark,1744,7,1
...,...,...,...,...,...,...
8599206,7.710,Zwolle,Netherlands,2013,4,1
8599207,11.464,Zwolle,Netherlands,2013,5,1
8599208,15.043,Zwolle,Netherlands,2013,6,1
8599209,18.775,Zwolle,Netherlands,2013,7,1


In [None]:
# Keep only the records from FIRST_YEAR onward (not a single year): every row
# whose Year is greater than or equal to FIRST_YEAR is retained.
FIRST_YEAR = 2000
df = df.loc[df["Year"] >= FIRST_YEAR]
df

Unnamed: 0,AverageTemperature,City,Country,Year,Month,Day
3074,3.065,Århus,Denmark,2000,1,1
3075,3.724,Århus,Denmark,2000,2,1
3076,3.976,Århus,Denmark,2000,3,1
3077,8.321,Århus,Denmark,2000,4,1
3078,13.567,Århus,Denmark,2000,5,1
...,...,...,...,...,...,...
8599206,7.710,Zwolle,Netherlands,2013,4,1
8599207,11.464,Zwolle,Netherlands,2013,5,1
8599208,15.043,Zwolle,Netherlands,2013,6,1
8599209,18.775,Zwolle,Netherlands,2013,7,1


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

#DATA PREPROCESSING:

# Split the data into features (X) and target (y)
X = df.drop(["AverageTemperature"], axis="columns")
y = df["AverageTemperature"]

# Turn categorical columns "City" and "Country" into label-encoded columns.
# This is done on the full feature table BEFORE splitting so that a given city
# or country maps to the same integer in both the training and the test set.
# (Re-fitting the encoder on the test set, as was done previously, can assign
# different codes to the same label.) `apply` calls fit_transform once per
# column, so after this line `encoder` holds the classes of the last column.
encoder = LabelEncoder()
X[['City', 'Country']] = X[['City', 'Country']].apply(encoder.fit_transform)

# Split the data into training and test sets.
# A fixed random_state makes the split (and every score below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler.
# The scaler learns mean/std from the training set only; the test set is then
# transformed with those same statistics so both sets share one scale and no
# information from the test set influences preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# reduce dimensionality with PCA (not needed since we dont use one-hot encoding)
# NOTE: if this is ever enabled, fit PCA on the training set only and reuse it
# for the test set, exactly like the scaler above.
# pca = PCA(n_components=100)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)

pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4
0,0.444141,-0.121564,1.683249,0.464478,0.0
1,0.200177,1.578308,-0.088669,0.755788,0.0
2,0.360811,1.092631,-1.607455,-0.700763,0.0
3,-1.207384,-0.541013,1.430118,-0.409453,0.0
4,0.681077,-1.666902,-0.848062,0.173168,0.0


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
# MODEL BUILDING:

# Candidate regressors, keyed by the label used in the printed report.
models = {
  "Linear" : LinearRegression(),
  "K Nearest Neighbors": KNeighborsRegressor(),
  # Cross-validated search over n_neighbors in {1, 5, 9, 13, 17, 21}.
  "KNN with GridSearchCV": GridSearchCV(KNeighborsRegressor(), {"n_neighbors":range(1,25,4)}, cv=5, scoring="neg_mean_squared_error"),
  # "Support Vector Machine": SVR(),
  # Fixed seed so the forest (and its scores) are reproducible between runs.
  "Random Forest": RandomForestRegressor(random_state=42)
}
for model_name, model in models.items():
  # fit the model with the training data
  model.fit(X_train, y_train)

  # make model predictions with testing data
  y_pred = model.predict(X_test)

  # compare the test predictions against the real y data
  mse = mean_squared_error(y_test, y_pred)
  # RMSE is the square root of the MSE. The `squared=False` argument of
  # mean_squared_error is deprecated/removed in recent scikit-learn releases,
  # so derive it directly; this works on every version.
  rmse = mse ** 0.5
  r2 = r2_score(y_test, y_pred)
  print("Stats for model: ", model_name)
  print("Mean Squared Error:", mse)
  print("Root Mean Squared Error", rmse)
  print("R2 Score", r2)
  # Check the estimator type rather than the display name, so renaming an
  # entry in `models` cannot silently skip (or wrongly trigger) this report.
  if isinstance(model, GridSearchCV):
    # Print best params for gridsearch model
    print("GridSearchCV Best Estimator: ", model.best_estimator_)
    print("GridSearchCV Best Params: ", model.best_params_)
  print("\n")

Stats for model:  Linear
Mean Squared Error: 96.40387665808774
Root Mean Squared Error 9.818547583939681
R2 Score 0.02370323146976494


Stats for model:  K Nearest Neighbors
Mean Squared Error: 44.7134166657624
Root Mean Squared Error 6.6868091542799695
R2 Score 0.5471804069087973


Stats for model:  KNN with GridSearchCV
Mean Squared Error: 44.40069486254846
Root Mean Squared Error 6.663384640147112
R2 Score 0.5503473883260441
GridSearchCV Best Estimator:  KNeighborsRegressor(n_neighbors=9)
GridSearchCV Best Params:  {'n_neighbors': 9}


