In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
from math import sqrt
from pandas import date_range
from collections import deque
import warnings

In [3]:
warnings.filterwarnings("ignore")

In [4]:
def wrangle(path):
    df = pd.read_csv(path, parse_dates=['dt'])
    df['Year'] = pd.to_datetime(df['dt']).dt.year
    df['Month'] = pd.to_datetime(df['dt']).dt.month
    df.set_index('dt', inplace=True)
    df.dropna(inplace=True)

    cols = ['LandAndOceanAverageTemperature', 'LandAndOceanAverageTemperatureUncertainty',
             'LandMaxTemperatureUncertainty', 'LandMinTemperatureUncertainty',
             'LandAverageTemperatureUncertainty']
    df.drop(columns=cols, inplace=True) 


    df['lag_1'] = df['LandAverageTemperature'].shift(1)
    df['lag_2'] = df['LandAverageTemperature'].shift(2)
    df['lag_3'] = df['LandAverageTemperature'].shift(3)
    df.dropna(inplace=True)
    return df

In [5]:
df = wrangle('data/GlobalTemperatures.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1925 entries, 1850-04-01 to 2010-08-01
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   LandMaxTemperature      1925 non-null   float64
 1   LandMinTemperature      1925 non-null   float64
 2   LandAverageTemperature  1925 non-null   float64
 3   Year                    1925 non-null   int32  
 4   Month                   1925 non-null   int32  
 5   lag_1                   1925 non-null   float64
 6   lag_2                   1925 non-null   float64
 7   lag_3                   1925 non-null   float64
dtypes: float64(6), int32(2)
memory usage: 120.3 KB


In [7]:
X = df.drop(columns='LandAverageTemperature')
y = df['LandAverageTemperature']

In [8]:
model = RandomForestRegressor()
model.fit(X, y)

In [9]:
feat_imp = pd.Series(model.feature_importances_, index=X.columns)
print(feat_imp.sort_values(ascending=False))

LandMinTemperature    0.837283
LandMaxTemperature    0.159060
Year                  0.001421
lag_1                 0.000704
lag_3                 0.000696
lag_2                 0.000574
Month                 0.000261
dtype: float64


In [10]:
tscv = TimeSeriesSplit(n_splits=10)
fold_lr = 0
rmse_scores = []

for train_index, test_index in tscv.split(X):
    fold_lr+=1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_lr = LinearRegression()
    model_lr.fit(X_train, y_train)

    y_pred =  model_lr.predict(X_test)

    rmse = sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

print("Average RMSE over all folds:", np.mean(rmse_scores))


Average RMSE over all folds: 0.24076727149910088


In [11]:
fold_rf = 0
rf_rmse = []

for train_index, test_index in tscv.split(X):
    fold_lr+=1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
    model_rf.fit(X_train, y_train)
    predictions = model_rf.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    rf_rmse.append(rmse)

print(f"Random Forest RMSE:", np.mean(rf_rmse))

Random Forest RMSE: 0.20822085364478138


In [12]:
fold_gb = 0
gb_rmse = []

for train_index, test_index in tscv.split(X):
    fold_gb+=1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    model_gb.fit(X_train, y_train)
    predictions = model_gb.predict(X_test)
    
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    gb_rmse.append(rmse)

print(f"Gradient Boosting RMSE:", np.mean(gb_rmse))

Gradient Boosting RMSE: 0.2531111457158288


In [13]:
rf_rmse

[0.5189945814657958,
 0.3569962861327439,
 0.20635578086263673,
 0.16547918313966772,
 0.20963621344878217,
 0.10781560727464233,
 0.12365281337218836,
 0.10782598027774738,
 0.1466623810135777,
 0.13878970946003172]

In [14]:
importances = model_rf.feature_importances_
features = X.columns

pd.Series(importances, index=features)

LandMaxTemperature    0.139159
LandMinTemperature    0.856496
Year                  0.001402
Month                 0.000371
lag_1                 0.000848
lag_2                 0.000616
lag_3                 0.001108
dtype: float64

In [15]:
Xx = df[['Month']]
y_max = df['LandMaxTemperature']
y_min = df['LandMinTemperature']

In [16]:
model_max = LinearRegression().fit(Xx, y_max)
model_min = LinearRegression().fit(Xx, y_min)

In [51]:
def predict_temperatures(n):
    # Create a monthly date range (each date is the start of the month) starting from September 2010.
    dates = pd.date_range(start='2010-09-01', periods=n, freq='MS')
    
    # Build a DataFrame with the basic features: Year, Month.
    forecast_df = pd.DataFrame({
        'Year': [d.year for d in dates],
        'Month': [d.month for d in dates]
    })
    
    # Predict LandMaxTemperature and LandMinTemperature for each forecast row.
    forecast_df['LandMaxTemperature'] = model_max.predict(forecast_df[['Month']])
    forecast_df['LandMinTemperature'] = model_min.predict(forecast_df[['Month']])
    
    # Add empty columns for the lags.
    forecast_df['lag_1'] = None
    forecast_df['lag_2'] = None
    forecast_df['lag_3'] = None

    # Set the first row’s lag features using historical values from your dataset.
    # (For example, taking the last observed values from df.)
    forecast_df.loc[0, 'lag_1'] = df['LandAverageTemperature'].iloc[-1]
    forecast_df.loc[0, 'lag_2'] = df['lag_1'].iloc[-1]
    forecast_df.loc[0, 'lag_3'] = df['lag_2'].iloc[-1]
    
    # Prepare a list to store your final predictions.
    predictions = []
    
    # The order of features expected by your final model.
    feature_order = ['LandMaxTemperature', 'LandMinTemperature', 'Year', 'Month', 'lag_1', 'lag_2', 'lag_3']
    
    # Iterate through the forecast rows.
    for i in range(len(forecast_df)):
        # Retrieve the current row’s feature values.
        row = forecast_df.iloc[i]
        current_features = [row[col] for col in feature_order]
        
        # Make sure all feature values are numeric (if not, convert as needed).
        # For example, if the lags are stored as None, you might need to handle that.
        # (Here we assume that for i == 0 the values are set from df, and for i > 0 they will be updated below.)
        
        # Predict the temperature for this period using your final model.
        pred_temp = model_lr.predict([current_features])[0]
        predictions.append(pred_temp)
        
        # Update the lag features for the *next* forecast row (if it exists) using the actual prediction.
        if i + 1 < len(forecast_df):
            # For a simple autoregressive scheme:
            # - Next row’s lag_1 will be the current period’s prediction.
            # - Next row’s lag_2 will be the current period’s lag_1.
            # - Next row’s lag_3 will be the current period’s lag_2.
            forecast_df.loc[i+1, 'lag_1'] = pred_temp
            forecast_df.loc[i+1, 'lag_2'] = row['lag_1']
            forecast_df.loc[i+1, 'lag_3'] = row['lag_2']
    
    # Optionally, you can add the predictions as a new column in your forecast DataFrame.
    forecast_df['PredictedTemperature'] = predictions
    
    return predictions, forecast_df


In [52]:
forecast_df, predictions = predict_temperatures(20)

In [53]:
forecast_df

[10.133280168756244,
 8.794188551015676,
 8.611397614015948,
 9.548982432836947,
 7.498897012495614,
 7.145598738655053,
 7.076670637185231,
 7.6289880257631815,
 8.09100664373671,
 8.487480577457674,
 8.757190519144494,
 9.001936412956107,
 9.247933694632959,
 9.515328206489762,
 9.793568926399185,
 10.075304515689586,
 7.558912034965109,
 6.972751361765139,
 6.93500119577182,
 7.57187140006668]

In [54]:
predictions

Unnamed: 0,Year,Month,LandMaxTemperature,LandMinTemperature,lag_1,lag_2,lag_3,PredictedTemperature
0,2010,9,15.090498,3.573815,14.768,15.213,14.421,10.13328
1,2010,10,15.396142,3.917363,10.13328,14.768,15.213,8.794189
2,2010,11,15.701787,4.26091,8.794189,10.13328,14.768,8.611398
3,2010,12,16.007431,4.604457,8.611398,8.794189,10.13328,9.548982
4,2011,1,12.645344,0.825438,9.548982,8.611398,8.794189,7.498897
5,2011,2,12.950989,1.168985,7.498897,9.548982,8.611398,7.145599
6,2011,3,13.256633,1.512532,7.145599,7.498897,9.548982,7.076671
7,2011,4,13.562277,1.85608,7.076671,7.145599,7.498897,7.628988
8,2011,5,13.867921,2.199627,7.628988,7.076671,7.145599,8.091007
9,2011,6,14.173565,2.543174,8.091007,7.628988,7.076671,8.487481
