In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import warnings
from flaml import AutoML
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv('data/GlobalTemperatures.csv', parse_dates=['dt'])
df.dropna(inplace=True)
df.head()

Unnamed: 0,dt,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty,LandAverageTemperature
1200,1850-01-01,1.105,8.242,1.738,-3.206,2.822,12.833,0.367,0.749
1201,1850-02-01,1.275,9.97,3.007,-2.291,1.623,13.588,0.414,3.071
1202,1850-03-01,0.955,10.347,2.401,-1.905,1.41,14.043,0.341,4.954
1203,1850-04-01,0.665,12.934,1.004,1.018,1.329,14.667,0.267,7.217
1204,1850-05-01,0.617,15.655,2.406,3.811,1.347,15.507,0.249,10.004


In [4]:
df['year'] = df['dt'].dt.year
df['month'] = df['dt'].dt.month
df['date_ordinal'] = df['dt'].map(datetime.toordinal)


In [5]:
X = df[['year', 'month', 'date_ordinal']]
y = df['LandAverageTemperature']

In [6]:
df.head()

Unnamed: 0,dt,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty,LandAverageTemperature,year,month,date_ordinal
1200,1850-01-01,1.105,8.242,1.738,-3.206,2.822,12.833,0.367,0.749,1850,1,675334
1201,1850-02-01,1.275,9.97,3.007,-2.291,1.623,13.588,0.414,3.071,1850,2,675365
1202,1850-03-01,0.955,10.347,2.401,-1.905,1.41,14.043,0.341,4.954,1850,3,675393
1203,1850-04-01,0.665,12.934,1.004,1.018,1.329,14.667,0.267,7.217,1850,4,675424
1204,1850-05-01,0.617,15.655,2.406,3.811,1.347,15.507,0.249,10.004,1850,5,675454


In [7]:
# Determine split index
split_index = int(len(X) * 0.8)

# Train set: First 80%
X_train, y_train, X_test, y_test = X[:split_index], y[:split_index], X[split_index:], y[split_index:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 1542, Test size: 386


In [8]:
model = AutoML()
model.fit(X_train, y_train, task='regression', time_budget=80)

[flaml.automl.logger: 02-12 11:40:34] {1728} INFO - task = regression
[flaml.automl.logger: 02-12 11:40:34] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 02-12 11:40:34] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 02-12 11:40:34] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 02-12 11:40:34] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 02-12 11:40:34] {2393} INFO - Estimated sufficient time budget=3411s. Estimated necessary time budget=30s.
[flaml.automl.logger: 02-12 11:40:34] {2442} INFO -  at 0.5s,	estimator lgbm's best error=0.4979,	best estimator lgbm's best error=0.4979
[flaml.automl.logger: 02-12 11:40:34] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 02-12 11:40:34] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.4979,	best estimator lgbm's best error=0.4979
[flaml.automl.logger: 02-12 11:40

In [9]:
print(model.best_estimator)

catboost


In [10]:
y_pred = model.predict(X_test)

In [11]:
y_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {y_mse}")

Mean Squared Error: 0.34929684383816506


In [12]:
def predict_temperature(n):
    dates = pd.date_range(start='2010-09-01', periods=n, freq='M')
    forecast_df = pd.DataFrame({'year': dates.year, 'month': dates.month, 'date_ordinal': dates.map(datetime.toordinal)})
    return model.predict(forecast_df)

In [37]:
test = predict_temperature(12)

  dates = pd.date_range(start='2010-09-01', periods=n, freq='M')


In [38]:
test

array([12.02548021,  9.38097354,  6.12567443,  3.85735838,  2.78716722,
        3.39856327,  5.44138532,  8.65420658, 11.44839509, 13.51888581,
       14.29647984, 13.86593159])