In [None]:
import pandas as pd
import numpy as np
import os
import math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from seaborn import heatmap

In [1]:
from math import sqrt

In [None]:



from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import statsmodels.api as sm
from statsmodels.tsa.api import Holt

import warnings
warnings.filterwarnings("ignore")

# acquire

In [None]:
df = pd.read_csv('GlobalLandTemperaturesByMajorCity.csv')

# prepare

In [None]:
df.head(1)

In [None]:
# normalise every column label to lower case
df.columns = df.columns.str.lower()

In [None]:
# Rename to short names. The column labels are already lower case at this
# point, and DataFrame.rename silently ignores keys that do not match, so
# every key here must be lower case too.
df = df.rename(columns={
    'dt': 'datetime',
    'averagetemperature': 'avg_temp',
    'averagetemperatureuncertainty': 'uncertainty',
})
# Drop the location columns; rows are selected by city name later on.
df = df.drop(columns=['latitude', 'longitude', 'country'])

In [None]:
df

In [None]:
# parse the date strings into timestamps
df['datetime'] = pd.to_datetime(df['datetime'])

In [None]:
# Use the timestamp as the index and order the rows chronologically.
# 'datetime' is the index name at this point; sort_values accepts it in `by`.
df = df.set_index('datetime').sort_values(by = 'datetime')

In [None]:
# keep only the Chicago rows; the city label is redundant afterwards
dfc = df.loc[df['city'] == 'Chicago'].drop(columns=['city'])

In [None]:
dfc.isnull().value_counts()

In [None]:
dfc.shape

In [None]:
# distinct timestamps in the full frame, and the calendar span they cover
first_day, last_day = df.index.min(), df.index.max()
print('Number of rows:', df.index.nunique())
n_days = last_day - first_day + pd.Timedelta('1d')
print("Number of days between first and last day:", n_days)


In [None]:
dfc.index.max()

In [None]:
dfc.index.min()

In [None]:
# dfc = dfc.dropna()

<div class="alert alert-block alert-info">
<b>Takeaways:</b> I dropped 98 nulls. I think I want to forecast the next two weeks of temperatures, using data from 1980 to 2013, with 2013 being my test data.
</div>


In [None]:
# keep observations from 1960 onwards (label-based slice on the datetime index)
dfc = dfc.loc['1960':]

In [None]:
# distinct timestamps in the Chicago frame, and the calendar span they cover
span = dfc.index.max() - dfc.index.min()
print('Number of rows:', dfc.index.nunique())
n_days = span + pd.Timedelta('1d')
print("Number of days between first and last day:", n_days)


In [None]:
dfc

In [None]:
# short names for the two measured series (keys that are absent are ignored)
dfc = dfc.rename(columns={
    'averagetemperature': 'avg_temp',
    'averagetemperatureuncertainty': 'uncertainty',
})

In [None]:
dfc.plot(y = "avg_temp")
dfc.plot(y = "uncertainty")

<div class="alert alert-block alert-info">
<b>Takeaways:</b> It looks like `avg_temp` and `uncertainty` are both rising gradually.
</div>

In [None]:
# re-check the row count and date span
oldest, newest = dfc.index.min(), dfc.index.max()
print('Number of rows:', dfc.index.nunique())
n_days = newest - oldest + pd.Timedelta('1d')
print("Number of days between first and last day:", n_days)

In [None]:
dfc.index.max(),dfc.index.min()

# percentage-based split

In [None]:
# 50% train / 30% validate / remainder test, taken in row order
n_obs = len(dfc)
train_size = int(n_obs * .5)
validate_size = int(n_obs * .3)
test_size = n_obs - train_size - validate_size
validate_end_index = train_size + validate_size

# positional slices: [0, train_size), [train_size, validate_end_index), rest
train = dfc.iloc[:train_size]
validate = dfc.iloc[train_size:validate_end_index]
test = dfc.iloc[validate_end_index:]

In [None]:
train

In [None]:
# one figure per column, with the three splits drawn on the same axes
for col in train.columns:
    fig, ax = plt.subplots(figsize=(12, 4))
    for part in (train, validate, test):
        ax.plot(part[col])
    ax.set_ylabel(col)
    ax.set_title(col)
    plt.show()

# visuals

In [None]:
y = train['avg_temp']

In [None]:
y.plot.hist()


<div class="alert alert-block alert-info">
<b>Takeaways:</b> There are a lot of cold days: temperatures around 25 °C (roughly 75 °F) occur on far fewer occasions than temperatures around 5 °C.
</div>

In [None]:
# DatetimeIndex.week was removed in pandas 2.0; isocalendar().week is the
# supported way to get the ISO week number of each timestamp.
iso_week = y.index.isocalendar().week
ax = y.groupby(iso_week).mean().plot.bar(width = .7)
ax.set(title='Average Temperature by Week In Chicago', xlabel='week', ylabel='Temp (C)')
None

In [None]:
# mean temperature per calendar month (1-12) across all training years
monthly_mean = y.groupby(y.index.month).mean()
ax = monthly_mean.plot.bar(width=.7)
ax.set(title='Average Temperature by Month In Chicago', xlabel='Month', ylabel='Temp (C)')
None



# over time

In [None]:
y.plot()

In [None]:
# weekday ('%a') and month ('%b') abbreviations derived from the index
dfc = dfc.assign(
    wok=dfc.index.strftime('%a'),
    mon=dfc.index.strftime('%b'),
)

## resample

In [None]:
y

In [None]:
train.resample('m').mean().plot(title='1 Month Average')


In [None]:
# The bins are three months wide, so the title has to say so
# (it previously read "2 week average").
y.resample('3M').mean().plot(title='3 Month Average')
None

In [None]:
y.resample('M').mean().plot(label='Montly')
y.resample('Y').mean().plot(label='Yearly')
plt.legend()
None

In [None]:
# smooth the monthly means with 3- and 6-period rolling windows
monthly = y.resample('M').mean()
for window, alpha in ((3, .5), (6, .8)):
    monthly.rolling(window).mean().plot(alpha=alpha, label=f'{window} Month')
plt.title('Rolling Monthly Average')
plt.ylabel('Temp')
plt.legend()

None


# Last Observed Values

In [None]:
train

In [None]:
# Last observed training value of each series. .iloc makes the positional
# lookup explicit; an integer key on a datetime-indexed Series is deprecated.
avg_temp = train['avg_temp'].iloc[-1]
uncertainty = train['uncertainty'].iloc[-1]

# functions to help

In [None]:
# Create the empty dataframe
eval_df = pd.DataFrame(columns=['model_type', 'target_var', 'rmse'])


In [None]:
# evaluation function to compute rmse
def evaluate(target_var, decimals=2):
    """Return the RMSE of the current predictions against the validate split.

    Reads the notebook-level ``validate`` and ``yhat_df`` frames.

    Parameters
    ----------
    target_var : column label present in both ``validate`` and ``yhat_df``.
    decimals : int, default 2
        Decimal places kept in the result. Rounding to whole numbers (the
        previous behaviour, still available with ``decimals=0``) turns every
        sub-unit error into 0 and makes models indistinguishable.

    Returns
    -------
    float
        Root mean squared error, rounded to ``decimals`` places.
    """
    mse = mean_squared_error(validate[target_var], yhat_df[target_var])
    return round(sqrt(mse), decimals)


In [None]:
def plot_and_eval(target_var):
    """Plot train, validate and predicted values for one target and print its RMSE.

    Reads the notebook-level ``train``, ``validate`` and ``yhat_df`` frames and
    calls ``evaluate`` for the score.

    Parameters
    ----------
    target_var : column label present in all three frames.
    """
    plt.figure(figsize = (12,4))
    plt.plot(train[target_var], label = 'Train', linewidth = 1)
    plt.plot(validate[target_var], label = 'Validate', linewidth = 1)
    # labelled so the forecast line appears in the legend
    plt.plot(yhat_df[target_var], label = 'Prediction')
    plt.title(target_var)
    plt.legend()
    rmse = evaluate(target_var)
    # two decimals so sub-unit errors are not printed as 0
    print(target_var, '-- RMSE: {:.2f}'.format(rmse))
    plt.show()

In [None]:
def append_eval_df(model_type, target_var):
    """Return ``eval_df`` with one extra row holding this model's RMSE.

    Reads the notebook-level ``eval_df`` and scores the current predictions
    through ``evaluate``. ``eval_df`` itself is not modified; callers rebind
    the name to the returned frame.

    Parameters
    ----------
    model_type : label stored in the ``model_type`` column.
    target_var : column that is scored; stored in ``target_var``.

    Returns
    -------
    pandas.DataFrame
        The existing rows followed by the new one, with a fresh 0..n-1 index.
    """
    rmse = evaluate(target_var)
    new_row = pd.DataFrame({'model_type': [model_type],
                            'target_var': [target_var],
                            'rmse': [rmse]})
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([eval_df, new_row], ignore_index = True)


# prediction table

In [None]:
# constant forecast: each scalar is repeated for every validate timestamp
yhat_df = pd.DataFrame(
    {'avg_temp': avg_temp, 'uncertainty': uncertainty},
    index=validate.index,
)

yhat_df.head(2)

In [None]:
 # Plot and score each target on its own. Passing both columns at once gives
 # a single RMSE blended across the two series and a list as the title.
 for col in train.columns:
     plot_and_eval(col)

# eval

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type = 'last_observed_value', 
                             target_var = col)

In [None]:
eval_df

# simple average

In [None]:
# training-set mean of each series, kept to two decimals
avg_temp = train['avg_temp'].mean().round(2)
uncertainty = train['uncertainty'].mean().round(2)

In [None]:
# constant forecast: each scalar is repeated for every validate timestamp
yhat_df = pd.DataFrame(
    {'avg_temp': avg_temp, 'uncertainty': uncertainty},
    index=validate.index,
)

In [None]:
yhat_df.head(1)

# actual vs predict values

In [None]:
for col in train.columns:
    plot_and_eval(col)

# evaluate

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type='simple_average', 
                             target_var = col)

In [None]:
eval_df

# moving average

In [None]:
# compute a rolling average over the last `period` observations and use
# its most recent value to predict forward.
# NOTE(review): rolling(period) counts rows, so this is a "30 day" window
# only if the series has one row per day — TODO confirm the sampling frequency.

period = 30

avg_temp = round(train['avg_temp'].rolling(period).mean().iloc[-1], 2)
uncertainty = round(train['uncertainty'].rolling(period).mean().iloc[-1], 2)

In [None]:
# yhat_df = make_predictions()

# constant forecast: each scalar is repeated for every validate timestamp
yhat_df = pd.DataFrame(
    {'avg_temp': avg_temp, 'uncertainty': uncertainty},
    index=validate.index,
)
yhat_df.head(2)

In [None]:
for col in train.columns:
    plot_and_eval(col)

# evaluate

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type='30d moving average', 
                             target_var = col)
eval_df

In [None]:
train

In [None]:
# score a moving-average forecast for several window lengths
periods = [1, 4, 12, 26, 52, 104]

for p in periods:
    # last value of the p-row rolling mean becomes the constant forecast
    avg_temp = round(train['avg_temp'].rolling(p).mean().iloc[-1], 2)
    uncertainty = round(train['uncertainty'].rolling(p).mean().iloc[-1], 2)

    yhat_df = pd.DataFrame(
        {'avg_temp': avg_temp, 'uncertainty': uncertainty},
        index=validate.index,
    )

    model_type = f'{p}d moving average'
    for target in ('avg_temp', 'uncertainty'):
        eval_df = append_eval_df(model_type=model_type, target_var=target)

In [None]:
eval_df

In [None]:
# get the min rmse for each variable, looked up by label
# (positional [0]/[1] on a label-indexed Series is deprecated)
min_rmse = eval_df.groupby('target_var')['rmse'].min()
min_avg_temp_rmse = min_rmse['avg_temp']
min_uncertainty_rmse = min_rmse['uncertainty']

# Keep, for each target variable, only the rows that reach that variable's
# own minimum. Comparing against either minimum regardless of target could
# also select a row of one variable that merely equals the other's minimum.
eval_df[eval_df['rmse'] == eval_df['target_var'].map(min_rmse)]

In [None]:
import statsmodels.api as sm

# seasonal decomposition of each column's monthly means
for col in train.columns:
    print(col,'\n')
    monthly = train[col].resample('m').mean()
    decomposition = sm.tsa.seasonal_decompose(monthly)
    _ = decomposition.plot()
    plt.show()

# basic holt's linear trend

In [None]:
# Fit Holt's linear trend to each column and forecast over the validate span.
# The forecasts are collected into a fresh frame so this cell does not depend
# on whatever yhat_df an earlier model left behind.
holt_predictions = {}
for col in train.columns:
    model = Holt(train[col], exponential = False)
    # smoothing_trend is the current name of the deprecated smoothing_slope
    model = model.fit(smoothing_level = .1,
                      smoothing_trend = .1,
                      optimized = False)
    forecast = model.predict(start = validate.index[0],
                             end = validate.index[-1])
    holt_predictions[col] = round(forecast, 2)

# aligned on the validate index, as the column assignment was before
yhat_df = pd.DataFrame(holt_predictions, index = validate.index)

In [None]:
for col in train.columns:
    plot_and_eval(target_var = col)

# evaluate

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type = 'Holts', 
                             target_var = col)
eval_df

In [None]:
# f

# resplit data


In [None]:
dfc.index.max()

In [None]:
# errors='ignore' keeps the cell re-runnable once the helper columns are gone
dfc = dfc.drop(columns = ['wok','mon'], errors = 'ignore')

In [None]:
# Year-based split. Rows are selected with .loc because selecting rows with
# df['2011'] (a date string in plain brackets) was removed in pandas 2.0.
train = dfc.loc[:'2010']    # everything up to and including 2010
validate = dfc.loc['2011']  # the following year
test = dfc.loc['2012']      # the year after that

# make predictions

In [None]:
# train ends in 2010, so the "previous year" to inspect is 2010
# (the split contains no 2011 rows)
train.loc['2010']

In [None]:
# Previous-year model: repeat the last training year (2010) and add the mean
# year-over-year change. The lag is the number of observations in that year,
# so it is 365 for daily non-leap-year data and adapts to other frequencies.
last_train_year = train.loc['2010']
yhat_df = last_train_year + train.diff(len(last_train_year)).mean()

In [None]:
pd.concat([yhat_df.head(1), validate.head(1)])

In [None]:
yhat_df.index = validate.index

len(yhat_df)

In [None]:
for col in train.columns:
    plot_and_eval(target_var = col)
    eval_df = append_eval_df(model_type = 'previous year', target_var = col)


In [None]:
eval_df


In [None]:
# get the min rmse for each variable, looked up by label
# (positional [0]/[1] on a label-indexed Series is deprecated)
min_rmse = eval_df.groupby('target_var')['rmse'].min()
min_avg_temp_rmse = min_rmse['avg_temp']
min_uncertainty_rmse = min_rmse['uncertainty']

# Keep, for each target variable, only the rows that reach that variable's
# own minimum. Comparing against either minimum regardless of target could
# also select a row of one variable that merely equals the other's minimum.
eval_df[eval_df['rmse'] == eval_df['target_var'].map(min_rmse)]


In [None]:
# One bar chart of RMSE per model for each target variable.
# The per-target rows get their own name instead of reusing x / y, because
# y already holds the training target series defined earlier in the notebook.
for col in train.columns:
    col_scores = eval_df[eval_df.target_var == col]
    plt.figure(figsize=(12, 6))
    sns.barplot(x=col_scores['model_type'], y=col_scores['rmse'])
    plt.title(col)
    plt.ylabel('RMSE')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
train

In [None]:
# Predict the test year as the validate year plus the mean year-over-year
# change seen in train. The lag is the number of observations in one year
# (365 for daily non-leap-year data) instead of a hard-coded value.
yhat_df = validate + train.diff(len(validate)).mean()
# relabel onto the test timestamps so the two can be compared and plotted
yhat_df.index = test.index


In [None]:
test

In [None]:
# Test-set RMSE per target, kept to two decimals; rounding to whole numbers
# would report any sub-unit error as 0.
avg_temp = round(sqrt(mean_squared_error(test['avg_temp'], yhat_df['avg_temp'])), 2)
uncertainty = round(sqrt(mean_squared_error(test['uncertainty'], yhat_df['uncertainty'])), 2)


In [None]:
# Plot train, validate, and test together with the predictions
def plot_and_eval_test(target_var):
    """Plot the three splits and the current predictions for one target.

    Reads the notebook-level ``train``, ``validate``, ``test`` and ``yhat_df``
    frames. Only draws the figure; it does not compute a score.

    Parameters
    ----------
    target_var : column label present in all four frames.
    """
    plt.figure(figsize = (12,4))
    plt.plot(train[target_var], label = 'Train', linewidth = 1)
    plt.plot(validate[target_var], label = 'Validate', linewidth = 1)
    plt.plot(test[target_var], label = 'Test', linewidth = 1)
    # labelled so the forecast line appears in the legend
    plt.plot(yhat_df[target_var], alpha = .5, color="red", label = 'Prediction')
    plt.title(target_var)
    plt.legend()
    plt.show()

In [None]:
print("rmse - avg_temp: ", avg_temp )
print("rmse - uncertainty: ", uncertainty)

for col in train.columns:
    plot_and_eval_test(col)
