In [1]:
!which python

/Users/ahmadabdullahtariq/Documents/Projects/Thesis/.venv/bin/python


In [1]:
import pandas as pd
import numpy as np 
import plotly.graph_objects as go

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Linear Regression
- Load Data
- Data Insites
- Visualize Data 
- Linear Regression without normalization 
- Linear Regression with normalization

In [2]:
df = pd.read_pickle('../dataset/preprocessed/284_imputed.pkl')

In [3]:
df.head()

Unnamed: 0,DOA,LeadTime,DOW,Price,TotalCapacity,Reservations
0,2018-01-01,88,0,76.05,290.0,58.0
1,2018-01-01,87,0,76.05,290.0,58.0
2,2018-01-01,85,0,76.05,290.0,58.0
3,2018-01-01,81,0,76.05,290.0,63.0
4,2018-01-01,80,0,76.05,290.0,62.0


In [8]:
df.describe(include='all')

Unnamed: 0,DOA,LeadTime,DOW,Price,TotalCapacity,Reservations,Occ
count,60387,60387.0,60387.0,60387.0,60387.0,60387.0,60387.0
unique,719,,,,,,
top,2019-02-04 00:00:00,,,,,,
freq,90,,,,,,
first,2018-01-01 00:00:00,,,,,,
last,2019-12-31 00:00:00,,,,,,
mean,,43.56802,2.999851,77.71663,290.0,129.44647,0.446367
std,,26.012538,2.003074,39.163356,0.0,71.517101,0.246611
min,,0.0,0.0,30.24,290.0,0.0,0.0
25%,,21.0,1.0,53.82,290.0,72.0,0.248276


In [6]:
df['Occ'] = df['Reservations']/df['TotalCapacity']

In [11]:
df['shift'] = df['Occ'].shift(periods=1)

In [12]:
df.corr()

Unnamed: 0,LeadTime,DOW,Price,TotalCapacity,Reservations,Occ,shift
LeadTime,1.0,0.000286,0.013981,,-0.818509,-0.818509,-0.739301
DOW,0.000286,1.0,-0.030063,,0.070858,0.070858,0.073138
Price,0.013981,-0.030063,1.0,,0.142387,0.142387,0.146199
TotalCapacity,,,,,,,
Reservations,-0.818509,0.070858,0.142387,,1.0,1.0,0.943522
Occ,-0.818509,0.070858,0.142387,,1.0,1.0,0.943522
shift,-0.739301,0.073138,0.146199,,0.943522,0.943522,1.0


In [13]:
def curve(df):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df.LeadTime, y=df.Reservations,mode='markers',name='Price'))
    fig.show()   

def days_of_week(df):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df.DOW, y=df.Reservations,mode='markers',name='Price'))
    fig.show()   

In [14]:
dx = df[df['LeadTime']==89]

fig = go.Figure()
fig.add_trace(go.Scatter(x=dx.DOA, y=dx.Reservations,mode='markers+lines',name='Reservations'))
fig.update_layout(autosize=False,width=1500,height=400,xaxis_title="Reservations on Hand (ROH)",yaxis_title="Day of Arrival (DOA)")
fig.show()   

In [7]:
# curve(df[(df['DOW']==2)&(df['DOA']>='2019-11-01')])
# days_of_week(df[(df['DOA']>='2019-09-01')])

## Linear Regression without normalizaition

In [15]:
df = pd.read_pickle('../dataset/preprocessed/284_imputed.pkl')
df = df.drop(columns=['TotalCapacity'])
df = df.set_index('DOA')
df.head()

Unnamed: 0_level_0,LeadTime,DOW,Price,Reservations
DOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,88,0,76.05,58.0
2018-01-01,87,0,76.05,58.0
2018-01-01,85,0,76.05,58.0
2018-01-01,81,0,76.05,63.0
2018-01-01,80,0,76.05,62.0


In [16]:
def polynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),LinearRegression(**kwargs))

In [17]:
train  = df.loc['2019-05':'2019-09']
test  = df.loc['2019-10':]

y = train.pop('Reservations')
X = train

In [18]:
from sklearn.model_selection import cross_validate

scoring = {'r2':'r2', 'MSE':'neg_mean_squared_error'}
for degree in range(2,7):
    res = cross_validate(polynomialRegression(degree), X, y, cv=270,scoring=scoring, return_train_score=True)
    
    print("Poly degree:",degree)
    print("R2  Test Score: %0.2f (+/- %0.2f)) | Train Score: %0.2f (+/- %0.2f))"%(res['test_r2'].mean(),res['test_r2'].std()* 2,res['train_r2'].mean(),res['train_r2'].std()* 2))
    print("MSE Test Score: %0.2f (+/- %0.2f)) | Train Score: %0.2f (+/- %0.2f))"%(res['test_MSE'].mean(),res['test_MSE'].std()* 2,res['train_MSE'].mean(),res['train_MSE'].std()* 2))
    print("RMSE Test Score: %0.2f (+/- %0.2f)) | Train Score: %0.2f (+/- %0.2f))"%(np.sqrt(-res['test_MSE']).mean(),np.sqrt(-res['test_MSE']).std()* 2,np.sqrt(-res['train_MSE']).mean(),np.sqrt(-res['test_MSE']).std()* 2))

Poly degree: 2
R2  Test Score: -0.87 (+/- 16.11)) | Train Score: 0.73 (+/- 0.00))
MSE Test Score: -1464.63 (+/- 2789.63)) | Train Score: -1411.86 (+/- 10.15))
RMSE Test Score: 34.29 (+/- 34.00)) | Train Score: 37.57 (+/- 34.00))
Poly degree: 3
R2  Test Score: -0.82 (+/- 15.40)) | Train Score: 0.75 (+/- 0.00))
MSE Test Score: -1429.81 (+/- 2884.91)) | Train Score: -1310.90 (+/- 9.76))
RMSE Test Score: 33.58 (+/- 34.77)) | Train Score: 36.21 (+/- 34.77))
Poly degree: 4
R2  Test Score: -0.79 (+/- 14.40)) | Train Score: 0.77 (+/- 0.00))
MSE Test Score: -1453.68 (+/- 3966.42)) | Train Score: -1224.33 (+/- 9.61))
RMSE Test Score: 33.17 (+/- 37.61)) | Train Score: 34.99 (+/- 37.61))
Poly degree: 5
R2  Test Score: -0.82 (+/- 13.85)) | Train Score: 0.78 (+/- 0.00))
MSE Test Score: -1463.85 (+/- 4616.69)) | Train Score: -1173.84 (+/- 9.60))
RMSE Test Score: 33.06 (+/- 38.53)) | Train Score: 34.26 (+/- 38.53))
Poly degree: 6
R2  Test Score: -1.10 (+/- 15.99)) | Train Score: 0.78 (+/- 0.00))
MSE T

In [25]:
model = polynomialRegression(5).fit(X, y)

In [26]:
X_test = test.loc[:, test.columns != 'Reservations']
y_test = test['Reservations']

In [27]:
y_pred = model.predict(X_test)

In [28]:
mean_squared_error(y_test,y_pred)

3537.0014125867233

In [29]:
X_test['True'] = y_test
X_test['Pred'] = y_pred

In [30]:
dx = X_test[X_test['LeadTime']==89]

fig = go.Figure()
fig.add_trace(go.Scatter(x=dx.index, y=dx['True'],mode='markers+lines',name='True'))
fig.add_trace(go.Scatter(x=dx.index, y=dx['Pred'],mode='markers+lines',name='Pred'))

fig.update_layout(autosize=False,width=1500,height=400,xaxis_title="Reservations on Hand (ROH)",yaxis_title="Day of Arrival (DOA)")
fig.show()   

In [31]:
temp = X_test.loc['2019-11-27']
fig = go.Figure()
fig.add_trace(go.Scatter(x=temp['LeadTime'], y=temp['True'],mode='markers',name='True'))
fig.add_trace(go.Scatter(x=temp['LeadTime'], y=temp['Pred'],mode='markers',name='Pred'))
fig.show()   

## With Occupancy

In [32]:
df = pd.read_pickle('../dataset/preprocessed/284_imputed.pkl')
df['Occ'] = df['Reservations']/df['TotalCapacity']
df = df.drop(columns=['TotalCapacity','Reservations'])
df = df.set_index('DOA')

In [33]:
df.head()

Unnamed: 0_level_0,LeadTime,DOW,Price,Occ
DOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,88,0,76.05,0.2
2018-01-01,87,0,76.05,0.2
2018-01-01,85,0,76.05,0.2
2018-01-01,81,0,76.05,0.217241
2018-01-01,80,0,76.05,0.213793


In [34]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# # df['DOW'] = scaler.fit_transform(np.array(df['DOW']).reshape(-1, 1))
# scaled_data = scaler.fit_transform(df[['DOW','LeadTime']])
# df[['DOW','LeadTime']] = scaled_data

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['Price','LeadTime']])
df[['Price','LeadTime']] = scaled_data
# df['Price'] = scaler.fit_transform(np.array(df['Price']).reshape(-1, 1))

df.head()

Unnamed: 0_level_0,LeadTime,DOW,Price,Occ
DOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,1.708113,0,-0.042556,0.2
2018-01-01,1.669669,0,-0.042556,0.2
2018-01-01,1.592783,0,-0.042556,0.2
2018-01-01,1.43901,0,-0.042556,0.217241
2018-01-01,1.400566,0,-0.042556,0.213793


In [36]:
def polynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),LinearRegression(**kwargs))

In [37]:
train  = df.loc['2019-05':'2019-09']
test  = df.loc['2019-10':]

y = train.pop('Occ')
X = train

In [38]:
from sklearn.model_selection import cross_validate

scoring = {'r2':'r2', 'MSE':'neg_mean_squared_error'}
for degree in range(2,5):
    res = cross_validate(polynomialRegression(degree), X, y, cv=270,scoring=scoring, return_train_score=True)

    print("Poly degree:",degree)
    print("R2  Test Score: %0.4f (+/- %0.4f)) | Train Score: %0.4f (+/- %0.4f))"%(res['test_r2'].mean(),res['test_r2'].std()* 2,res['train_r2'].mean(),res['train_r2'].std()* 2))
    print("MSE Test Score: %0.4f (+/- %0.4f)) | Train Score: %0.4f (+/- %0.4f))"%(res['test_MSE'].mean(),res['test_MSE'].std()* 2,res['train_MSE'].mean(),res['train_MSE'].std()* 2))


Poly degree: 2
R2  Test Score: -0.8660 (+/- 16.1052)) | Train Score: 0.7327 (+/- 0.0019))
MSE Test Score: -0.0174 (+/- 0.0332)) | Train Score: -0.0168 (+/- 0.0001))
Poly degree: 3
R2  Test Score: -0.8166 (+/- 15.4042)) | Train Score: 0.7518 (+/- 0.0019))
MSE Test Score: -0.0170 (+/- 0.0343)) | Train Score: -0.0156 (+/- 0.0001))
Poly degree: 4
R2  Test Score: -0.7925 (+/- 14.4039)) | Train Score: 0.7682 (+/- 0.0019))
MSE Test Score: -0.0173 (+/- 0.0472)) | Train Score: -0.0146 (+/- 0.0001))


In [39]:
X_test = test.loc[:, test.columns != 'Occ']
y_test = test['Occ']

In [40]:
model = polynomialRegression(4).fit(X, y)

In [41]:
y_pred = model.predict(X_test)

In [42]:
mean_squared_error(y_test,y_pred)

0.028651371225399475

In [43]:
X_test['True'] = y_test
X_test['Pred'] = y_pred

In [44]:
X_test

Unnamed: 0_level_0,LeadTime,DOW,Price,True,Pred
DOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-10-01,1.746556,1,-0.383184,0.255172,0.317066
2019-10-01,1.708113,1,-0.383184,0.255172,0.323969
2019-10-01,1.669669,1,-0.383184,0.255172,0.330336
2019-10-01,1.631226,1,-0.383184,0.255172,0.336191
2019-10-01,1.592783,1,-0.383184,0.255172,0.341561
...,...,...,...,...,...
2019-12-31,-1.521126,1,3.003173,0.934483,0.954127
2019-12-31,-1.559569,1,3.003173,0.972414,0.964995
2019-12-31,-1.598013,1,3.003173,0.962069,0.975417
2019-12-31,-1.636456,1,3.003173,0.986207,0.985368


In [45]:
dx = X_test[X_test['LeadTime']==X_test['LeadTime'].max()]

fig = go.Figure()
fig.add_trace(go.Scatter(x=dx.index, y=dx['True'],mode='markers+lines',name='True'))
fig.add_trace(go.Scatter(x=dx.index, y=dx['Pred'],mode='markers+lines',name='Pred'))

fig.update_layout(autosize=False,width=1500,height=400,xaxis_title="Reservations on Hand (ROH)",yaxis_title="Day of Arrival (DOA)")
fig.show()   

In [46]:
temp = X_test.loc['2019-12-16']
fig = go.Figure()
fig.add_trace(go.Scatter(x=temp['LeadTime'], y=temp['True'],mode='markers',name='True'))
fig.add_trace(go.Scatter(x=temp['LeadTime'], y=temp['Pred'],mode='markers',name='Pred'))
fig.show()   