In [117]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [118]:
data = pd.read_csv(r'data/joined_hold_times_data.csv', index_col=0)
data

Unnamed: 0,member,hold_time,pack_name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,diff_0,diff_1
0,member1,2.939411,Artifact Puzzles Justin Hillgrove Word Travels...,456.0,548.0,0.0,0.0,2.0,A-Easy,Average
1,member1,0.998885,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,0.0,0.0,415.0,0.0,2.0,Hard,Hard
2,member1,10.865032,DaVici Puzzles Flying Frigate DaVici Puzzles H...,295.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy
3,member1,22.083971,Liberty Puzzles Haeckel Hummingbirds Nautilus ...,0.0,707.0,0.0,0.0,2.0,Average,Average
4,member1,5.077603,DaVici Puzzles Diana Zimens City Of Cats,0.0,700.0,0.0,0.0,1.0,Average,Average
...,...,...,...,...,...,...,...,...,...,...
16438,member675,9.644028,Artifact Puzzles Erin Hanson Dawning Saguaro A...,0.0,831.0,0.0,0.0,2.0,Average,Average
16439,member675,51.663386,Liberty Puzzle Dr. Seuss Plethora of Cats Libe...,616.0,0.0,545.0,0.0,2.0,Hard,A-Easy
16440,member675,16.660733,Liberty Puzzles William Groppers American Folk...,0.0,692.0,0.0,0.0,1.0,Average,Average
16441,member675,26.596725,Liberty Puzzles Vincent Van Gogh Flowering Gar...,355.0,386.0,0.0,0.0,2.0,Average,A-Easy


In [119]:
# Split train/test

X_train, X_test, y_train, y_test = train_test_split(data.drop('hold_time', axis=1), data['hold_time'])

X_train

Unnamed: 0,member,pack_name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,diff_0,diff_1
12431,member515,Dowdle Puzzles New Orleans Dowdle Puzzles Nati...,280.0,0.0,280.0,0.0,2.0,Hard,A-Easy
11394,member474,Anthology Puzzles Framed American Gothic Antho...,0.0,620.0,0.0,0.0,2.0,Average,Average
8235,member343,Artifact Puzzles Joe Vaux Garden Of Earthly De...,0.0,619.0,0.0,0.0,1.0,Average,Average
16429,member675,Snowflake Puzzles Hiroshige Ando Minowa Kanasu...,1000.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy
9096,member378,Nautilus Puzzles Stan The Man,250.0,0.0,0.0,0.0,1.0,A-Easy,Average
...,...,...,...,...,...,...,...,...,...
4665,member211,DaVici Good Morning DaVici The Life of Birds,315.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy
1264,member63,Artifact Puzzles Aaron Wolf Cathedral Of The C...,0.0,0.0,679.0,0.0,2.0,Hard,Hard
1132,member57,Fools Gold Baffling Butterflies Joyful Nook Ga...,0.0,253.0,0.0,0.0,2.0,Average,Average
10868,member448,Liberty Puzzles United Nations Tour of Jordan,439.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy


In [120]:
X_train['hold_time'] = y_train

X_train

Unnamed: 0,member,pack_name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,diff_0,diff_1,hold_time
12431,member515,Dowdle Puzzles New Orleans Dowdle Puzzles Nati...,280.0,0.0,280.0,0.0,2.0,Hard,A-Easy,16.972554
11394,member474,Anthology Puzzles Framed American Gothic Antho...,0.0,620.0,0.0,0.0,2.0,Average,Average,5.217854
8235,member343,Artifact Puzzles Joe Vaux Garden Of Earthly De...,0.0,619.0,0.0,0.0,1.0,Average,Average,65.764076
16429,member675,Snowflake Puzzles Hiroshige Ando Minowa Kanasu...,1000.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy,16.692262
9096,member378,Nautilus Puzzles Stan The Man,250.0,0.0,0.0,0.0,1.0,A-Easy,Average,18.761664
...,...,...,...,...,...,...,...,...,...,...
4665,member211,DaVici Good Morning DaVici The Life of Birds,315.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy,4.315103
1264,member63,Artifact Puzzles Aaron Wolf Cathedral Of The C...,0.0,0.0,679.0,0.0,2.0,Hard,Hard,13.657983
1132,member57,Fools Gold Baffling Butterflies Joyful Nook Ga...,0.0,253.0,0.0,0.0,2.0,Average,Average,3.797984
10868,member448,Liberty Puzzles United Nations Tour of Jordan,439.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy,4.118566


In [121]:
# First just try average hold time by member
# Shift the hold times forward so the average isn't containing the hold_time, lets just assume default 1 week hold
X_train['last_hold_time'] = X_train.groupby('member')['hold_time'].shift(1, fill_value = 7)
X_train['avg_hold_time'] = X_train.groupby('member')['last_hold_time'].rolling(5, 1).mean().reset_index(0,drop=True) # Try summary instead of rolling, also def are introducing some temporal ordering here which probs doesn't exist, if only we had dates
X_train['avg_hold_time_for_puzzle'] = X_train.groupby('pack_name')['hold_time'].rolling(5, 1).mean().reset_index(0, drop=True) 
X_train = X_train.drop('hold_time', axis=1)
X_train

Unnamed: 0,member,pack_name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,diff_0,diff_1,last_hold_time,avg_hold_time,avg_hold_time_for_puzzle
12431,member515,Dowdle Puzzles New Orleans Dowdle Puzzles Nati...,280.0,0.0,280.0,0.0,2.0,Hard,A-Easy,7.000000,7.000000,16.972554
11394,member474,Anthology Puzzles Framed American Gothic Antho...,0.0,620.0,0.0,0.0,2.0,Average,Average,7.000000,7.000000,5.217854
8235,member343,Artifact Puzzles Joe Vaux Garden Of Earthly De...,0.0,619.0,0.0,0.0,1.0,Average,Average,7.000000,7.000000,65.764076
16429,member675,Snowflake Puzzles Hiroshige Ando Minowa Kanasu...,1000.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy,7.000000,7.000000,16.692262
9096,member378,Nautilus Puzzles Stan The Man,250.0,0.0,0.0,0.0,1.0,A-Easy,Average,7.000000,7.000000,18.761664
...,...,...,...,...,...,...,...,...,...,...,...,...
4665,member211,DaVici Good Morning DaVici The Life of Birds,315.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy,36.075313,30.712858,15.064014
1264,member63,Artifact Puzzles Aaron Wolf Cathedral Of The C...,0.0,0.0,679.0,0.0,2.0,Hard,Hard,31.043580,50.944526,9.822587
1132,member57,Fools Gold Baffling Butterflies Joyful Nook Ga...,0.0,253.0,0.0,0.0,2.0,Average,Average,10.015381,12.024787,2.330320
10868,member448,Liberty Puzzles United Nations Tour of Jordan,439.0,0.0,0.0,0.0,2.0,A-Easy,A-Easy,15.683871,23.453925,2.549276


In [122]:
# TODO Normalize the data

In [123]:
# TODO maybe keep the difficulties one hot encoded, try with dropping them
X_train_drop = X_train.drop(['diff_0', 'diff_1', 'member', 'pack_name', 'last_hold_time'], axis=1)
X_train_drop

Unnamed: 0,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,avg_hold_time,avg_hold_time_for_puzzle
12431,280.0,0.0,280.0,0.0,2.0,7.000000,16.972554
11394,0.0,620.0,0.0,0.0,2.0,7.000000,5.217854
8235,0.0,619.0,0.0,0.0,1.0,7.000000,65.764076
16429,1000.0,0.0,0.0,0.0,2.0,7.000000,16.692262
9096,250.0,0.0,0.0,0.0,1.0,7.000000,18.761664
...,...,...,...,...,...,...,...
4665,315.0,0.0,0.0,0.0,2.0,30.712858,15.064014
1264,0.0,0.0,679.0,0.0,2.0,50.944526,9.822587
1132,0.0,253.0,0.0,0.0,2.0,12.024787,2.330320
10868,439.0,0.0,0.0,0.0,2.0,23.453925,2.549276


In [124]:
X_train_drop[X_train_drop.isna().any(axis=1)]

Unnamed: 0,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,avg_hold_time,avg_hold_time_for_puzzle


In [125]:
lasso = linear_model.LinearRegression()
lasso.fit(X_train_drop, y_train)

In [126]:
print(lasso.coef_)
print(lasso.intercept_)

[ 7.34022108e-04  1.52228220e-03  1.13033735e-03  1.96729118e-03
 -7.67002147e-02  4.88635250e-01  8.91735905e-01]
-5.994657841715533


In [127]:
# Do the same avg stuff for test set
X_test['hold_time'] = y_test
X_test['last_hold_time'] = X_test.groupby('member')['hold_time'].shift(1, fill_value = 7)
X_test['avg_hold_time'] = X_test.groupby('member')['last_hold_time'].rolling(5, 1).mean().reset_index(0,drop=True)
X_test['avg_hold_time_for_puzzle'] = X_test.groupby('pack_name')['hold_time'].rolling(5, 1).mean().reset_index(0, drop=True) # In reality we should take this summary from the training set unless we're planning on updating the features frequently in real life
X_test = X_test.drop('hold_time', axis=1)
X_test = X_test.drop(['diff_0', 'diff_1', 'member', 'pack_name', 'last_hold_time'], axis=1) # Dropping last hold time since it seemed like too much ordering going on, I mean even with the avg we're assuming what came first but having a single point as last hold time seemed too much
X_test

Unnamed: 0,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,avg_hold_time,avg_hold_time_for_puzzle
3057,300.0,0.0,447.0,0.0,2.0,7.000000,46.007054
12225,250.0,185.0,0.0,0.0,2.0,7.000000,6.114592
13667,0.0,0.0,460.0,357.0,2.0,7.000000,7.681730
6763,0.0,0.0,1098.0,0.0,1.0,7.000000,7.989555
15058,0.0,85.0,91.0,0.0,2.0,7.000000,40.453769
...,...,...,...,...,...,...,...
14721,0.0,755.0,0.0,0.0,2.0,5.744571,5.825542
13007,947.0,0.0,0.0,0.0,2.0,15.752549,11.584237
13403,250.0,500.0,0.0,0.0,2.0,8.558721,9.398510
5799,0.0,403.0,262.0,0.0,2.0,12.012858,8.090728


In [128]:
lasso.score(X_test, y_test)

0.46929114187328247

In [129]:
y_pred = lasso.predict(X_test)
mean_squared_error(y_test, y_pred)

178.02475410006625

In [130]:
mean_absolute_error(y_test, y_pred)

8.498419676056459

In [131]:
y_train_avg = y_train.mean()
y_train_avg

16.183905241623517

In [132]:
y_avg_pred = [y_train_avg] * len(y_test)
mean_absolute_error(y_test, y_avg_pred)

12.381576591886168