In [665]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import pickle
import os
import glob

In [666]:
# Load the joined hold-time records; the first CSV column becomes the row index
HOLD_TIMES_CSV = 'data/joined_hold_times_data.csv'
data = pd.read_csv(HOLD_TIMES_CSV, index_col=0)

In [667]:
# Separate the target (hold_time) from the features, then split into train/test sets.
# random_state is fixed so the split is reproducible across runs.
features = data.drop(columns='hold_time')
target = data['hold_time']
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(12332, 9) (12332,) (4111, 9) (4111,)


In [668]:
# The explicit difficulty columns are dropped: difficulty should already be encoded by the
# piece counts being split up per difficulty level.
DIFFICULTY_COLS = ['diff_0', 'diff_1']
X_train = X_train.drop(columns=DIFFICULTY_COLS)
X_test = X_test.drop(columns=DIFFICULTY_COLS)

In [669]:
# Per-pack hold_time distribution (count/mean/std/quantiles), computed from training rows only
full_training_set = X_train.assign(hold_time=y_train)
hold_summary_by_pack = full_training_set.groupby(by=['pack_name'])['hold_time'].describe()

# A pack with a single training row has an undefined (NaN) std dev; substitute the mean of the
# per-pack std devs so every pack has a usable value
avg_pack_std = hold_summary_by_pack['std'].mean()
hold_summary_by_pack['std'] = hold_summary_by_pack['std'].fillna(avg_pack_std)

# Packs that show up only in the test set will have no row here; the global averages are used for those for now
# TODO come up with a more sophisticated way to handle packs we don't have data for
hold_summary_by_pack

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
pack_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Anne Belle Thompson The Mikado Anne Belle Thompson Two Kimonos,4.0,10.494849,6.145555,4.800741,6.654830,9.152296,12.992314,18.874063
Anthology Puzzles Alphonse Mucha La Plume Anthology Puzzles Fun at the Fair,1.0,2.925217,16.430787,2.925217,2.925217,2.925217,2.925217,2.925217
Anthology Puzzles Colorful Cat Anthology Puzzles Life On The Reef,2.0,0.768982,0.108309,0.692396,0.730689,0.768982,0.807275,0.845568
Anthology Puzzles Framed American Gothic Anthology Puzzles Haeckel Floral,5.0,4.149160,2.275166,1.747891,1.835024,5.025721,5.217854,6.919311
Anthology Puzzles Over The Moon Anthology Puzzles Happy House Plants,6.0,7.002811,6.154129,1.836479,2.205024,4.390265,12.035139,15.339815
...,...,...,...,...,...,...,...,...
Whimsy Puzzles Frank Champine Magic Forest Whimsy Puzzles Radiant Butterflies,4.0,11.567170,7.141258,6.826494,6.846846,8.754156,13.474480,21.933875
Whimsy Wood Puzzles An Ocean Party,16.0,11.257278,8.549196,0.473342,4.602426,10.286215,15.445802,31.020209
Zen Puzzles Basket Of Love Zen Puzzles Castle Hill,2.0,10.684384,9.789004,3.762513,7.223448,10.684384,14.145319,17.606255
Zen Puzzles Found Love Crow Prints Puzzles Portobello Road,27.0,12.399005,14.811545,0.944715,3.387486,5.124138,14.764398,65.745293


In [670]:
# Attach each row's pack-level hold_time std and mean (only those two statistics for now) as
# the features pack_hold_time_std / pack_hold_time_mean.
# TODO try out other variations on pack hold-time distribution information
# NOTE(review): hold_summary_by_pack is built from these same training rows, so each row's pack
# mean includes its own hold_time (for single-row packs it equals the target) — confirm this
# leakage is acceptable.
X_train = (
    pd.merge(X_train, hold_summary_by_pack[['std', 'mean']], left_on='pack_name', right_index=True, how='left')
    .assign(
        pack_hold_time_std=lambda merged: merged['std'],
        pack_hold_time_mean=lambda merged: merged['mean'],
    )
    .drop(columns=['std', 'mean'])
)



In [671]:
# Attach the training-set pack statistics (hold_time std and mean) to the test rows
X_test = (
    pd.merge(X_test, hold_summary_by_pack[['std', 'mean']], left_on='pack_name', right_index=True, how='left')
    .assign(
        pack_hold_time_std=lambda merged: merged['std'],
        pack_hold_time_mean=lambda merged: merged['mean'],
    )
    .drop(columns=['mean', 'std'])
)

# Test rows whose pack has no entry in the training summary come out of the left join as NaN;
# fill them with the averages of the per-pack statistics (computed from training data only)
global_pack_mean = hold_summary_by_pack['mean'].mean()
global_pack_std = hold_summary_by_pack['std'].mean()
X_test['pack_hold_time_mean'] = X_test['pack_hold_time_mean'].fillna(global_pack_mean)
X_test['pack_hold_time_std'] = X_test['pack_hold_time_std'].fillna(global_pack_std)
X_test

Unnamed: 0,member,pack_name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles,pack_hold_time_std,pack_hold_time_mean
7439,member318,Ecru Puzzles Jethro Buck Forest Ecru Puzzles A...,0.0,603.0,0.0,0.0,2.0,15.879820,12.446043
15750,member641,Artifact Puzzles Seurat Grande Jatte Artifact ...,0.0,0.0,0.0,428.0,1.0,20.472556,23.264691
12206,member509,Artifact Puzzles Angie Rees The Skating Party ...,0.0,985.0,0.0,0.0,2.0,18.892207,17.502347
16011,member655,Artifact Puzzles Angie Rees The Skating Party ...,0.0,985.0,0.0,0.0,2.0,18.892207,17.502347
12971,member537,Artifact Puzzles Erin Hanson Crystal Grove Art...,0.0,0.0,460.0,357.0,2.0,19.866709,19.241807
...,...,...,...,...,...,...,...,...,...
3775,member162,Liberty Puzzles Agra Liberty Puzzles Ditz Patc...,0.0,1170.0,0.0,0.0,2.0,27.185663,27.491171
3126,member137,Nautilus Puzzles Marin Headlands,0.0,475.0,0.0,0.0,1.0,29.289260,22.324845
6287,member275,Wentworth Puzzles Lars Stewart Indian Pillow M...,0.0,1000.0,0.0,0.0,1.0,13.639163,12.108341
12602,member525,Artifact Puzzles Manet Crystal Vase Artifact P...,0.0,331.0,0.0,0.0,2.0,7.882210,8.387194


In [672]:
# Number of training rows per member, most active members first
record_counts = X_train['member'].value_counts()
record_counts

member557    130
member474    113
member40      94
member608     92
member414     85
            ... 
member2        1
member87       1
member419      1
member186      1
member80       1
Name: member, Length: 645, dtype: int64

In [673]:
MIN_HISTORY = 75 # Minimum number of data points for a member to get their own model
# Results noted for different thresholds, written as "<a> / <b> @ <MIN_HISTORY value>".
# NOTE(review): <a> / <b> appear to be the test-set MAE of the per-member models / the MAE of
# the default model at that threshold — confirm against the evaluation cells before relying on them.
# 3.6 / 13.2 @ 100
# 3.5 / 12.9 @ 75
# 5.5 / 14.2 @ 50
# 8.5 / 17.2 @ 25
# 11.6 / 22.3 @ 10

In [674]:
# Fit a StandardScaler on the training features, excluding the member / pack_name identifier columns
# TODO Try out other scaler
# NOTE: train_linear_regression currently has its scaler.transform call commented out, so this
# fitted scaler is passed around but not applied to the data.
train_feature_frame = X_train.drop(columns=['member', 'pack_name'])
scaler = StandardScaler().fit(train_feature_frame)

In [675]:
def train_linear_regression(X, y, scaler):
    '''
    Fits an ordinary least-squares linear regression on X and y.

    args:
        - X - feature DataFrame; converted to a numpy array as-is before fitting
        - y - target values aligned with the rows of X
        - scaler - accepted for interface compatibility but currently NOT applied
                   (scaling is disabled, so the model is fit on the raw feature values)

    Returns the fitted LinearRegression model
    '''
    # Scaling is intentionally disabled for now; use the raw feature values
    features = X.to_numpy()

    model = linear_model.LinearRegression()
    model.fit(features, y)
    return model

In [676]:
# For each member with at least MIN_HISTORY training data points, train a dedicated linear
# regression model and save it to user_models/<member>.pkl
members = record_counts[record_counts >= MIN_HISTORY]

# Make sure the output directory exists so a fresh checkout / fresh kernel run does not fail in open()
os.makedirs('user_models', exist_ok=True)

# NOTE: model files written by an earlier run (e.g. with a lower MIN_HISTORY) are not removed here
# and will still be picked up by make_pred, which only checks whether the file exists. Clear
# user_models/ before re-running with a different threshold.
for member in members.index:
    # Build the row mask once and reuse it for both the features and the target
    is_member = X_train['member'] == member
    X_train_m = X_train[is_member]
    y_train_m = y_train[is_member]

    m = train_linear_regression(X_train_m.drop(['member', 'pack_name'], axis=1), y_train_m, scaler)

    # Save model for later use
    with open(f'user_models/{member}.pkl', 'wb') as f:
        pickle.dump(m, f)

In [677]:
# Fit one overall model on every training row; it is the fallback for users without MIN_HISTORY data points
all_member_features = X_train.drop(columns=['member', 'pack_name'])
overall_m = train_linear_regression(all_member_features, y_train, scaler)
with open('user_models/universal.pkl', 'wb') as f:
    pickle.dump(overall_m, f)

In [678]:
# "New puzzler" model: fit only on rows from members with fewer than MIN_HISTORY data points
newps = record_counts[record_counts < MIN_HISTORY].index
is_new_puzzler = X_train['member'].isin(newps)
X_train_newps = X_train[is_new_puzzler]
y_train_newps = y_train[is_new_puzzler]
newps_m = train_linear_regression(X_train_newps.drop(columns=['member', 'pack_name']), y_train_newps, scaler)
with open('user_models/newps.pkl', 'wb') as f:
    pickle.dump(newps_m, f)

In [679]:
def make_pred(X, member, default):
    '''
    Predict with the member's own saved model when one exists, otherwise with the default model.

    args:
        - X - input features in the shape the model's predict() expects
              (the test loop in this notebook passes one row reshaped to (1, n_features))
        - member - member string; the model is looked up at user_models/{member}.pkl
        - default - default model to use when no saved model exists for the member

    Returns the tuple (predicted hold time, label of the model used), where the label is the
    member string when the member's own model was used and "default" otherwise
    '''

    # Check if trained model for member exists
    path = f'user_models/{member}.pkl'

    if not os.path.exists(path):
        # No per-member model on disk: use default model
        return (default.predict(X), "default")

    # SECURITY: pickle.load can execute arbitrary code; only load model files written by this notebook.
    # The context manager closes the file handle deterministically (this function is called once
    # per test row, so a leaked handle per call adds up quickly).
    with open(path, 'rb') as model_file:
        m = pickle.load(model_file)

    return (m.predict(X), member)

In [680]:
# Load the two candidate fallback models (universal / new-puzzler) used for members without a per-user model.
# SECURITY: pickle.load can execute arbitrary code; these files must be the ones written by this notebook.
# Context managers close the file handles instead of leaving them open.
with open('user_models/universal.pkl', 'rb') as f:
    univ_model = pickle.load(f)
with open('user_models/newps.pkl', 'rb') as f:
    newps_model = pickle.load(f)

X_test_members = X_test['member']
X_test_packs = X_test['pack_name']
# NOTE: despite its name, X_test_scaled holds the raw (unscaled) feature values; applying
# scaler.transform here is currently disabled, which matches how the models are trained
# (train_linear_regression also leaves its scaling step commented out).
X_test_scaled = X_test.drop(['member', 'pack_name'], axis=1).to_numpy()

In [681]:
# Predict every test row: a member's own saved model is used when it exists, otherwise the universal model
univ_results = [
    make_pred(row.reshape(1, -1), member, univ_model)
    for row, member in zip(X_test_scaled, X_test_members)
]
y_pred_universal, y_pred_universal_models = zip(*univ_results)
y_pred_universal = np.array(y_pred_universal)
y_pred_universal_models = np.array(y_pred_universal_models)

# Errors over the whole test set
mse_univ = mean_squared_error(y_test, y_pred_universal)
mae_univ = mean_absolute_error(y_test, y_pred_universal)
print(f'Using universal default combined mse: {mse_univ}, mae: {mae_univ}')

# Errors restricted to rows served by a per-member model
univ_user_rows = y_pred_universal_models != "default"
mse_univ_usermodel = mean_squared_error(y_test[univ_user_rows], y_pred_universal[univ_user_rows])
mae_univ_usermodel = mean_absolute_error(y_test[univ_user_rows], y_pred_universal[univ_user_rows])
print(f'Using universal, user mse: {mse_univ_usermodel}, mae: {mae_univ_usermodel}')

# Errors restricted to rows that fell back to the default (universal) model
univ_default_rows = y_pred_universal_models == "default"
mse_univ_default = mean_squared_error(y_test[univ_default_rows], y_pred_universal[univ_default_rows])
mae_univ_default = mean_absolute_error(y_test[univ_default_rows], y_pred_universal[univ_default_rows])
print(f'Using universal, default mse: {mse_univ_default}, mae: {mae_univ_default}')

Using universal default combined mse: 377.49207713610724, mae: 12.178648599569012
Using universal, user mse: 27.5143300799169, mae: 3.6362056057127092
Using universal, default mse: 407.3344246279629, mae: 12.907056489488646


In [682]:
# Same evaluation as above, but rows without a per-member model fall back to the "new puzzler" model
newps_results = [
    make_pred(row.reshape(1, -1), member, newps_model)
    for row, member in zip(X_test_scaled, X_test_members)
]
y_pred_newps, y_pred_newps_models = zip(*newps_results)
y_pred_newps = np.array(y_pred_newps)
y_pred_newps_models = np.array(y_pred_newps_models)

# Errors over the whole test set
mse_newps = mean_squared_error(y_test, y_pred_newps)
mae_newps = mean_absolute_error(y_test, y_pred_newps)
print(f'Using newps default combined mse: {mse_newps}, mae: {mae_newps}')

# Errors restricted to rows served by a per-member model
newps_user_rows = y_pred_newps_models != "default"
mse_newps_usermodel = mean_squared_error(y_test[newps_user_rows], y_pred_newps[newps_user_rows])
mae_newps_usermodel = mean_absolute_error(y_test[newps_user_rows], y_pred_newps[newps_user_rows])
print(f'Using newps, user mse: {mse_newps_usermodel}, mae: {mae_newps_usermodel}')

# Errors restricted to rows that fell back to the default (new puzzler) model
newps_default_rows = y_pred_newps_models == "default"
mse_newps_default = mean_squared_error(y_test[newps_default_rows], y_pred_newps[newps_default_rows])
mae_newps_default = mean_absolute_error(y_test[newps_default_rows], y_pred_newps[newps_default_rows])
print(f'Using newps, default mse: {mse_newps_default}, mae: {mae_newps_default}')


Using newps default combined mse: 378.41530167489776, mae: 12.436127643090078
Using newps, user mse: 27.5143300799169, mae: 3.6362056057127092
Using newps, default mse: 408.3363718504993, mae: 13.186490583447227


In [683]:
# Optional cleanup of everything under user_models/. Left off by default so that running the
# notebook top to bottom never deletes the saved models; set the flag to True (instead of
# editing the loop) when the models should be cleared, e.g. before changing MIN_HISTORY.
CLEAR_SAVED_MODELS = False

file = glob.glob('user_models/*')
if CLEAR_SAVED_MODELS:
    for f in file:
        os.remove(f)

In [684]:
# Show the model label for each test row that was served by a per-member model
# (rows labelled "default" are filtered out); the labels are the member strings returned by make_pred
y_pred_universal_models[y_pred_universal_models != "default"]

array(['member292', 'member40', 'member557', 'member402', 'member608',
       'member117', 'member40', 'member402', 'member363', 'member414',
       'member292', 'member279', 'member474', 'member557', 'member292',
       'member557', 'member324', 'member414', 'member363', 'member324',
       'member324', 'member40', 'member324', 'member474', 'member557',
       'member414', 'member474', 'member608', 'member292', 'member279',
       'member363', 'member279', 'member117', 'member557', 'member414',
       'member279', 'member363', 'member608', 'member324', 'member363',
       'member363', 'member474', 'member474', 'member292', 'member292',
       'member474', 'member292', 'member324', 'member608', 'member474',
       'member414', 'member279', 'member324', 'member608', 'member363',
       'member557', 'member363', 'member40', 'member414', 'member414',
       'member363', 'member557', 'member557', 'member279', 'member292',
       'member414', 'member40', 'member608', 'member363', 'member474