# Notebook Structure

1. Data Cleaning
        1.1 Members data
        1.2 Packs data
        1.3 Merged data 
        1.4 *Dealing with nulls!!!
        
2. Train-Test-Validation sets

3. Feature Engineering

4. Feature Importance
        4.1 LR Model on complete data
        4.2 DT Model on complete data
        4.3 Table representing MAE
        4.4 Table representing feature importance

5. Final Models per person
        5.1 LR model per person
        5.2 DT model per person
        5.3 RF model per person

# 1. Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
path = '/Users/aishwaryasingh/PuzzleSchedulingProject/puzzle-scheduling/data/'

### 1.1 Members Data

In [3]:
member_holdtime_df = pd.read_csv(path+'member_hold_times_and_packs_Feb16_2023.tsv', 
                                 sep='\t', header=None)
member_holdtime_df.columns = ['memberID', 'holdtime', 'puzzlepack']
member_holdtime_df.shape

(19733, 3)

In [4]:
member_holdtime_df.isnull().sum()

memberID      0
holdtime      0
puzzlepack    0
dtype: int64

In [5]:
member_holdtime_df = member_holdtime_df[(member_holdtime_df['holdtime'] >= 0.1) & 
                                        (member_holdtime_df['holdtime'] <= 150)]

### 1.2 Packs Data

In [6]:
packs_df = pd.read_csv(path + 'packs_Feb16_2023.tsv', sep='\t', header=None)
packs_df.columns = ['pack_name', 'brand', 'piece_count', 'difficulty_rating']
packs_df.shape

(920, 4)

In [7]:
# making two coulmns for piece count
packs_df['piece_count_1'] = packs_df['piece_count'].str.split(',', expand=True)[0]
packs_df['piece_count_2'] = packs_df['piece_count'].str.split(',', expand=True)[1]
# making two columns for difficulty
packs_df['difficulty_rating_1'] = packs_df['difficulty_rating'].str.split(',', expand=True)[0]
packs_df['difficulty_rating_2'] = packs_df['difficulty_rating'].str.split(',', expand=True)[1]

In [8]:
# splitting brand name
packs_df['brand_2'] = packs_df['brand'].str.split(',', expand=True)[1]
packs_df['brand_1'] = packs_df['brand'].str.split(',', expand=True)[0]

In [9]:
# adding number of puzzles feature
packs_df['num_puzzles'] = packs_df['pack_name'].map(lambda n: 1 if (n[-1] == ' ') else 2, na_action='ignore')

In [10]:
# fixing datatype
packs_df = packs_df.astype({'piece_count_1': 'int64', 'piece_count_2': 'int64'}, errors='ignore')

In [11]:
# dropping the initial variables
packs_df.drop(['brand', 'piece_count', 'difficulty_rating'], axis=1, inplace=True)

In [12]:
packs_df.isnull().sum()

pack_name                0
piece_count_1            8
piece_count_2          181
difficulty_rating_1      4
difficulty_rating_2      4
brand_2                775
brand_1                 25
num_puzzles              0
dtype: int64

In [13]:
# Some rows straight up have nothing, drop these as there is nothing to be done
packs_df.dropna(subset = ['difficulty_rating_1', 'difficulty_rating_2', 'piece_count_1', 'piece_count_2', 
                          'brand_1', 'brand_2'], how='all', inplace = True)
packs_df.shape

(916, 8)

In [14]:
# making the brand_2 same as brand_1
# ------ must be done only for puzzle = 2, right?
packs_df.loc[(packs_df['brand_2'].isna()), 'brand_2'] = packs_df['brand_1']

In [15]:
packs_df[['brand_1', 'brand_2']] = packs_df[['brand_1', 'brand_2']].fillna('unknown')

In [16]:
# Some 1 puzzle packs have a second number for pieces, this seems to be legit, updating to be 2 puzzle packs
packs_df.loc[((packs_df.num_puzzles == 1) & (packs_df.piece_count_2.notna())), 'num_puzzles'] = 2

In [17]:
packs_df.piece_count_1 = packs_df.piece_count_1.astype('float').astype('Int64')
packs_df.piece_count_2 = packs_df.piece_count_2.astype('float').astype('Int64')

avg_pc1 = packs_df['piece_count_1'].median()
avg_pc2 = packs_df['piece_count_2'].median()

In [18]:
# puzzles which have no value for piece_1 is replaced with avg values
packs_df['piece_count_1'].fillna(avg_pc1, inplace=True)

In [19]:
# if 2 puzzles and second piececount is not available, replace with average
packs_df.loc[((packs_df['piece_count_2'].isna() ) &(packs_df['num_puzzles'] ==2)), 'piece_count_2'] = int(avg_pc2)

In [20]:
# puzzles with no second piece_count, make second piece count 0 
# (because num_puzzle = 2 was dealt with earlier)
packs_df.loc[(packs_df['piece_count_2'].isna()) & (packs_df['num_puzzles'] ==1), 'piece_count_2'] = int(0)

In [21]:
packs_df.isnull().sum()

pack_name              0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    0
difficulty_rating_2    0
brand_2                0
brand_1                0
num_puzzles            0
dtype: int64

### 1.3 Merged data

In [22]:
df = member_holdtime_df.merge(packs_df, left_on='puzzlepack', right_on='pack_name', how='left')
df.head(2)

Unnamed: 0,memberID,holdtime,puzzlepack,pack_name,piece_count_1,piece_count_2,difficulty_rating_1,difficulty_rating_2,brand_2,brand_1,num_puzzles
0,member1,2.939411,Artifact Puzzles Justin Hillgrove Word Travels...,Artifact Puzzles Justin Hillgrove Word Travels...,456,548,A-Easy,Average,Artifact,Artifact,2.0
1,member1,0.998885,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,195,220,A-Easy,Hard,DaVici,DaVici,2.0


In [23]:
df.shape, member_holdtime_df.shape

((19387, 11), (19387, 3))

In [24]:
member_holdtime_df.puzzlepack.nunique(), packs_df.pack_name.nunique(), 

(968, 916)

In [25]:
len(set(member_holdtime_df.puzzlepack.unique()).intersection(set(packs_df.pack_name.unique())))

828

In [26]:
df.isnull().sum()
# about 5% of the data is missing

memberID                  0
holdtime                  0
puzzlepack                0
pack_name              1086
piece_count_1          1086
piece_count_2          1086
difficulty_rating_1    1086
difficulty_rating_2    1086
brand_2                1086
brand_1                1086
num_puzzles            1086
dtype: int64

<div class="alert alert-block alert-info">
<b>Tip:</b> # Ideally we should have information about all the packs. Since we do not have this information and it is recommended that we do not drop these rows, we will impute these with the average values.
</div>

### 1.4 Dealing with Nulls!
**Methodology**

* num_puzzles --> Count number of times 'Puzzle' appears in puzzlepack
* brand_1, brand_2 --> Take word before 'Puzzle' as brand name
* difficulty_rating --> Take average at brand level from known data
* piece_count --> Take average at brand level from known data

In [27]:
missing_pack_data = df.loc[df['pack_name'].isna()][['puzzlepack', 'piece_count_1', 'piece_count_2',
                               'difficulty_rating_1', 'difficulty_rating_2', 'brand_1', 
                                'brand_2', 'num_puzzles']]
missing_pack_data.reset_index(drop=True, inplace=True)

In [28]:
# getting number of puzzles
missing_pack_data['num_puzzles'] = missing_pack_data['puzzlepack'].apply(lambda x: (x.count('Puzzles')))

In [29]:
# getting brand_1
for i in range(0,len(missing_pack_data)):
    words = missing_pack_data['puzzlepack'][i].split()[0:2] 
    if 'Puzzles' in words[1:]: #if its the second or third word
        missing_pack_data['brand_1'][i] = words[words.index('Puzzles')-1]

In [30]:
# getting brand_2
for i in range(0,len(missing_pack_data)):
    words = missing_pack_data['puzzlepack'][i].split()[2:]
    if 'Puzzles' in words[1:]: #if its the 4th word onwards
        missing_pack_data['brand_2'][i] = words[words.index('Puzzles')-1]

In [31]:
missing_pack_data[['num_puzzles', 'brand_1', 'brand_2']].isnull().sum()

num_puzzles      0
brand_1        139
brand_2        333
dtype: int64

In [32]:
# if anything still missing in num_puzzles, brands
missing_pack_data['num_puzzles'].fillna(1, inplace=True)
missing_pack_data['brand_1'].fillna('unknown', inplace=True)
missing_pack_data['brand_2'].fillna('unknown', inplace=True)

In [33]:
# brand level average piece count values -- KNOWN data
groupby_brand_pieces_1 = df.groupby("brand_1")["piece_count_1"].mean()
groupby_brand_pieces_2 = df.groupby("brand_2")["piece_count_2"].mean()

In [34]:
# getting piece_count_1
for i in range(0, len(missing_pack_data)):
    try:
        missing_pack_data['piece_count_1'][i] = int(groupby_brand_pieces_1[missing_pack_data['brand_1'][i]])
        # getting piece_count_2 if needed
        if missing_pack_data['num_puzzles'][i] == 1:
            missing_pack_data['piece_count_2'][i] = 0
        else:
            missing_pack_data['piece_count_2'][i] = int(groupby_brand_pieces_2[missing_pack_data['brand_2'][i]])
    except Exception as e:
        pass
#         print(e)

In [35]:
# brand level most common difficulty values -- KNOWN data
groupby_brand_diff_1 = df.groupby("brand_1")["difficulty_rating_1"].agg(pd.Series.mode)
groupby_brand_diff_2 = df.groupby("brand_2")["difficulty_rating_2"].agg(pd.Series.mode)

In [36]:
# getting difficulty_rating_1
for i in range(0, len(missing_pack_data)):
    try:
        missing_pack_data['difficulty_rating_1'][i] = groupby_brand_diff_1[missing_pack_data['brand_1'][i]]
        # getting difficulty_rating_2 if needed
        if missing_pack_data['num_puzzles'][i] == 1:
            missing_pack_data['difficulty_rating_2'][i] = 'not_applicable'
        else:
            missing_pack_data['difficulty_rating_2'][i] = (groupby_brand_diff_2[missing_pack_data['brand_2'][i]])
    except Exception as e:
        pass

In [37]:
# if anything still missing in piece_count, difficulty_rating
# fill it with global average

In [38]:
missing_pack_data.isnull().sum()

puzzlepack               0
piece_count_1          150
piece_count_2          217
difficulty_rating_1    150
difficulty_rating_2    217
brand_1                  0
brand_2                  0
num_puzzles              0
dtype: int64

In [39]:
global_mean_pc_1 = int(df['piece_count_1'].mean())
global_mean_pc_2 = int(df['piece_count_2'].mean())
global_mode_df_1 = df['difficulty_rating_1'].mode()
global_mode_df_2 = df['difficulty_rating_2'].mode()

In [40]:
missing_pack_data['piece_count_1'].fillna(global_mean_pc_1, inplace=True)
missing_pack_data['piece_count_2'].fillna(global_mean_pc_2, inplace=True)

missing_pack_data['difficulty_rating_1'].fillna('global_mode_df_1', inplace=True)
missing_pack_data['difficulty_rating_2'].fillna('global_mode_df_2', inplace=True)

### Making a final packs data

In [41]:
packs_df = packs_df[['pack_name', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1', 
                             'difficulty_rating_2', 'brand_1', 'brand_2', 'num_puzzles']]

In [42]:
missing_pack_data = missing_pack_data.rename(columns = {'puzzlepack':'pack_name'})

In [209]:
len(set(missing_pack_data.pack_name.unique()).intersection(set(packs_df.pack_name.unique())))

0

In [43]:
packs_updated = pd.concat([packs_df, missing_pack_data])

In [49]:
df_cleaned = member_holdtime_df.merge(packs_updated, left_on='puzzlepack', right_on='pack_name', how='left')
df_cleaned.head(2)

Unnamed: 0,memberID,holdtime,puzzlepack,pack_name,piece_count_1,piece_count_2,difficulty_rating_1,difficulty_rating_2,brand_1,brand_2,num_puzzles
0,member1,2.939411,Artifact Puzzles Justin Hillgrove Word Travels...,Artifact Puzzles Justin Hillgrove Word Travels...,456,548,A-Easy,Average,Artifact,Artifact,2
1,member1,0.998885,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,195,220,A-Easy,Hard,DaVici,DaVici,2


In [50]:
df_cleaned.isnull().sum()

memberID               0
holdtime               0
puzzlepack             0
pack_name              0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    0
difficulty_rating_2    0
brand_1                0
brand_2                0
num_puzzles            0
dtype: int64

In [51]:
df_cleaned.shape, df.shape, member_holdtime_df.shape

((33817, 11), (19387, 11), (19387, 3))