In [176]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter

# local imports
from prepare import *

In [177]:
raw_train = pd.read_csv('data/train.csv')
raw_train_labels = pd.read_csv('data/train_labels.csv')
raw_test = pd.read_csv('data/test.csv')
specs = pd.read_csv('data/specs.csv')
sample = pd.read_csv('data/sample_submission.csv')

## Shorten dataframes to speed up testing

In [178]:
%%script false

frames = [raw_train, raw_train_labels, raw_test]#, specs, sample]

for i, df in enumerate(frames):
    frames[i] = frames[i][0:round(0.1*frames[i].shape[0])]
    print(frames[i].shape)

Couldn't find program: 'false'


In [237]:
train = raw_train.copy()
train_labels = raw_train_labels.copy()
test = raw_test.copy()

## Run modified remove_dead_weight function

In [238]:
train = remove_dead_weight(train, train_labels)
test = remove_dead_weight(test, train_labels, test_set=True)

In [239]:
train.shape

(7696400, 11)

## Process time

In [240]:
# convert timestamp to correct datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format=True)
test['timestamp'] = pd.to_datetime(test['timestamp'], infer_datetime_format=True)

In [241]:
# have the ability to break datetime into year, month, day, etc with add_datepart in library

## Encode data

https://towardsdatascience.com/understanding-feature-engineering-part-2-categorical-data-f54324193e63

In [242]:
train, test = encode_col(train, test, 'title')
train, test = encode_col(train, test, 'world')

In [243]:
train.shape

(7696400, 59)

## Bin event codes

In [244]:
#TBD

## Process Data

https://www.kaggle.com/ragnar123/truncated-train-ensemble

In [245]:
compiled_train_data = process_data(train)
compiled_test_data = process_data(test, test_set=True)

HBox(children=(IntProgress(value=0, max=3614), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3614), HTML(value='')))




In [246]:
len(compiled_train_data)

17690

In [279]:
compiled_train = pd.concat(compiled_train_data, axis=1).T

In [280]:
compiled_train.drop('installation_id_slice', axis=1, inplace=True)

In [281]:
compiled_test = pd.concat(compiled_test_data, axis=1).T

In [282]:
print(set(compiled_train.columns).difference(set(compiled_test.columns)))
compiled_train.shape, compiled_test.shape

set()


((17690, 63), (3614, 63))

## Add accuracy group from train_labels dataframe

In [251]:
# just verifying
train_labels.columns

Index(['game_session', 'installation_id', 'title', 'num_correct',
       'num_incorrect', 'accuracy', 'accuracy_group'],
      dtype='object')

In [252]:
compiled_train = pd.merge(compiled_train, train_labels[['installation_id','game_session','accuracy_group']], \
                  on=['installation_id','game_session'])
compiled_train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,...,title_Welcome to Lost Lagoon!,world_CRYSTALCAVES,world_MAGMAPEAK,world_NONE,world_TREETOPCITY,total_event_count,avg_event_count,total_game_time,avg_game_time,accuracy_group
0,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:01.344000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,2,0,235,2,411,36473,56.2855,35855793,5122260.0,3
1,f56e0afc,77b8ee947eb84b4e,2019-08-06 05:35:19.167000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Bird Measurer (Assessment),Assessment,...,2,0,235,2,907,76743,67.083,71139760,6467250.0,0
2,3bfd1a65,6bdf9623adc94d89,2019-08-06 05:37:50.020000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,2,0,235,2,994,80571,65.4517,75295894,6274660.0,3
3,3bfd1a65,9501794defd84e4d,2019-08-06 20:34:53.812000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,3,0,728,3,1412,134171,62.609,196331339,8924150.0,2
4,f56e0afc,a9ef3ecb3d1acc6a,2019-08-06 20:49:59.095000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Bird Measurer (Assessment),Assessment,...,3,0,728,3,1839,159744,62.1572,230347603,8859520.0,3


## Convert as many columns as possible to numeric

In [253]:
def numerize(df):
    for i, column in enumerate(df.columns):
        col = df.columns[i]
        df[col] = pd.to_numeric(df[col], errors='ignore')
    return df

In [254]:
# for i, column in enumerate(compiled_train.columns):
#     col = compiled_train.columns[i]
#     compiled_train[col] = pd.to_numeric(compiled_train[col], errors='ignore')

# compiled_train.info()

compiled_train = numerize(compiled_train)
compiled_test = numerize(compiled_test)


In [255]:
compiled_train.shape, compiled_test.shape

((17690, 64), (3614, 63))

## Train RF model

In [256]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [257]:
scaler = StandardScaler()

# get numeric fields only
#X = compiled_train._get_numeric_data().drop('accuracy_group', axis=1).iloc[:,:-3]

X = compiled_train.drop('accuracy_group', axis=1)._get_numeric_data()
print(X.shape)
#print(compiled_test.shape)
X_scaled = scaler.fit_transform(X)
y = compiled_train.accuracy_group

compiled_test_scaled = scaler.transform(compiled_test._get_numeric_data())
print(compiled_test_scaled.shape)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.2, random_state=42)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

(17690, 56)
(3614, 56)




0.48191068400226117

In [269]:
#test_prediction = rf.predict(compiled_test_scaled)
#list(zip(compiled_test.installation_id, test_prediction))

In [270]:
# To create a submission:
# submission = pd.DataFrame()
# submission['installation_id'] = compiled_test.installation_id
# submission['accuracy_group'] = test_prediction
# submission.head()

In [271]:
#submission.accuracy_group.value_counts()

In [272]:
#submission.to_csv('submission_1_1_2020.csv',index=False)

## Balance classes

In [273]:
from sklearn.utils import resample

# Separate classes
df_0 = compiled_train[compiled_train.accuracy_group==0]
df_1 = compiled_train[compiled_train.accuracy_group==1]
df_2 = compiled_train[compiled_train.accuracy_group==2]
df_3 = compiled_train[compiled_train.accuracy_group==3]

# Downsample 3, 0, 1 to 2's level - n=419
resampled_dfs = [df_2]

for i in [df_0,df_1,df_3]:  
    downsampled_df = resample(i, 
                              replace=False, # sample without replacement
                              n_samples = min(compiled_train.accuracy_group.value_counts()), #  to match minority
                              random_state = 42)  # reproducibility
    resampled_dfs.append(downsampled_df)

balanced_compiled_train = pd.concat(resampled_dfs, axis=0)

balanced_compiled_train.accuracy_group.value_counts()

3    2205
2    2205
1    2205
0    2205
Name: accuracy_group, dtype: int64

In [274]:
scaler = StandardScaler()

X = balanced_compiled_train._get_numeric_data().drop('accuracy_group', axis=1)
X_scaled = scaler.fit_transform(X)
y = balanced_compiled_train.accuracy_group

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.2, random_state=42)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)



0.3344671201814059

In [275]:
test_prediction = rf.predict(compiled_test_scaled)
list(zip(compiled_test.installation_id, test_prediction))

[('0006a69f', 0),
 ('0006c192', 3),
 ('00129856', 0),
 ('001d0ed0', 2),
 ('00225f67', 0),
 ('00279ac5', 1),
 ('002db7e3', 2),
 ('003372b0', 1),
 ('004c2091', 0),
 ('00634433', 0),
 ('00667b88', 0),
 ('00691033', 1),
 ('00a0dbeb', 0),
 ('00a53963', 0),
 ('00ad158e', 0),
 ('00b9d8e6', 0),
 ('00cef781', 0),
 ('00e17272', 0),
 ('00e536bf', 1),
 ('00fa8681', 1),
 ('00fc65b6', 2),
 ('010bc1d5', 0),
 ('01120f12', 0),
 ('0153c957', 0),
 ('0155dd86', 0),
 ('015776b4', 0),
 ('01582211', 0),
 ('0160e7c5', 0),
 ('01825124', 2),
 ('01bdd720', 3),
 ('01cc53f3', 3),
 ('01d5e1f5', 3),
 ('01f71702', 1),
 ('01faa4bf', 3),
 ('022183ab', 3),
 ('0235fe9a', 2),
 ('023c4bcb', 0),
 ('02490b06', 0),
 ('026035e9', 0),
 ('026e3733', 1),
 ('027b3c4c', 3),
 ('0281f6bf', 1),
 ('02875dc2', 2),
 ('0293ae74', 0),
 ('02aa515f', 3),
 ('02b1fab4', 1),
 ('02c975bc', 0),
 ('02cdd48e', 0),
 ('02cdfffd', 2),
 ('02e76f43', 0),
 ('031af3b8', 0),
 ('0328fc8f', 0),
 ('03401e19', 3),
 ('03560e11', 0),
 ('03957682', 2),
 ('039e4ac

In [276]:
submission = pd.DataFrame()
submission['installation_id'] = compiled_test.installation_id
submission['accuracy_group'] = test_prediction
submission.head()

Unnamed: 0,installation_id,accuracy_group
3783,0006a69f,0
2220,0006c192,3
826,00129856,0
1019,001d0ed0,2
928,00225f67,0


In [277]:
submission.accuracy_group.value_counts()

0    1964
1     724
2     552
3     374
Name: accuracy_group, dtype: int64

In [278]:
submission.to_csv('submission_1_1_2020.csv',index=False)

## determine why installation_ids in sample submission aren't found

In [291]:
sample.installation_id.nunique(), test.installation_id.nunique()

(1000, 3614)

In [292]:
compiled_test.installation_id.nunique()

3614