## Run ensemble from v1 on full dataset format commonly used on Kaggle

In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter

# local imports
from prepare import *
from evaluate import *

### Read in initial datasets if needed

In [2]:
#raw_train, raw_train_labels, raw_test, specs, sample = read_raw_csvs()
#raw_train_labels = pd.read_csv('data/train_labels.csv')

### Load large train/test features from Josh's work

In [3]:
reduced_train = pd.read_csv('reduce_train.csv')
reduced_test = pd.read_csv('reduce_test.csv')
reduced_train.shape, reduced_test.shape

((17690, 891), (1000, 891))

In [4]:
#train = balance_classes(train)  ## try with balanced classes     --- results much worse

### Start throwing model mud at the wall

In [5]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
                                ExtraTreesClassifier, BaggingClassifier, \
                                GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')  #Ridge classifier throws some warnings about ill-conditioned matrix

### Baseline accuracy of 50% (or 25% if balanced)

In [6]:
reduced_train.accuracy_group.value_counts(normalize=True)

3    0.500000
0    0.239062
1    0.136292
2    0.124647
Name: accuracy_group, dtype: float64

# Initialize Models and start testing accuracy

#### KNN and SVC will require scaling, others shouldn't

In [7]:
rf = RandomForestClassifier()
#lr = LogisticRegression()               ## if they are commented out, they weren't performing well (or operator error...)
#sgd = SGDClassifier()
rc = RidgeClassifier()
#nb = GaussianNB()
ac = AdaBoostClassifier()
et = ExtraTreesClassifier()
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()
clf = CatBoostClassifier(
    loss_function='MultiClass',
    task_type="CPU",
    learning_rate=0.01,
    iterations=2000,
    od_type="Iter",
    early_stopping_rounds=500,
    random_seed=42
    )

In [8]:
knn = KNeighborsClassifier()
svc = SVC(probability=True, verbose=1)

## Evaluate model performance

In [10]:
# Catboost - accuracy 56%a
quick_eval(reduced_train, clf)#cv=True)

0:	learn: 1.3789760	total: 162ms	remaining: 5m 22s
1:	learn: 1.3721769	total: 250ms	remaining: 4m 10s
2:	learn: 1.3653490	total: 340ms	remaining: 3m 46s
3:	learn: 1.3586184	total: 440ms	remaining: 3m 39s
4:	learn: 1.3523827	total: 531ms	remaining: 3m 31s
5:	learn: 1.3462306	total: 626ms	remaining: 3m 28s
6:	learn: 1.3402063	total: 733ms	remaining: 3m 28s
7:	learn: 1.3345847	total: 837ms	remaining: 3m 28s
8:	learn: 1.3286372	total: 932ms	remaining: 3m 26s
9:	learn: 1.3232815	total: 1.03s	remaining: 3m 25s
10:	learn: 1.3178020	total: 1.13s	remaining: 3m 25s
11:	learn: 1.3125151	total: 1.23s	remaining: 3m 23s
12:	learn: 1.3071519	total: 1.32s	remaining: 3m 21s
13:	learn: 1.3021157	total: 1.42s	remaining: 3m 20s
14:	learn: 1.2971166	total: 1.52s	remaining: 3m 21s
15:	learn: 1.2920667	total: 1.65s	remaining: 3m 24s
16:	learn: 1.2873951	total: 1.75s	remaining: 3m 23s
17:	learn: 1.2827922	total: 1.84s	remaining: 3m 23s
18:	learn: 1.2784025	total: 1.94s	remaining: 3m 22s
19:	learn: 1.2741087	t

160:	learn: 1.0533977	total: 15.7s	remaining: 2m 59s
161:	learn: 1.0529702	total: 15.8s	remaining: 2m 59s
162:	learn: 1.0525839	total: 15.9s	remaining: 2m 59s
163:	learn: 1.0521497	total: 16s	remaining: 2m 59s
164:	learn: 1.0516652	total: 16.1s	remaining: 2m 59s
165:	learn: 1.0511167	total: 16.2s	remaining: 2m 59s
166:	learn: 1.0505464	total: 16.3s	remaining: 2m 59s
167:	learn: 1.0500819	total: 16.4s	remaining: 2m 58s
168:	learn: 1.0495961	total: 16.5s	remaining: 2m 58s
169:	learn: 1.0491341	total: 16.6s	remaining: 2m 58s
170:	learn: 1.0486838	total: 16.7s	remaining: 2m 58s
171:	learn: 1.0482236	total: 16.8s	remaining: 2m 58s
172:	learn: 1.0476574	total: 16.9s	remaining: 2m 58s
173:	learn: 1.0473671	total: 17s	remaining: 2m 58s
174:	learn: 1.0468848	total: 17.1s	remaining: 2m 58s
175:	learn: 1.0465825	total: 17.2s	remaining: 2m 58s
176:	learn: 1.0463022	total: 17.3s	remaining: 2m 57s
177:	learn: 1.0459140	total: 17.4s	remaining: 2m 57s
178:	learn: 1.0455337	total: 17.5s	remaining: 2m 5

316:	learn: 1.0114634	total: 30.8s	remaining: 2m 43s
317:	learn: 1.0113044	total: 30.9s	remaining: 2m 43s
318:	learn: 1.0110876	total: 31s	remaining: 2m 43s
319:	learn: 1.0109217	total: 31.1s	remaining: 2m 43s
320:	learn: 1.0106935	total: 31.2s	remaining: 2m 43s
321:	learn: 1.0105822	total: 31.3s	remaining: 2m 43s
322:	learn: 1.0104520	total: 31.4s	remaining: 2m 43s
323:	learn: 1.0102891	total: 31.5s	remaining: 2m 42s
324:	learn: 1.0101846	total: 31.6s	remaining: 2m 42s
325:	learn: 1.0100800	total: 31.7s	remaining: 2m 42s
326:	learn: 1.0099381	total: 31.8s	remaining: 2m 42s
327:	learn: 1.0098016	total: 31.9s	remaining: 2m 42s
328:	learn: 1.0096293	total: 32s	remaining: 2m 42s
329:	learn: 1.0095519	total: 32.1s	remaining: 2m 42s
330:	learn: 1.0093628	total: 32.2s	remaining: 2m 42s
331:	learn: 1.0092107	total: 32.2s	remaining: 2m 42s
332:	learn: 1.0090757	total: 32.3s	remaining: 2m 41s
333:	learn: 1.0089361	total: 32.4s	remaining: 2m 41s
334:	learn: 1.0087106	total: 32.5s	remaining: 2m 4

473:	learn: 0.9899125	total: 46.2s	remaining: 2m 28s
474:	learn: 0.9898290	total: 46.3s	remaining: 2m 28s
475:	learn: 0.9897649	total: 46.4s	remaining: 2m 28s
476:	learn: 0.9897101	total: 46.5s	remaining: 2m 28s
477:	learn: 0.9895860	total: 46.6s	remaining: 2m 28s
478:	learn: 0.9894730	total: 46.7s	remaining: 2m 28s
479:	learn: 0.9894538	total: 46.8s	remaining: 2m 28s
480:	learn: 0.9893737	total: 46.9s	remaining: 2m 28s
481:	learn: 0.9892662	total: 47s	remaining: 2m 28s
482:	learn: 0.9891803	total: 47.1s	remaining: 2m 27s
483:	learn: 0.9890556	total: 47.2s	remaining: 2m 27s
484:	learn: 0.9889458	total: 47.3s	remaining: 2m 27s
485:	learn: 0.9887949	total: 47.4s	remaining: 2m 27s
486:	learn: 0.9886821	total: 47.5s	remaining: 2m 27s
487:	learn: 0.9884861	total: 47.6s	remaining: 2m 27s
488:	learn: 0.9883454	total: 47.7s	remaining: 2m 27s
489:	learn: 0.9882378	total: 47.8s	remaining: 2m 27s
490:	learn: 0.9882112	total: 48s	remaining: 2m 27s
491:	learn: 0.9881198	total: 48.1s	remaining: 2m 2

631:	learn: 0.9744480	total: 1m 1s	remaining: 2m 14s
632:	learn: 0.9744018	total: 1m 2s	remaining: 2m 14s
633:	learn: 0.9743357	total: 1m 2s	remaining: 2m 13s
634:	learn: 0.9742796	total: 1m 2s	remaining: 2m 13s
635:	learn: 0.9742144	total: 1m 2s	remaining: 2m 13s
636:	learn: 0.9740709	total: 1m 2s	remaining: 2m 13s
637:	learn: 0.9740042	total: 1m 2s	remaining: 2m 13s
638:	learn: 0.9739825	total: 1m 2s	remaining: 2m 13s
639:	learn: 0.9738584	total: 1m 2s	remaining: 2m 13s
640:	learn: 0.9737439	total: 1m 2s	remaining: 2m 13s
641:	learn: 0.9736884	total: 1m 2s	remaining: 2m 13s
642:	learn: 0.9736630	total: 1m 3s	remaining: 2m 12s
643:	learn: 0.9735458	total: 1m 3s	remaining: 2m 12s
644:	learn: 0.9735176	total: 1m 3s	remaining: 2m 12s
645:	learn: 0.9734091	total: 1m 3s	remaining: 2m 12s
646:	learn: 0.9733115	total: 1m 3s	remaining: 2m 12s
647:	learn: 0.9732105	total: 1m 3s	remaining: 2m 12s
648:	learn: 0.9730449	total: 1m 3s	remaining: 2m 12s
649:	learn: 0.9729760	total: 1m 3s	remaining: 

789:	learn: 0.9621529	total: 1m 17s	remaining: 1m 58s
790:	learn: 0.9620838	total: 1m 17s	remaining: 1m 58s
791:	learn: 0.9619781	total: 1m 17s	remaining: 1m 58s
792:	learn: 0.9619141	total: 1m 17s	remaining: 1m 57s
793:	learn: 0.9618420	total: 1m 17s	remaining: 1m 57s
794:	learn: 0.9618104	total: 1m 17s	remaining: 1m 57s
795:	learn: 0.9617475	total: 1m 17s	remaining: 1m 57s
796:	learn: 0.9617240	total: 1m 17s	remaining: 1m 57s
797:	learn: 0.9616626	total: 1m 17s	remaining: 1m 57s
798:	learn: 0.9616243	total: 1m 18s	remaining: 1m 57s
799:	learn: 0.9615548	total: 1m 18s	remaining: 1m 57s
800:	learn: 0.9615310	total: 1m 18s	remaining: 1m 57s
801:	learn: 0.9614296	total: 1m 18s	remaining: 1m 57s
802:	learn: 0.9613327	total: 1m 18s	remaining: 1m 56s
803:	learn: 0.9613033	total: 1m 18s	remaining: 1m 56s
804:	learn: 0.9611857	total: 1m 18s	remaining: 1m 56s
805:	learn: 0.9610703	total: 1m 18s	remaining: 1m 56s
806:	learn: 0.9610437	total: 1m 18s	remaining: 1m 56s
807:	learn: 0.9609483	total:

941:	learn: 0.9504898	total: 1m 32s	remaining: 1m 43s
942:	learn: 0.9503786	total: 1m 32s	remaining: 1m 43s
943:	learn: 0.9503309	total: 1m 32s	remaining: 1m 43s
944:	learn: 0.9502401	total: 1m 32s	remaining: 1m 43s
945:	learn: 0.9501529	total: 1m 32s	remaining: 1m 43s
946:	learn: 0.9500710	total: 1m 32s	remaining: 1m 43s
947:	learn: 0.9499759	total: 1m 32s	remaining: 1m 43s
948:	learn: 0.9499340	total: 1m 32s	remaining: 1m 42s
949:	learn: 0.9498921	total: 1m 33s	remaining: 1m 42s
950:	learn: 0.9498180	total: 1m 33s	remaining: 1m 42s
951:	learn: 0.9497258	total: 1m 33s	remaining: 1m 42s
952:	learn: 0.9496750	total: 1m 33s	remaining: 1m 42s
953:	learn: 0.9495967	total: 1m 33s	remaining: 1m 42s
954:	learn: 0.9494869	total: 1m 33s	remaining: 1m 42s
955:	learn: 0.9494200	total: 1m 33s	remaining: 1m 42s
956:	learn: 0.9493718	total: 1m 33s	remaining: 1m 42s
957:	learn: 0.9492783	total: 1m 33s	remaining: 1m 42s
958:	learn: 0.9491486	total: 1m 33s	remaining: 1m 41s
959:	learn: 0.9490104	total:

1093:	learn: 0.9379956	total: 1m 49s	remaining: 1m 30s
1094:	learn: 0.9378367	total: 1m 49s	remaining: 1m 30s
1095:	learn: 0.9378037	total: 1m 49s	remaining: 1m 30s
1096:	learn: 0.9377636	total: 1m 49s	remaining: 1m 30s
1097:	learn: 0.9376633	total: 1m 49s	remaining: 1m 30s
1098:	learn: 0.9375795	total: 1m 49s	remaining: 1m 30s
1099:	learn: 0.9374706	total: 1m 50s	remaining: 1m 30s
1100:	learn: 0.9373416	total: 1m 50s	remaining: 1m 30s
1101:	learn: 0.9372468	total: 1m 50s	remaining: 1m 29s
1102:	learn: 0.9371524	total: 1m 50s	remaining: 1m 29s
1103:	learn: 0.9371027	total: 1m 50s	remaining: 1m 29s
1104:	learn: 0.9370561	total: 1m 50s	remaining: 1m 29s
1105:	learn: 0.9369734	total: 1m 50s	remaining: 1m 29s
1106:	learn: 0.9368523	total: 1m 51s	remaining: 1m 29s
1107:	learn: 0.9367826	total: 1m 51s	remaining: 1m 29s
1108:	learn: 0.9367006	total: 1m 51s	remaining: 1m 29s
1109:	learn: 0.9366032	total: 1m 51s	remaining: 1m 29s
1110:	learn: 0.9365093	total: 1m 51s	remaining: 1m 29s
1111:	lear

1245:	learn: 0.9252566	total: 2m 9s	remaining: 1m 18s
1246:	learn: 0.9252124	total: 2m 9s	remaining: 1m 18s
1247:	learn: 0.9251377	total: 2m 9s	remaining: 1m 17s
1248:	learn: 0.9250746	total: 2m 9s	remaining: 1m 17s
1249:	learn: 0.9250339	total: 2m 9s	remaining: 1m 17s
1250:	learn: 0.9249415	total: 2m 9s	remaining: 1m 17s
1251:	learn: 0.9248122	total: 2m 9s	remaining: 1m 17s
1252:	learn: 0.9247234	total: 2m 10s	remaining: 1m 17s
1253:	learn: 0.9246742	total: 2m 10s	remaining: 1m 17s
1254:	learn: 0.9246337	total: 2m 10s	remaining: 1m 17s
1255:	learn: 0.9245915	total: 2m 10s	remaining: 1m 17s
1256:	learn: 0.9244892	total: 2m 10s	remaining: 1m 17s
1257:	learn: 0.9244422	total: 2m 10s	remaining: 1m 17s
1258:	learn: 0.9243396	total: 2m 10s	remaining: 1m 16s
1259:	learn: 0.9242807	total: 2m 10s	remaining: 1m 16s
1260:	learn: 0.9242339	total: 2m 11s	remaining: 1m 16s
1261:	learn: 0.9241784	total: 2m 11s	remaining: 1m 16s
1262:	learn: 0.9241064	total: 2m 11s	remaining: 1m 16s
1263:	learn: 0.92

1397:	learn: 0.9143319	total: 2m 25s	remaining: 1m 2s
1398:	learn: 0.9142657	total: 2m 25s	remaining: 1m 2s
1399:	learn: 0.9142161	total: 2m 26s	remaining: 1m 2s
1400:	learn: 0.9141813	total: 2m 26s	remaining: 1m 2s
1401:	learn: 0.9141070	total: 2m 26s	remaining: 1m 2s
1402:	learn: 0.9140188	total: 2m 26s	remaining: 1m 2s
1403:	learn: 0.9139602	total: 2m 26s	remaining: 1m 2s
1404:	learn: 0.9138480	total: 2m 26s	remaining: 1m 2s
1405:	learn: 0.9138123	total: 2m 26s	remaining: 1m 2s
1406:	learn: 0.9137255	total: 2m 26s	remaining: 1m 1s
1407:	learn: 0.9136743	total: 2m 27s	remaining: 1m 1s
1408:	learn: 0.9136092	total: 2m 27s	remaining: 1m 1s
1409:	learn: 0.9135407	total: 2m 27s	remaining: 1m 1s
1410:	learn: 0.9134316	total: 2m 27s	remaining: 1m 1s
1411:	learn: 0.9134092	total: 2m 27s	remaining: 1m 1s
1412:	learn: 0.9132932	total: 2m 27s	remaining: 1m 1s
1413:	learn: 0.9132464	total: 2m 27s	remaining: 1m 1s
1414:	learn: 0.9131810	total: 2m 27s	remaining: 1m 1s
1415:	learn: 0.9131313	total

1552:	learn: 0.9044349	total: 2m 41s	remaining: 46.6s
1553:	learn: 0.9043360	total: 2m 41s	remaining: 46.5s
1554:	learn: 0.9043056	total: 2m 42s	remaining: 46.4s
1555:	learn: 0.9042351	total: 2m 42s	remaining: 46.3s
1556:	learn: 0.9041689	total: 2m 42s	remaining: 46.1s
1557:	learn: 0.9041165	total: 2m 42s	remaining: 46s
1558:	learn: 0.9040699	total: 2m 42s	remaining: 45.9s
1559:	learn: 0.9039963	total: 2m 42s	remaining: 45.8s
1560:	learn: 0.9039821	total: 2m 42s	remaining: 45.7s
1561:	learn: 0.9039325	total: 2m 42s	remaining: 45.6s
1562:	learn: 0.9038953	total: 2m 42s	remaining: 45.5s
1563:	learn: 0.9037929	total: 2m 42s	remaining: 45.4s
1564:	learn: 0.9037375	total: 2m 42s	remaining: 45.3s
1565:	learn: 0.9036690	total: 2m 43s	remaining: 45.2s
1566:	learn: 0.9036167	total: 2m 43s	remaining: 45.1s
1567:	learn: 0.9035243	total: 2m 43s	remaining: 45s
1568:	learn: 0.9034878	total: 2m 43s	remaining: 44.9s
1569:	learn: 0.9034176	total: 2m 43s	remaining: 44.8s
1570:	learn: 0.9033759	total: 2m

1705:	learn: 0.8954994	total: 2m 57s	remaining: 30.6s
1706:	learn: 0.8953797	total: 2m 57s	remaining: 30.5s
1707:	learn: 0.8953293	total: 2m 58s	remaining: 30.4s
1708:	learn: 0.8953003	total: 2m 58s	remaining: 30.3s
1709:	learn: 0.8952854	total: 2m 58s	remaining: 30.2s
1710:	learn: 0.8952426	total: 2m 58s	remaining: 30.1s
1711:	learn: 0.8951757	total: 2m 58s	remaining: 30s
1712:	learn: 0.8951124	total: 2m 58s	remaining: 29.9s
1713:	learn: 0.8950606	total: 2m 58s	remaining: 29.8s
1714:	learn: 0.8950024	total: 2m 58s	remaining: 29.7s
1715:	learn: 0.8949233	total: 2m 58s	remaining: 29.6s
1716:	learn: 0.8948796	total: 2m 58s	remaining: 29.5s
1717:	learn: 0.8948146	total: 2m 59s	remaining: 29.4s
1718:	learn: 0.8947101	total: 2m 59s	remaining: 29.3s
1719:	learn: 0.8946956	total: 2m 59s	remaining: 29.2s
1720:	learn: 0.8946608	total: 2m 59s	remaining: 29.1s
1721:	learn: 0.8946444	total: 2m 59s	remaining: 29s
1722:	learn: 0.8945335	total: 2m 59s	remaining: 28.9s
1723:	learn: 0.8944854	total: 2m

1860:	learn: 0.8865785	total: 3m 13s	remaining: 14.5s
1861:	learn: 0.8864802	total: 3m 13s	remaining: 14.4s
1862:	learn: 0.8864378	total: 3m 13s	remaining: 14.3s
1863:	learn: 0.8863981	total: 3m 14s	remaining: 14.2s
1864:	learn: 0.8863048	total: 3m 14s	remaining: 14.1s
1865:	learn: 0.8862373	total: 3m 14s	remaining: 13.9s
1866:	learn: 0.8861645	total: 3m 14s	remaining: 13.8s
1867:	learn: 0.8861000	total: 3m 14s	remaining: 13.7s
1868:	learn: 0.8860602	total: 3m 14s	remaining: 13.6s
1869:	learn: 0.8860032	total: 3m 14s	remaining: 13.5s
1870:	learn: 0.8859337	total: 3m 14s	remaining: 13.4s
1871:	learn: 0.8858878	total: 3m 14s	remaining: 13.3s
1872:	learn: 0.8857935	total: 3m 14s	remaining: 13.2s
1873:	learn: 0.8857759	total: 3m 15s	remaining: 13.1s
1874:	learn: 0.8857560	total: 3m 15s	remaining: 13s
1875:	learn: 0.8856780	total: 3m 15s	remaining: 12.9s
1876:	learn: 0.8856668	total: 3m 15s	remaining: 12.8s
1877:	learn: 0.8856156	total: 3m 15s	remaining: 12.7s
1878:	learn: 0.8855790	total: 

('<catboost.core.CatBoostClassifier object at 0x000001934B70A550>',
 0.6201243640474845)

In [11]:
quick_eval(reduced_train, rf, cv=True)
#quick_eval(reduced_train, lr, cv=True)
#quick_eval(reduced_train, sgd, cv=True)
quick_eval(reduced_train, rc, cv=True)
#quick_eval(reduced_train, nb, cv=True)
quick_eval(reduced_train, ac, cv=True)
quick_eval(reduced_train, et, cv=True)
quick_eval(reduced_train, bc, cv=True)
quick_eval(reduced_train, gbc, cv=True)
#quick_eval(reduced_train, clf, cv=True)


#quick_eval(reduced_train, knn, scale=True, cv=True)
#quick_eval(reduced_train, svc, scale=True, cv=True)

The CV score of RandomForestClassifier is 0.5630864895421142
The CV score of RidgeClassifier is 0.5280949689089882
The CV score of AdaBoostClassifier is 0.5296212549462973
The CV score of ExtraTreesClassifier is 0.5399095534200112
The CV score of BaggingClassifier is 0.5112492933860939
The CV score of GradientBoostingClassifier is 0.43521763708309785


('GradientBoostingClassifier', 0.43521763708309785)

## more models, to include sklearn neural net

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier

In [16]:
mlp = MLPClassifier()
gpc = GaussianProcessClassifier() #takes a long time
#rbf = RBF()
dt = DecisionTreeClassifier()

In [None]:
quick_eval(reduced_train, mlp, cv=True)
quick_eval(reduced_train, gpc) #   TRY LATER, THIS WAS VERY SLOW
#quick_eval(reduced_train, rbf, cv=True)   poor
quick_eval(reduced_train, dt, cv=True)    

In [None]:
bc_rf = BaggingClassifier(
    base_estimator=RandomForestClassifier(max_depth=10),
    n_estimators=20)

bc_gbc = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(),
    n_estimators=20)

bc_abc = BaggingClassifier(
    base_estimator=AdaBoostClassifier(),
    n_estimators=20)

quick_eval(reduced_train, bc_rf)    # 0.549 with 20 estimators max depth of 10  (.377 with balanced)    0.551 with 200 
quick_eval(reduced_train, bc_gbc)   # 0.576 with 20 estimators (.389 with balanced)                     0.578 with 200
quick_eval(reduced_train, bc_abc)   # 0.563 with 20 estimators  (.374 with balanced)                    0.561 with 200

## Model Ensembling

In [None]:
vc = VotingClassifier(estimators=[
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    ('gpc', gpc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
    #('catboost', clf)
    ],
     n_jobs=-1,
     voting='hard')

quick_eval(reduced_train, vc)#, cv=True)     #initial w/o catboost .564

In [None]:
estimators = [
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    ('gpc', gpc),
    #('svc', svc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
]
stacking_clf = StackingClassifier(estimators=estimators, 
                final_estimator=RandomForestClassifier(), n_jobs=-1)        # created, load from pickle
quick_eval(reduced_train, stacking_clf)

In [40]:
##joblib.dump(stacking_clf, 'fitted_stacked_classifier.pkl')
##joblib.dump(vc, 'fitted_voting_classifier.pkl')

['fitted_voting_classifier.pkl']

## Create submission based on Brad's data prep

In [28]:
test_numerics_only = test._get_numeric_data()
test_prediction = stacking_clf.predict(test_numerics_only)

# To create a submission:
submission = pd.DataFrame()
submission['installation_id'] = test.installation_id
submission['accuracy_group'] = test_prediction
#submission.head()
submission.to_csv('preds_balanced.csv')
submission.accuracy_group.value_counts()

3    379
0    247
2    216
1    158
Name: accuracy_group, dtype: int64

In [42]:
test

Unnamed: 0,timestamp,event_count,event_code,game_time,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),...,total_event_count,avg_event_count,avg_review_incorrect_feedback,avg_review_correct_feedback,total_rounds_beat,total_movies_skipped,total_movies_watched,total_elsewhere_clicks,total_help_button_clicks,total_play_again
867,1568296332193000000,1,2000,0,2.0,0.0,79.0,1.0,0.0,0.0,...,47503,54.726959,1363.212121,2750.333333,7,0,0,190,0,0
2718,1570652596209000000,1,2000,0,1.0,72.0,56.0,3.0,61.0,221.0,...,174649,64.232806,2279.760000,2611.152778,82,2,7,385,2,0
149,1569065301757000000,1,2000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,5191,34.606667,0.000000,0.000000,0,0,0,14,0,0
233,1564244890394000000,1,2000,0,1.0,0.0,78.0,0.0,0.0,0.0,...,9681,41.371795,3377.500000,2650.666667,3,0,0,28,1,0
951,1567793126197000000,1,2000,0,0.0,0.0,420.0,1.0,0.0,0.0,...,101904,107.042017,318.037037,2936.444444,25,0,1,134,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,1569635090390000000,1,2000,0,0.0,0.0,92.0,0.0,82.0,0.0,...,7407,34.938679,968.000000,2215.750000,7,0,0,41,0,1
302,1570987104776000000,1,2000,0,1.0,0.0,0.0,2.0,0.0,0.0,...,9459,31.217822,502.125000,3566.875000,7,0,0,56,0,2
525,1570480618937000000,1,2000,0,1.0,0.0,0.0,3.0,0.0,0.0,...,30362,57.722433,2216.500000,2076.300000,6,0,0,136,0,0
258,1568142042792000000,1,2000,0,0.0,0.0,0.0,1.0,0.0,0.0,...,9365,36.158301,1259.000000,3309.500000,2,0,0,45,2,0


## Try original datasets

In [88]:
#trainX = pd.read_csv('trainX.csv')
#test = pd.read_csv('trainY.csv')
#testX = pd.read_csv('testX.csv')

In [89]:
train = pd.read_csv('full_train.csv')

In [97]:
test = pd.read_csv('original_X_test.csv')

In [106]:
train = train.drop('game_session', axis=1)
test = test.drop('game_session', axis=1)

In [107]:
vc = VotingClassifier(estimators=[
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
    #('catboost', clf)
    ],
     n_jobs=-1,
     voting='hard')

quick_eval(train, vc)#, cv=True)     #initial w/o catboost .564

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


The accuracy of VotingClassifier is 0.6044260027662517


('VotingClassifier', 0.6044260027662517)

In [84]:
estimators = [
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
]
stacking_clf = StackingClassifier(estimators=estimators, 
                final_estimator=RandomForestClassifier(), n_jobs=-1)        # created, load from pickle
quick_eval(train, stacking_clf)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


The accuracy of StackingClassifier is 0.6085753803596127


('StackingClassifier', 0.6085753803596127)

In [104]:
test.head()

Unnamed: 0,game_session,Clip,Activity,Assessment,Game,Chow Time,Pirate's Tale,Chicken Balancer (Activity),Dino Drink,Bug Measurer (Activity),...,session_title_33,session_title_34,session_title_35,session_title_36,session_title_37,session_title_38,session_title_39,session_title_40,session_title_41,session_title_42
0,348d7f09f96af313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1fef5d54cb4b775a,0.0,0.0,0.111111,0.0,0.011361,0.0,0.159763,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,4b165a330a0bdd6c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,be0b655ad1fee30c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,46e8bbed71df7520,0.0,0.0,0.0,0.0,0.022516,0.0,0.326923,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Predict on original Datasets

In [108]:
#test_numerics_only = test._get_numeric_data()
final_prediction = vc.predict(test)
sample_sub = pd.read_csv('data/sample_submission.csv')
# To create a submission:
submission = pd.DataFrame()
submission['installation_id'] = sample_sub.installation_id
submission['accuracy_group'] = final_prediction
submission.head()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3


In [109]:
submission.accuracy_group.value_counts()

3    732
0    252
1     14
2      2
Name: accuracy_group, dtype: int64

In [110]:
submission.to_csv('preds2.csv', index=False)
submission.accuracy_group.value_counts()

3    732
0    252
1     14
2      2
Name: accuracy_group, dtype: int64

In [111]:
from sklearn.metrics import cohen_kappa_score

In [40]:
cohen_kappa_score(y_pred, y_test)

NameError: name 'cohen_kappa_score' is not defined

In [42]:
reduced_train = pd.read_csv('reduce_train.csv')
reduced_test = pd.read_csv('reduce_test.csv')
reduced_train.shape, reduced_test.shape

((17690, 891), (1000, 891))

In [44]:
quick_eval(train, rf, cv=True)
quick_eval(reduced_train, rf, cv=True)

The CV score of RandomForestClassifier is 0.5391746749576032
The CV score of RandomForestClassifier is 0.5603165630299605


('RandomForestClassifier', 0.5603165630299605)