In [1]:
import sys
sys.path.insert(0,'/home/watts/Software')

In [2]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix, f1_score, accuracy_score

from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.utils import shuffle

import pickle



In [3]:

def save_classifier(fname, clf):
    """Pickle ``clf`` into the file at ``fname``.

    The file is opened in binary write mode, so any existing content at that
    path is replaced.
    """
    with open(fname, mode='wb') as out_file:
        pickle.dump(clf, out_file)

def load_classifier(fname):
    """Return the object that was pickled into the file at ``fname``.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(fname, mode='rb') as in_file:
        return pickle.load(in_file)

In [4]:
# Feature matrices are loaded with np.load, so despite the df_ prefix these are
# NumPy arrays rather than DataFrames.
df_train, df_test = (
    np.load('../cache/train_stage2_fe2.npy'),
    np.load('../cache/test_stage2_fe2.npy'),
)

# Training labels: the 'y' column of the labels CSV.
df1 = pd.read_csv('../cache/stage2_labels.csv')
y = df1.y.values

# Test-set identifiers: the 'ID' column of the id CSV.
df = pd.read_csv('../cache/stage2_test_id.csv')
pid = df['ID'].values

In [5]:
len(pid)

986

In [6]:
df_train.shape

(3689, 4689)

In [7]:
df_test.shape

(986, 4689)

In [8]:
# y = y - 1  # shift labels to be zero-based (disabled)

In [9]:
print(y.shape)

(3689,)


In [10]:
np.unique(y)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# The per-class weights are stored as a dict wrapped in a 0-d object array (the
# printed value is a {class_label: weight} dict).  Object arrays are pickled
# inside the .npy file, and NumPy >= 1.16.3 refuses to unpickle them unless
# allow_pickle=True is passed, so the flag is given explicitly.
# Unpickling can execute arbitrary code: only do this for cache files that were
# produced locally.
wts_per_class = np.load('../cache/stage2_train_weights_per_class.npy', allow_pickle=True)
# tolist() on a 0-d array unwraps it, yielding the plain dict.
wts_per_class = wts_per_class.tolist()
print(wts_per_class)

{1: 4.572507552870091, 2: 6.407630522088353, 3: 37.427083333333336, 4: 3.912117177097204, 5: 12.816479400749063, 6: 11.42087542087542, 7: 2.5, 8: 174.66666666666666, 9: 84.79069767441861}


In [12]:
# Hyper-parameter grid explored by the cross-validated search.
# NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
# classifiers, so those two entries likely fit identical models - confirm and
# consider dropping one to cut the search time.
param_grid = dict(
    n_estimators=[50, 200, 700],
    max_features=['auto', 'sqrt', 'log2'],
)

# 5-fold grid search over a class-balanced random forest, scored by negative
# log-loss and run on 4 worker processes.
model_RF = RandomForestClassifier(class_weight='balanced', random_state=0)
clf = GridSearchCV(
    model_RF,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=4,
)
clf.fit(df_train, y)

# Persist the whole fitted search object.
fname = '../cache/rf_multi_clf.pkl'
save_classifier(fname, clf)

In [13]:
print(clf.best_score_)
print(clf.best_params_)

# with my calculated weights
# -1.5031283568737968
# {'max_features': 'log2', 'n_estimators': 700}

# with balanced weights
# -1.4858419408875922
# {'max_features': 'auto', 'n_estimators': 700}

-1.4858419408875922
{'max_features': 'auto', 'n_estimators': 700}


In [14]:
pred_proba = clf.predict_proba(df_test)

In [31]:
# Bound every predicted probability to [0.05, 0.95], presumably to limit the
# log-loss penalty of confident mistakes.
# NOTE(review): rows are not re-normalised after clipping, so they no longer
# necessarily sum to 1 - confirm the scorer rescales them.
PROBA_FLOOR, PROBA_CEIL = 0.05, 0.95
pred_proba = pred_proba.clip(min=PROBA_FLOOR, max=PROBA_CEIL)

In [34]:
print(pred_proba.shape)

(986, 9)


In [35]:
n_classes = np.unique(y)

In [36]:
# One column per class, named class1 ... classN in column order.
class_columns = ['class' + str(idx) for idx in range(1, len(n_classes) + 1)]
df = pd.DataFrame(pred_proba)
df.columns = class_columns

In [37]:
df.shape

(986, 9)

In [38]:
df['ID'] = pid

In [39]:
df.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,ID
0,0.17,0.157143,0.05,0.22,0.064286,0.084286,0.257143,0.05,0.05,1
1,0.414286,0.108571,0.05,0.128571,0.05,0.064286,0.22,0.05,0.05,2
2,0.185714,0.165714,0.05,0.232857,0.05,0.05,0.282857,0.05,0.05,3
3,0.192857,0.134286,0.05,0.237143,0.057143,0.052857,0.268571,0.05,0.05,4
4,0.155714,0.162857,0.05,0.15,0.054286,0.075714,0.371429,0.05,0.05,5


In [40]:
# df.to_csv('../submissions/sub_stage2_mop_rf.csv', index=False)
# scored 2.60615 on stage2 private LB and 1.61225 on stage2 public LB with weights


df.to_csv('../submissions/sub_stage2_wt_rf.csv', index=False)
# 2.29757 on stage2 private LB, 1.61310 on stage2 public LB

In [29]:
# Write the test ids on their own as a single-column CSV (no index column).
df1 = pd.DataFrame({'ID': pid})
df1.to_csv('../cache/test_id.csv', index=False)

In [30]:
# Write the training labels on their own as a single-column CSV (no index column).
df = pd.DataFrame({'y': y})
df.to_csv('../cache/train_stage2_labels.csv', index=False)

In [44]:
# Short aliases: X = training features, Y = training labels, Z = test features.
X, Y, Z = df_train, y, df_test


In [45]:
# NOTE(review): y was printed with shape (3689,), i.e. 1-D, so .T leaves it
# unchanged; Y1 is also reassigned in the following cell before it is used.
Y1 = Y.T

In [46]:
# Build a 9-column target matrix: column 0 is the real label vector and
# columns 1-8 are permutations of it produced with seeds 1..8.
# NOTE(review): only Y is shuffled (X is not), so the shuffled columns are no
# longer aligned with the rows of X - confirm this is intended.
shuffled_targets = [shuffle(Y, random_state=seed) for seed in range(1, 9)]
y2, y3, y4, y5, y6, y7, y8, y9 = shuffled_targets

Y1 = np.vstack([Y] + shuffled_targets).T

In [47]:
# Base forest used for every output of the multi-output wrapper; the per-class
# weight dict (wts_per_class) is passed as class_weight.
# Fixed: the original wrote `max_features:'log2'`, which is a SyntaxError -
# keyword arguments must be passed with `=`.
forest = RandomForestClassifier(
    n_estimators=700,
    max_features='log2',
    random_state=1,
    class_weight=wts_per_class,
)
# MultiOutputClassifier fits one copy of `forest` per target column; n_jobs=-1
# asks it to use all available cores.
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

In [48]:

# Train the multi-output forest on the stacked targets, then predict hard
# labels for the test features.
multi_target_forest.fit(X, Y1)
preds = multi_target_forest.predict(Z)

In [49]:
preds.shape

(986, 9)

In [50]:
print(preds)

[[7 7 7 ..., 7 7 7]
 [1 7 7 ..., 7 7 7]
 [7 7 7 ..., 7 7 7]
 ..., 
 [7 7 7 ..., 7 7 7]
 [4 7 7 ..., 7 7 7]
 [7 7 7 ..., 7 7 7]]


In [51]:
# The wrapper was already fitted on (X, Y1) when `preds` was computed, and the
# forests use a fixed random_state, so calling fit() again here would only
# rebuild the same 9 x 700-tree models.  Reuse the fitted estimator instead.
# predict_proba returns one probability array per target column; index 0 is
# the output trained on the real labels.
preds_proba = multi_target_forest.predict_proba(Z)

In [52]:
preds_proba[0].shape

(986, 9)

In [53]:
preds_proba[0]

array([[ 0.176,  0.137,  0.025, ...,  0.247,  0.015,  0.021],
       [ 0.395,  0.138,  0.01 , ...,  0.204,  0.009,  0.011],
       [ 0.199,  0.153,  0.019, ...,  0.292,  0.021,  0.027],
       ..., 
       [ 0.161,  0.169,  0.018, ...,  0.326,  0.01 ,  0.009],
       [ 0.189,  0.138,  0.019, ...,  0.254,  0.012,  0.022],
       [ 0.193,  0.125,  0.016, ...,  0.302,  0.013,  0.027]])

In [54]:
preds_proba[0]

array([[ 0.176,  0.137,  0.025, ...,  0.247,  0.015,  0.021],
       [ 0.395,  0.138,  0.01 , ...,  0.204,  0.009,  0.011],
       [ 0.199,  0.153,  0.019, ...,  0.292,  0.021,  0.027],
       ..., 
       [ 0.161,  0.169,  0.018, ...,  0.326,  0.01 ,  0.009],
       [ 0.189,  0.138,  0.019, ...,  0.254,  0.012,  0.022],
       [ 0.193,  0.125,  0.016, ...,  0.302,  0.013,  0.027]])

In [55]:
# Only the first output (the one trained on the real labels) goes into the
# submission; its nine columns are named class1 ... class9.
class_names = ['class{}'.format(k) for k in range(1, 10)]
df = pd.DataFrame(preds_proba[0], columns=class_names)


In [56]:
# Attach the test ids and write the submission CSV without the index column.
# The commented-out line is an alternative output path, presumably left over
# from a run without class weights.
df['ID'] = pid
# df.to_csv('../submissions/sub_stage2_multi_rf.csv', index=False)
df.to_csv('../submissions/sub_stage2_multi_rf_wts.csv', index=False)

In [None]:
# scored 2.64206 on stage2 private LB, 1.54953 on stage2 public LB without weights
# scored 2.60869 on stage2 private LB, 1.60590 on stage2 public LB with weights


In [114]:
save_classifier('../cache/stage2_multi_rf.pkl', multi_target_forest)

In [118]:
df.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,ID
0,0.206,0.144,0.033,0.241,0.069,0.082,0.192,0.014,0.019,1
1,0.547,0.113,0.006,0.114,0.015,0.053,0.14,0.006,0.006,2
2,0.161,0.191,0.022,0.233,0.054,0.068,0.241,0.013,0.017,3
3,0.202,0.129,0.021,0.272,0.053,0.059,0.229,0.01,0.025,4
4,0.173,0.245,0.016,0.13,0.051,0.067,0.301,0.009,0.008,5


In [119]:
df.shape

(986, 10)

In [22]:
# Cached split: (x1, y1) is used below to fit clf1, (x2, y2) to evaluate it.
x1, y1 = (
    np.load('../cache/train_stage2_x1.npy'),
    np.load('../cache/train_stage2_y1.npy'),
)
x2, y2 = (
    np.load('../cache/train_stage2_x2.npy'),
    np.load('../cache/train_stage2_y2.npy'),
)

In [23]:
pd.value_counts(y2)

7    211
4    150
1    133
2    100
6     59
5     53
3     19
9      9
8      4
dtype: int64

In [24]:
y2.shape

(738,)

In [25]:
# Harmonic mean (2ab / (a + b), the F1-style combination) of 221/738 and 0.11.
# NOTE(review): the value_counts output above lists 211 samples for class 7,
# not 221 - confirm which numerator is intended.
(2 * (221/738) *0.11)/((221/738) + 0.11) 

0.16089747832417764

In [29]:
# Per-sample weights looked up from the per-class weight dict.
# NOTE(review): w1/w2 are not used by the cells visible below - clf1 relies on
# class_weight='balanced' instead.
w1 = np.array([wts_per_class[label] for label in y1])
w2 = np.array([wts_per_class[label] for label in y2])

# Forest configured with the best settings reported by the grid search.
clf1 = RandomForestClassifier(
    n_estimators=700,
    max_features='auto',
    class_weight='balanced',
    random_state=0,
)



In [30]:
clf1.fit(x1, y1)


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=700, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [32]:
test_preds = clf1.predict_proba(x2)
print(test_preds)

[[ 0.07        0.04285714  0.00428571 ...,  0.07714286  0.01142857
   0.65571429]
 [ 0.05714286  0.35428571  0.00714286 ...,  0.47571429  0.00285714  0.        ]
 [ 0.12285714  0.05142857  0.00428571 ...,  0.11428571  0.00142857
   0.00428571]
 ..., 
 [ 0.20428571  0.04857143  0.00857143 ...,  0.13142857  0.00142857  0.        ]
 [ 0.00428571  0.00142857  0.         ...,  0.00285714  0.          0.        ]
 [ 0.04        0.39285714  0.00285714 ...,  0.48714286  0.          0.        ]]


In [33]:
# Class labels are 1..9; column k of test_preds is treated as class k + 1,
# hence the +1 after argmax.
class_labels = list(range(1, 10))
predicted = test_preds.argmax(axis=1) + 1

score2 = log_loss(y2, test_preds, labels=class_labels)
print('final multi_log_loss: {}'.format(score2))

fscore = f1_score(y2, predicted, labels=class_labels, average='micro')
print('final f1_score: {}'.format(fscore))

acc = accuracy_score(y2, predicted)
print('final accuracy: {}'.format(acc))

print(confusion_matrix(y2, predicted, labels=class_labels))

final multi_log_loss: 0.9154469754860213
final f1_score: 0.6964769647696477
final accuracy: 0.6964769647696477
[[ 86   1   2  25  14   3   2   0   0]
 [  2  51   0   2   0   0  44   1   0]
 [  1   3   8   5   0   0   2   0   0]
 [ 17   0   1 119   1   1  11   0   0]
 [ 10   3   1   7  21   3   8   0   0]
 [  6   0   1   5   1  37   9   0   0]
 [  2   8   7   3   5   0 185   0   1]
 [  1   0   0   0   0   0   3   0   0]
 [  0   0   0   1   0   0   1   0   7]]


In [None]:
# with my weights
# final multi_log_loss: 1.0051730722455978
# final f1_score: 0.7073170731707317
# final accuracy: 0.7073170731707317
# [[ 97   0   1  19   5   1   6   0   0]
#  [  4  43   0   1   0   2  56   0   0]
#  [  0   0   4   6   1   0  13   0   0]
#  [ 12   1   1 114   0   0  13   0   0]
#  [ 16   1   0   4  26   2   5   0   0]
#  [  8   0   0   2   2  34   9   0   0]
#  [  1  13   2   3   2   0 200   0   0]
#  [  0   0   0   0   0   0   2   0   0]
#  [  0   0   0   0   0   0   2   0   4]]

# with 'balanced' weights
# final multi_log_loss: 0.9154469754860213
# final f1_score: 0.6964769647696477
# final accuracy: 0.6964769647696477
# [[ 86   1   2  25  14   3   2   0   0]
#  [  2  51   0   2   0   0  44   1   0]
#  [  1   3   8   5   0   0   2   0   0]
#  [ 17   0   1 119   1   1  11   0   0]
#  [ 10   3   1   7  21   3   8   0   0]
#  [  6   0   1   5   1  37   9   0   0]
#  [  2   8   7   3   5   0 185   0   1]
#  [  1   0   0   0   0   0   3   0   0]
#  [  0   0   0   1   0   0   1   0   7]]