In [1]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather
from sklearn.base import BaseEstimator as be
from sklearn.base import TransformerMixin as tm
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.utils import shuffle

import pickle



In [2]:
def save_classifier(fname, clf):
    """Pickle ``clf`` into the file ``fname``.

    Args:
        fname: Path of the output file. It is opened in binary write mode,
            so any existing file at that path is overwritten.
        clf: The object to serialize (in this notebook, fitted estimators).
    """
    with open(fname, 'wb') as out_file:
        pickle.dump(clf, out_file)

def load_classifier(fname):
    """Read the file ``fname`` and return the object pickled inside it.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    from a trusted source (e.g. ones written by ``save_classifier``).

    Args:
        fname: Path of a pickle file; opened in binary read mode.

    Returns:
        The unpickled object.
    """
    with open(fname, 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored

In [4]:
# Training feature matrix cached as a .npy file. Despite the df_ prefix this is
# whatever np.load returns for that file (an array, not a DataFrame); a later
# cell reports its shape as (3321, 3639).
df_train = np.load('../cache/train_stage1_fe2.npy')

In [5]:
# Test feature matrix, loaded from its .npy cache the same way as df_train.
df_test = np.load('../cache/test_stage1_fe2.npy')

In [6]:
# Label table for the training rows; its 'y' column is used as the target.
# NOTE: the name `df` is rebound further down to hold the submission frames.
df = pd.read_csv('../cache/stage1_labels.csv')

In [9]:
# Target vector as a NumPy array of class labels taken from the 'y' column.
y = df['y'].values

In [10]:
# IDs of the test rows; they become the 'ID' column of each submission frame.
df1 = pd.read_csv('../cache/stage1_test_id.csv')
test_id = df1['ID'].values

In [42]:
# Sanity check: number of test IDs.
len(test_id)

5668

In [11]:
# Sanity check: shape of the target vector.
print(y.shape)

(3321,)


In [12]:
# Distinct class labels present in the target vector.
np.unique(y)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [13]:

# Hyper-parameter grid searched independently for every one-vs-rest forest.
# NOTE(review): for RandomForestClassifier, 'auto' is documented as equivalent
# to 'sqrt', so this grid likely evaluates the same setting twice -- confirm
# against the installed scikit-learn version and drop one of them.
param_grid = {
    'n_estimators': [50, 200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Train one binary (one-vs-rest) random forest per class and pickle each fitted
# grid search so later cells can reload it without retraining.
# The class labels are derived from the data rather than hard-coded, which
# keeps this loop consistent with the prediction loop that iterates over
# np.unique(y).
for cls in np.unique(y):
    # 1 where the sample belongs to `cls`, 0 otherwise (vectorised comparison).
    y_this_class = (y == cls).astype(int)
    model_RF = RandomForestClassifier(random_state=0)
    clf = GridSearchCV(model_RF, param_grid=param_grid, cv=5, n_jobs=4)
    clf.fit(df_train, y_this_class)
    fname = '../cache/stage1_rf_cls_' + str(cls) + '.pkl'
    save_classifier(fname, clf)

In [14]:
# NOTE: despite the name, this is the array of distinct class labels, not a
# count; the prediction loop iterates over it.
n_classes = np.unique(y)

In [15]:
# Reload every cached one-vs-rest grid search, print its best cross-validation
# score and parameters, and keep the test-set probability of the positive
# label as that class's submission column.
# NOTE: `df` is rebound here and no longer refers to the label table.
columns_by_name = {}
for cls in n_classes:
    fname = '../cache/stage1_rf_cls_{}.pkl'.format(cls)
    clf = load_classifier(fname)
    print(clf.best_score_)
    print(clf.best_params_)
    # Column index 1 of predict_proba is selected, as in the original cell;
    # the binary targets were encoded as 0/1 when these models were trained.
    test_probs = clf.predict_proba(df_test)[:, 1]
    # Labels are 0-based while submission columns are named class1..class9.
    columns_by_name['class{}'.format(cls + 1)] = test_probs
df = pd.DataFrame(columns_by_name)

0.7961457392351702
{'max_features': 'log2', 'n_estimators': 200}
0.8551641071966275
{'max_features': 'log2', 'n_estimators': 50}
0.9433905450165613
{'max_features': 'log2', 'n_estimators': 50}
0.7500752785305631
{'max_features': 'log2', 'n_estimators': 50}
0.8798554652213189
{'max_features': 'log2', 'n_estimators': 200}
0.9337548931044866
{'max_features': 'log2', 'n_estimators': 50}
0.6967780788919
{'max_features': 'log2', 'n_estimators': 50}
0.9948810599217103
{'max_features': 'log2', 'n_estimators': 50}
0.9894610057211684
{'max_features': 'auto', 'n_estimators': 200}


In [16]:
# Attach the test IDs as the submission's ID column.
# Assumes test_id is in the same row order as df_test -- TODO confirm.
df['ID'] = test_id

In [17]:
# Preview the first rows of the one-vs-rest submission frame.
df.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,ID
0,0.105,0.18,0.0,0.1,0.03,0.06,0.32,0.02,0.0,0
1,0.155,0.18,0.06,0.16,0.09,0.1,0.34,0.0,0.005,1
2,0.12,0.18,0.0,0.1,0.035,0.04,0.26,0.02,0.005,2
3,0.075,0.14,0.0,0.12,0.045,0.04,0.18,0.0,0.005,3
4,0.14,0.18,0.02,0.4,0.055,0.06,0.32,0.0,0.005,4


In [29]:
# Write the one-vs-rest random-forest submission, omitting the DataFrame index.
df.to_csv('../submissions/submission_stage1_rf.csv', index=False)


In [None]:
# scored 2.18400 on stage2 public leaderboard, and 8.10579 on private leaderboard


In [43]:
# Shape of the training feature matrix.
df_train.shape

(3321, 3639)

In [19]:
# Short aliases for the multi-output experiment: X = training features,
# Y = training labels, Z = test features. These are plain name bindings to the
# same objects, not copies.
X = df_train
Y = y
Z = df_test


In [34]:
# Shape of the training features.
X.shape

(3321, 3639)

In [35]:
# Shape of the training labels.
Y.shape

(3321,)

In [36]:
# Shape of the test features.
Z.shape

(5668, 3639)

In [20]:
# Build the 9-column target matrix for the multi-output classifier.
# Column 0 is the true label vector Y; the other eight columns are shuffled
# copies of Y produced with seeds 1 through 8.
y2, y3, y4, y5, y6, y7, y8, y9 = [
    shuffle(Y, random_state=seed) for seed in range(1, 9)
]

# Stack the nine vectors as rows, then transpose so each becomes a column.
target_rows = (Y, y2, y3, y4, y5, y6, y7, y8, y9)
Y1 = np.vstack(target_rows).T

In [21]:
# Base estimator: a 1000-tree random forest with a fixed seed.
forest = RandomForestClassifier(n_estimators=1000, random_state=1)
# Wrap it in MultiOutputClassifier so it can be fitted on the multi-column
# target matrix; n_jobs=-1 asks for all available cores.
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

In [22]:

# Fit on the 9-column target matrix Y1 and predict hard labels for the test set.
preds = multi_target_forest.fit(X, Y1).predict(Z)

In [23]:
# One predicted label per test row and per target column.
preds.shape

(5668, 9)

In [24]:
# Eyeball the predicted labels (NumPy abbreviates the printed array).
print(preds)

[[6 6 6 ..., 6 6 6]
 [6 6 6 ..., 6 6 6]
 [6 6 6 ..., 6 6 6]
 ..., 
 [6 6 6 ..., 6 6 6]
 [6 6 6 ..., 6 6 6]
 [6 6 6 ..., 6 6 6]]


In [25]:
# Reuse the forests already fitted in the hard-label prediction cell above
# instead of refitting. Every forest is seeded with random_state=1, so a second
# fit(X, Y1) would rebuild the same 9 x 1000 trees only to arrive at the same
# model. Requires that earlier cell (multi_target_forest.fit(X, Y1)) to have
# been run first.
preds_proba = multi_target_forest.predict_proba(Z)

In [45]:
# preds_proba is indexed per target column; inspect the entry for the first one.
preds_proba[0].shape

(5668, 9)

In [48]:
# Keep only the probabilities for the first target column. Column 0 of Y1 is
# the true label vector, so preds_proba[0] is the matrix used for submission.
# NOTE: `df` is rebound again here.
class_columns = ['class{}'.format(i) for i in range(1, 10)]
df = pd.DataFrame(preds_proba[0], columns=class_columns)


In [49]:
# Sanity check: one row per test sample, one column per class.
df.shape

(5668, 9)

In [50]:
# Attach the test IDs and write the multi-output forest submission, omitting
# the DataFrame index.
df['ID'] = test_id
df.to_csv('../submissions/sub_stage1_multi_rf.csv', index=False)

In [51]:
# Preview the first rows of the multi-output submission frame.
df.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,ID
0,0.153,0.201,0.024,0.159,0.073,0.076,0.296,0.011,0.007,0
1,0.151,0.167,0.023,0.231,0.051,0.066,0.286,0.01,0.015,1
2,0.106,0.2,0.034,0.152,0.062,0.045,0.384,0.008,0.009,2
3,0.116,0.179,0.017,0.188,0.057,0.053,0.37,0.011,0.009,3
4,0.173,0.142,0.026,0.273,0.064,0.065,0.241,0.008,0.008,4


In [52]:
# Pickle the fitted multi-output forest so it can be reloaded later.
save_classifier('../cache/stage1_multi_rf.pkl', multi_target_forest)