In [1]:
import numpy as np
import pandas as pd

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
  
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV

from sklearn.grid_search import GridSearchCV

from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB

from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_raises
  
from sklearn.utils.testing import assert_greater
import pickle

from sklearn.metrics import log_loss, confusion_matrix, f1_score, accuracy_score

from utils import multi_log_loss



In [2]:
# Stage-2 feature matrices (train / test), cached as .npy arrays.
df_train = np.load('../cache/train_stage2_fe2.npy')
df_test = np.load('../cache/test_stage2_fe2.npy')

# Training labels: the 'y' column of the labels CSV.
y = pd.read_csv('../cache/stage2_labels.csv')['y'].values

# Test-set identifiers; `df` is left bound to this frame.
df = pd.read_csv('../cache/stage2_test_id.csv')
pid = df['ID'].values

In [3]:

# Load the stage-2 train/validation split used for CV.
x1 = np.load('../cache/train_stage2_x1.npy')
x2 = np.load('../cache/train_stage2_x2.npy')
y1 = np.load('../cache/train_stage2_y1.npy')
y2 = np.load('../cache/train_stage2_y2.npy')

# Load the per-class weights.
# Each weight file stores a pickled dict {class label: weight} (see the printed
# output), i.e. an object array. Recent NumPy refuses to unpickle object arrays
# unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code - only load cache files that were
# produced locally by this project.
wts_per_class = np.load('../cache/stage2_train_weights_per_class.npy',
                        allow_pickle=True).tolist()
print(wts_per_class)

# Per-class weight dicts for the x1 / x2 splits, and a per-sample weight
# vector for the full training set (one weight per entry of y).
w1 = np.load('../cache/stage2_x1_weights_per_class.npy', allow_pickle=True).tolist()
w2 = np.load('../cache/stage2_x2_weights_per_class.npy', allow_pickle=True).tolist()
w = np.array([wts_per_class[j] for j in y])

print('\n')
print(w1)
print('\n')
print(w2)

{1: 4.572507552870091, 2: 6.407630522088353, 3: 37.427083333333336, 4: 3.912117177097204, 5: 12.816479400749063, 6: 11.42087542087542, 7: 2.5, 8: 174.66666666666666, 9: 84.79069767441861}


{1: 4.578449905482041, 2: 6.414572864321608, 3: 37.324675324675326, 4: 3.9101497504159735, 5: 12.789719626168225, 6: 11.399159663865547, 7: 2.5005931198102016, 8: 172.58823529411765, 9: 85.79411764705883}


{1: 4.548872180451128, 2: 6.38, 3: 37.8421052631579, 4: 3.92, 5: 12.924528301886792, 6: 11.508474576271187, 7: 2.4976303317535544, 8: 183.5, 9: 81.0}


In [35]:
# y3 is y1 shifted down by one; it is the target later passed to ovo.fit(x1, y3).
y3 = y1 -1

In [39]:
# Re-key the x1 per-class weights onto the shifted labels:
# w1_prime[k] == w1[k + 1] for every label k that occurs in y3.
w1_prime = {shifted: w1[shifted + 1] for shifted in y3}

In [6]:
# Dimensions of the full stage-2 training matrix.
df_train.shape

(3689, 4689)

In [23]:
# Class distribution of the full training labels.
# The top-level pd.value_counts() is deprecated in recent pandas; the Series
# method returns the same counts, sorted by frequency.
pd.Series(y).value_counts()

7    1054
4     751
1     662
2     498
6     297
5     267
3      96
9      43
8      21
dtype: int64

In [24]:
# Harmonic-mean form 2ab/(a+b) with a = 1054/3689 (class-7 count over the
# 3689 training rows) and b = 0.11.
# NOTE(review): b = 0.11 looks like roughly 1/9 (nine classes) - confirm what it represents.
2 * (1054/3689) * 0.11/ ((1054/3689) + 0.11)

0.15884476534296027

In [26]:
# Number of labels in the x1 (training) part of the CV split.
y1.shape

(2951,)

In [25]:
# Class distribution of the x1 split labels.
# The top-level pd.value_counts() is deprecated in recent pandas; the Series
# method returns the same counts, sorted by frequency.
pd.Series(y1).value_counts()

7    843
4    601
1    529
2    398
6    238
5    214
3     77
9     34
8     17
dtype: int64

In [27]:
# Harmonic-mean form 2ab/(a+b) with a = 843/2951 (class-7 count over the
# 2951 labels in y1) and b = 0.11.
# NOTE(review): b = 0.11 looks like roughly 1/9 (nine classes) - confirm what it represents.
2 * (843/2951) * 0.11/ ((843/2951) + 0.11)

0.1588372829968911

In [29]:
# Number of labels in the x2 (hold-out) part of the CV split.
y2.shape

(738,)

In [28]:
# Class distribution of the x2 split labels.
# The top-level pd.value_counts() is deprecated in recent pandas; the Series
# method returns the same counts, sorted by frequency.
pd.Series(y2).value_counts()

7    211
4    150
1    133
2    100
6     59
5     53
3     19
9      9
8      4
dtype: int64

In [30]:
# Harmonic-mean form 2ab/(a+b) with a = 211/738 (class-7 count over the
# 738 labels in y2) and b = 0.11.
# NOTE(review): b = 0.11 looks like roughly 1/9 (nine classes) - confirm what it represents.
2 * (211/738) * 0.11/ ((211/738) + 0.11)

0.1588746663015949

In [9]:
# Dimensions of the stage-2 test matrix.
df_test.shape

(986, 4689)

In [11]:
# Grid-search the SVC penalty C inside a one-vs-one wrapper on the full
# training set. The 'estimator__C' key routes the value to the wrapped SVC.
ovo = OneVsOneClassifier(svm.SVC(probability=True))
Cs = [0.01, 0.02, 0.05, 0.09]
cv = GridSearchCV(ovo, {'estimator__C': Cs})
cv.fit(df_train, y)
# Read the selected C back from the first fitted pairwise SVC of the best estimator.
best_C = cv.best_estimator_.estimators_[0].C
# Sanity check with a plain assert: sklearn.utils.testing.assert_true is a
# private test helper that later scikit-learn releases no longer ship.
assert best_C in Cs


In [12]:
# Show the C value selected by the grid search.
print(best_C)

0.01


In [42]:
# Calibrated one-vs-one ensemble of class-balanced SVCs at C=0.01
# (the value printed by the grid search); calibration uses the
# CalibratedClassifierCV defaults.
ovo = CalibratedClassifierCV(
    OneVsOneClassifier(
        svm.SVC(C=0.01, class_weight='balanced', probability=True)
    )
)

In [43]:
# Fit on the x1 split using the shifted labels y3 (= y1 - 1).
ovo.fit(x1,y3)

CalibratedClassifierCV(base_estimator=OneVsOneClassifier(estimator=SVC(C=0.01, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1),
            cv=3, method='sigmoid')

In [44]:
# Class-probability predictions for the held-out x2 split.
test_preds = ovo.predict_proba(x2)

In [45]:
# Hold-out evaluation on the x2 split.
# Column k of test_preds is mapped to label k + 1 for the hard predictions.
class_labels = list(range(1, 10))
pred_labels = test_preds.argmax(axis=1) + 1

score2 = log_loss(y2, test_preds, labels=range(1, 10))
print('stage2 CV multi_log_loss: {}'.format(score2))

fscore = f1_score(y2, pred_labels, labels=class_labels, average='micro')
print('stage2 CV f1_score: {}'.format(fscore))

acc = accuracy_score(y2, pred_labels)
print('stage2 CV accuracy: {}'.format(acc))

print(confusion_matrix(y2, pred_labels, labels=class_labels))

stage1 partial multi_log_loss: 1.8288854205271148
stage1 partial f1_score: 0.2859078590785908
stage1 partial accuracy: 0.2859078590785908
[[  0   0   0   0   0   0 133   0   0]
 [  0   0   0   0   0   0 100   0   0]
 [  0   0   0   0   0   0  19   0   0]
 [  0   0   0   0   0   0 150   0   0]
 [  0   0   0   0   0   0  53   0   0]
 [  0   0   0   0   0   0  59   0   0]
 [  0   0   0   0   0   0 211   0   0]
 [  0   0   0   0   0   0   4   0   0]
 [  0   0   0   0   0   0   9   0   0]]


In [None]:
# now prepare for submission

In [46]:
# Fresh calibrated one-vs-one ensemble (class-balanced SVC, C=0.01) for the
# submission model; same configuration as the one evaluated on the split.
ovo = CalibratedClassifierCV(
    OneVsOneClassifier(
        svm.SVC(C=0.01, class_weight='balanced', probability=True)
    )
)

In [47]:
# Fit on the full training matrix with the original labels y.
ovo.fit(df_train, y)

CalibratedClassifierCV(base_estimator=OneVsOneClassifier(estimator=SVC(C=0.01, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1),
            cv=3, method='sigmoid')

In [48]:
# Class-probability predictions for the test matrix.
test_preds = ovo.predict_proba(df_test)

In [49]:
# Bound every predicted probability to the interval [0.05, 0.95].
# NOTE(review): rows are not re-normalised after clipping - confirm the scorer tolerates that.
test_preds = np.clip(test_preds, 0.05, 0.95)

In [50]:
# (Re)load the test identifiers; `df` is rebound to the test-ID frame here.
df = pd.read_csv('../cache/stage2_test_id.csv')
pid = df['ID'].values

In [51]:
# Assemble the submission: nine probability columns class1..class9 plus the test IDs.
class_columns = ['class{}'.format(k) for k in range(1, 10)]
submission = pd.DataFrame(test_preds, columns=class_columns)
submission['ID'] = pid
submission.to_csv('../submissions/sub_ovo.csv', index=False)
# 2.31770 on stage2 private LB, 1.91969 on stage2 public LB


In [85]:
# Distinct class labels in the training set.
# Use the label array `y` directly: `df` is rebound to the test-ID frame by
# the loading cells, so indexing df['y'] here only worked with stale kernel state.
np.unique(y)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [53]:
# One-vs-rest over calibrated, class-balanced linear SVMs, trained on the
# full training matrix.
clf = OneVsRestClassifier(
    CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
)
clf = clf.fit(df_train, y)

In [54]:
# Class-probability predictions of the one-vs-rest model for the test matrix.
test_preds = clf.predict_proba(df_test)

In [55]:
# Eyeball the predicted probabilities (numpy abbreviates large arrays when printing).
print(test_preds)

[[ 0.17062396  0.12593915  0.03510122 ...,  0.24114183  0.00508214
   0.01081849]
 [ 0.18580295  0.12297066  0.02032834 ...,  0.28548666  0.00560611
   0.01183235]
 [ 0.18561762  0.15604937  0.02398364 ...,  0.29236309  0.00617912
   0.01146768]
 ..., 
 [ 0.14385532  0.1336551   0.02099906 ...,  0.30487442  0.00554094
   0.01071901]
 [ 0.21522322  0.14351413  0.0331304  ...,  0.27870575  0.00533352
   0.00924263]
 [ 0.14311539  0.13092741  0.02286252 ...,  0.3078373   0.0052362
   0.00966382]]


In [56]:
# Bound every predicted probability to the interval [0.05, 0.95].
# NOTE(review): rows are not re-normalised after clipping - confirm the scorer tolerates that.
test_preds = np.clip(test_preds, 0.05, 0.95)

In [57]:
# Assemble the submission: nine probability columns class1..class9 plus the test IDs.
class_columns = ['class{}'.format(k) for k in range(1, 10)]
submission = pd.DataFrame(test_preds, columns=class_columns)
submission['ID'] = pid
submission.to_csv('../submissions/sub_ovr.csv', index=False)
# 2.31000 on stage2 private LB, 1.89228 on stage2 public LB

In [58]:
# Same one-vs-rest / calibrated LinearSVC model, now trained on the x1 split
# only so that it can be scored on x2.
clf = OneVsRestClassifier(
    CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
)
clf = clf.fit(x1, y1)

In [59]:
# Class-probability predictions of the one-vs-rest model for the held-out x2 split.
test_preds = clf.predict_proba(x2)

In [60]:
# Apply the same [0.05, 0.95] bounds that are used for the submission files
# before scoring the hold-out predictions.
test_preds = np.clip(test_preds, 0.05, 0.95)

In [61]:
# Hold-out evaluation of the one-vs-rest model on the x2 split.
# Column k of test_preds is mapped to label k + 1 for the hard predictions.
class_labels = list(range(1, 10))
pred_labels = test_preds.argmax(axis=1) + 1

score2 = log_loss(y2, test_preds, labels=class_labels)
print('stage2 CV multi_log_loss: {}'.format(score2))

fscore = f1_score(y2, pred_labels, labels=class_labels, average='micro')
print('stage2 CV f1_score: {}'.format(fscore))

acc = accuracy_score(y2, pred_labels)
print('stage2 CV accuracy: {}'.format(acc))

print(confusion_matrix(y2, pred_labels, labels=class_labels))

stage2 CV multi_log_loss: 1.8493883574649366
stage2 CV f1_score: 0.2967479674796748
stage2 CV accuracy: 0.2967479674796748
[[  0   0   0  12   0   0 121   0   0]
 [  0   0   0   5   0   0  95   0   0]
 [  0   0   0   1   0   0  18   0   0]
 [  0   0   0  18   0   0 132   0   0]
 [  0   0   0   2   0   0  51   0   0]
 [  0   0   0   1   0   0  58   0   0]
 [  0   0   0  10   0   0 201   0   0]
 [  0   0   0   0   0   0   4   0   0]
 [  0   0   0   0   0   0   9   0   0]]


In [6]:
# Alias for the full training labels, used as the target of the naive Bayes fit below.
Y = y

In [7]:
# Multinomial naive Bayes on the element-wise absolute value of the features,
# trained on the full training matrix and applied to the raw test matrix.
nb = MultinomialNB()
nb.fit(np.abs(df_train), Y)
Y_pred = nb.predict_proba(df_test)

In [8]:
# Bound every predicted probability to the interval [0.05, 0.95].
# NOTE(review): rows are not re-normalised after clipping - confirm the scorer tolerates that.
Y_pred = np.clip(Y_pred, 0.05, 0.95)

In [9]:
# Assemble the naive Bayes submission: class1..class9 probabilities plus the test IDs.
class_columns = ['class{}'.format(k) for k in range(1, 10)]
submission = pd.DataFrame(Y_pred, columns=class_columns)
submission['ID'] = pid
submission.to_csv('../submissions/sub_stage2_nb.csv', index=False)

In [None]:
# now CV scores

In [11]:
# Refit the same naive Bayes object on |x1| and predict class probabilities
# for the raw x2 hold-out split.
nb.fit(np.abs(x1), y1)
Y_pred = nb.predict_proba(x2)

In [12]:
# Hold-out evaluation of the naive Bayes model on the x2 split.
# Y_pred is scored as produced by predict_proba (no clipping in this cell).
# Column k of Y_pred is mapped to label k + 1 for the hard predictions.
class_labels = list(range(1, 10))
pred_labels = Y_pred.argmax(axis=1) + 1

score2 = log_loss(y2, Y_pred, labels=range(1, 10))
print('stage2 NB CV multi_log_loss: {}'.format(score2))

fscore = f1_score(y2, pred_labels, labels=class_labels, average='micro')
print('stage2 NB CV  f1_score: {}'.format(fscore))

acc = accuracy_score(y2, pred_labels)
print('stage2 NB CV accuracy: {}'.format(acc))

print(confusion_matrix(y2, pred_labels, labels=class_labels))

stage2 CV multi_log_loss: 29.930365161230498
stage2 CV f1_score: 0.13008130081300814
stage2 CV accuracy: 0.13008130081300814
[[ 9  2 37 10 15  6  3 43  8]
 [ 1  7 31  3  7 11  9 28  3]
 [ 1  0 12  0  1  0  1  4  0]
 [ 8  0 64 11  6  5  2 38 16]
 [ 4  1 17  2 10  1  2 13  3]
 [ 2  4 12  2  5 14  3 14  3]
 [ 9  9 53  5  7 17 28 81  2]
 [ 1  0  1  0  0  0  0  2  0]
 [ 1  0  3  0  0  0  0  2  3]]
