In [52]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather
from sklearn.base import BaseEstimator as be
from sklearn.base import TransformerMixin as tm
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import pickle


In [50]:
def save_classifier(fname, clf):
    # save the classifier
    with open(fname, 'wb') as fid:
        pickle.dump(clf, fid)

def load_classifier(fname):
    # load it again
    with open(fname, 'rb') as fid:
        clf = pickle.load(fid)
        return clf

In [2]:
# Any results you write to the current directory are saved as output.
submission = pd.read_csv('../data/stage2_sample_submission.csv')
stage1_test = pd.read_csv('../data/test_variants')
stage2_test = pd.read_csv('../data/stage2_test_variants.csv')
stage1_solution = pd.read_csv('../data/stage1_solution_filtered.csv')

stage1_solution = stage1_solution.merge(stage1_test, how = 'left', on = 'ID')



In [3]:
df_train = feather.read_dataframe('../cache/train_stage2_fe.feather')

In [4]:
train_ids = df_train.ID.values

In [5]:
df_train = df_train.drop('ID', axis=1)

In [6]:
y = df_train['Class'].values

In [7]:
df_test = feather.read_dataframe('../cache/test_stage2_fe.feather')
pid = df_test.ID
df_test = df_test.drop('ID', axis=1)
# df_test['Class'] = y_hack

In [8]:
df_train.shape

(3689, 3590)

In [9]:
df_train.head()

Unnamed: 0,Gene,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,Gene_0,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,FAM58A,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,5,...,1,39672,6105,3460,20,2,1654,27,3,1
1,CBL,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,2,...,1,36691,5783,3748,5,1,941,9,2,2
2,CBL,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,2,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,2,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,2,...,1,41308,6248,1854,5,1,927,9,2,4


In [10]:
df_test = feather.read_dataframe('../cache/test_stage2_fe.feather')

In [11]:
df_test.shape

(986, 3590)

In [12]:
df_test.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
0,CHEK2,1,The incidence of breast cancer is increasing i...,H371Y,CHEK2 H371Y,1,1,2,5,5,...,5,1,33403,4991,1379,5,1,1080,11,2
1,AXIN2,2,An unselected series of 310 colorectal carcino...,Truncating Mutations,AXIN2 Truncating Mutations,1,1,2,5,20,...,5,1,66400,10348,3460,20,2,286,26,3
2,WNT4,3,Mycosis fungoides and Sézary syndrome are prim...,E216G,WNT4 E216G,0,1,1,4,5,...,4,1,58544,8638,672,5,1,4287,10,2
3,SUCLA2,4,Regulated progression through the cell cycle ...,G118R,SUCLA2 G118R,0,1,1,6,5,...,6,1,42023,6221,1038,5,1,3841,12,2
4,BRAF,5,Pilocytic astrocytoma (PA) is emerging as a tu...,T599insTT,BRAF T599insTT,1,0,1,4,9,...,4,1,22499,3280,3402,9,1,430,14,2


In [13]:
########################################
## process texts in datasets
########################################


# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    my_stopwords = [
        "fig", "figure", "et", "al", "table",
        "data", "analysis", "analyze", "study",
        "method", "result", "conclusion", "author",
        "find", "found", "show", "perform",
        "demonstrate", "evaluate", "discuss"
    ]
    
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = [w for w in text if not w in my_stopwords]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [14]:
# https://www.kaggle.com/the1owl/redefining-treatment-0-57456
class cust_regression_vals(be, tm):
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        x = x.drop(['Gene', 'Variation', 'ID','Text', 'GeneVar'],axis=1).values
        return x

class cust_txt_col(be, tm):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x[self.key].apply(str)


In [15]:
# from https://www.kaggle.com/the1owl/redefining-treatment-0-57456
fp = Pipeline([
    ('union', FeatureUnion(
        n_jobs = -1,
        transformer_list = [
            ('standard', cust_regression_vals()),
            ('pi1', Pipeline([('Gene', cust_txt_col('Gene')), 
                                       ('count_Gene', CountVectorizer(analyzer=u'char',ngram_range=(1, 3))), 
                                       ('tsvd1', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
            ('pi2', Pipeline([('Variation', cust_txt_col('Variation')), 
                                       ('count_Variation', CountVectorizer(analyzer=u'char',ngram_range=(1, 3))), 
                                       ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
            ('pi3', Pipeline([('GeneVar', cust_txt_col('GeneVar')), 
                                       ('count_GeneVar', CountVectorizer(analyzer=u'char', ngram_range=(1, 3))), 
                                       ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, 
                                                                            random_state=12))])),
            ('pi4',Pipeline([('Text', cust_txt_col('Text')), 
                                       ('hv', HashingVectorizer(decode_error='ignore', 
                                                                                        n_features=2 ** 16,
                                                                                        non_negative=True, 
                                                                                        ngram_range=(1, 5))),
                                       ('tfidf_Text', TfidfTransformer()), 
                                       ('tsvd3', TruncatedSVD(n_components=300, n_iter=25, 
                                                                            random_state=12))]))

        
        ])
    )])


In [16]:
df_train['Text'] = [text_to_wordlist(w) for w in df_train['Text'].values]

In [17]:
df_test['Text'] = [text_to_wordlist(w) for w in df_test['Text'].values]

In [18]:
df_train.head()

Unnamed: 0,Gene,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,Gene_0,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,FAM58A,cyclin - depend kinas cdks regul varieti funda...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,5,...,1,39672,6105,3460,20,2,1654,27,3,1
1,CBL,abstract background non - small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,2,...,1,36691,5783,3748,5,1,941,9,2,2
2,CBL,abstract background non - small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,2,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,recent evid demonstr acquir uniparent disomi a...,N454D,CBL N454D,1,1,2,3,5,2,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,oncogen mutat monomer casita b - lineag lympho...,L399V,CBL L399V,1,1,2,3,5,2,...,1,41308,6248,1854,5,1,927,9,2,4


In [19]:
y = df_train['Class'].values

In [20]:
df_train['ID'] = [i for i in range(df_train.shape[0])]

In [21]:
tr = df_train
te = df_test

In [22]:
# y = tr['Class'].values

In [23]:
set(te.columns) - set(tr.columns)

set()

In [24]:
df_train = df_train.drop('Class',axis=1)

In [25]:
# df_train = df_train.drop('ID', axis=1)
# df_test = df_test.drop('ID', axis=1)

In [26]:
# df_train = tr

In [27]:
df_train.head()

Unnamed: 0,Gene,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,Gene_0,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,ID
0,FAM58A,cyclin - depend kinas cdks regul varieti funda...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,5,...,1,39672,6105,3460,20,2,1654,27,3,0
1,CBL,abstract background non - small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,2,...,1,36691,5783,3748,5,1,941,9,2,1
2,CBL,abstract background non - small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,2,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,recent evid demonstr acquir uniparent disomi a...,N454D,CBL N454D,1,1,2,3,5,2,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,oncogen mutat monomer casita b - lineag lympho...,L399V,CBL L399V,1,1,2,3,5,2,...,1,41308,6248,1854,5,1,927,9,2,4


In [28]:
df_train.shape

(3689, 3590)

In [29]:
df_train = fp.fit_transform(df_train)
print (df_train.shape)




(3689, 3945)


In [30]:
df_test = fp.fit_transform(df_test)
print (df_test.shape)



(986, 3945)


In [31]:
y = y - 1 #fix for zero bound array

In [32]:
print(y.shape)

(3689,)


In [33]:
np.unique(y)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [34]:
xgb_model = xgb.XGBClassifier(objective='multi:softprob')
xgbParams = xgb_model.get_xgb_params()
xgbParams['num_class'] = 9
parameters = {
              'learning_rate': [0.02, 0.01, 0.03], #so called `eta` value
              'max_depth': [5,6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'missing':[-999],
              'seed': [1337]}


clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=5,
                   scoring = 'neg_log_loss',
                   verbose=2, refit=True)

clf.fit(df_train, y)

# denom = 0
# fold = 10 
# for i in range(fold):
#     params = {
#         'eta': 0.02,
#         'max_depth': 5,
#         'objective': 'multi:softprob',
#         'eval_metric': 'mlogloss',
#         'num_class': 9,
#         'seed': i,
#         'silent': True
#     }
#     x1, x2, y1, y2 = train_test_split(df_train, y, test_size=0.2, random_state=i)
#     watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
#     model = xgb.train(params, xgb.DMatrix(x1, y1), 1000,  watchlist, verbose_eval=50, early_stopping_rounds=100)
#     score1 = log_loss(y2, model.predict(xgb.DMatrix(x2), 
#                                                 ntree_limit=model.best_ntree_limit), labels = list(range(9)))
#     print(score1)
#     #if score < 0.9:
#     if denom != 0:
#         pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
#         preds += pred
#     else:
#         pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
#         preds = pred.copy()
#     denom += 1
#     submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
#     submission['ID'] = pid
#     submission.to_csv('../submissions/submission2_3_xgb_fold_'  + str(i) + '.csv', index=False)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] colsample_bytree=0.7, learning_rate=0.02, max_depth=5, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.02, max_depth=5, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.02, max_depth=5, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.02, max_depth=5, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.02, max_depth=5, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.02, max_depth=5, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 - 3.9min
[CV] colsample_bytree=0.7, learning_rate=0.02, max_depth=6, min_child_weight=11

[CV]  colsample_bytree=0.7, learning_rate=0.03, max_depth=6, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 - 4.7min
[CV]  colsample_bytree=0.7, learning_rate=0.03, max_depth=6, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 - 4.7min
[CV]  colsample_bytree=0.7, learning_rate=0.03, max_depth=6, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 - 4.6min
[CV]  colsample_bytree=0.7, learning_rate=0.03, max_depth=6, min_child_weight=11, missing=-999, nthread=4, seed=1337, silent=1, subsample=0.8 - 4.6min


[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed: 26.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'nthread': [4], 'learning_rate': [0.02, 0.01, 0.03], 'max_depth': [5, 6], 'min_child_weight': [11], 'silent': [1], 'subsample': [0.8], 'colsample_bytree': [0.7], 'missing': [-999], 'seed': [1337]},
       pre_dispatch='2*n_jobs', refit=True, scoring='neg_log_loss',
       verbose=2)

In [35]:
print(clf.best_score_)
print(clf.best_params_)

-1.4032872189073649
{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 11, 'missing': -999, 'nthread': 4, 'seed': 1337, 'silent': 1, 'subsample': 0.8}


In [36]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
# print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))



colsample_bytree: 0.7
learning_rate: 0.03
max_depth: 6
min_child_weight: 11
missing: -999
nthread: 4
seed: 1337
silent: 1
subsample: 0.8


In [46]:
test_probs = clf.predict_proba(df_test)

In [47]:
test_probs.shape

(986, 9)

In [48]:
print(test_probs)

[[ 0.268085    0.06699839  0.06246453 ...,  0.07700725  0.04654521
   0.04602181]
 [ 0.71335578  0.04182271  0.02163256 ...,  0.04632065  0.02227977
   0.0228263 ]
 [ 0.08334687  0.1219355   0.046665   ...,  0.25016487  0.0468474
   0.04011207]
 ..., 
 [ 0.13972722  0.17756784  0.09964821 ...,  0.13103653  0.04473274
   0.0537799 ]
 [ 0.08828997  0.10845774  0.0482584  ...,  0.18114112  0.04234995
   0.04439981]
 [ 0.19453251  0.18151     0.04000674 ...,  0.17013605  0.04449223
   0.09882276]]


In [49]:
submission = pd.DataFrame(test_probs, columns=['class'+str(c+1) for c in range(9)])
submission['ID'] = pid
submission.to_csv('../submissions/subm_xgb_stage2_with_gridcv.csv', index=False)


In [None]:
# scored 1.70423 on the public leaderboard, 2.40243 on the private leaderboard

In [53]:
save_classifier('../cache/xgb_clf.pkl', clf)