In [2]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather
from sklearn.base import BaseEstimator as be
from sklearn.base import TransformerMixin as tm
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [3]:
df_train = feather.read_dataframe('../cache/train_stage2_fe.feather')

In [4]:
df_train.shape

(3689, 3591)

In [6]:
df_train.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,1,39672,6105,3460,20,2,1654,27,3,1
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,1,36691,5783,3748,5,1,941,9,2,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,1,41308,6248,1854,5,1,927,9,2,4


In [7]:
df_test = feather.read_dataframe('../cache/test_stage2_fe.feather')

In [8]:
df_test.shape

(986, 3590)

In [9]:
df_test.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
0,CHEK2,1,The incidence of breast cancer is increasing i...,H371Y,CHEK2 H371Y,1,1,2,5,5,...,5,1,33403,4991,1379,5,1,1080,11,2
1,AXIN2,2,An unselected series of 310 colorectal carcino...,Truncating Mutations,AXIN2 Truncating Mutations,1,1,2,5,20,...,5,1,66400,10348,3460,20,2,286,26,3
2,WNT4,3,Mycosis fungoides and Sézary syndrome are prim...,E216G,WNT4 E216G,0,1,1,4,5,...,4,1,58544,8638,672,5,1,4287,10,2
3,SUCLA2,4,Regulated progression through the cell cycle ...,G118R,SUCLA2 G118R,0,1,1,6,5,...,6,1,42023,6221,1038,5,1,3841,12,2
4,BRAF,5,Pilocytic astrocytoma (PA) is emerging as a tu...,T599insTT,BRAF T599insTT,1,0,1,4,9,...,4,1,22499,3280,3402,9,1,430,14,2


In [10]:
pid = df_test.ID

In [11]:
########################################
## process texts in datasets
########################################


# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    my_stopwords = [
        "fig", "figure", "et", "al", "table",
        "data", "analysis", "analyze", "study",
        "method", "result", "conclusion", "author",
        "find", "found", "show", "perform",
        "demonstrate", "evaluate", "discuss"
    ]
    
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = [w for w in text if not w in my_stopwords]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [12]:
class cust_regression_vals(be, tm):
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        x = x.drop(['Gene', 'Variation', 'ID','Text', 'GeneVar'],axis=1).values
        return x

class cust_txt_col(be, tm):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x[self.key].apply(str)


In [13]:

fp = Pipeline([
    ('union', FeatureUnion(
        n_jobs = -1,
        transformer_list = [
            ('standard', cust_regression_vals()),
            ('pi1', Pipeline([('Gene', cust_txt_col('Gene')), 
                                       ('count_Gene', CountVectorizer(analyzer=u'char',ngram_range=(1, 3))), 
                                       ('tsvd1', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
            ('pi2', Pipeline([('Variation', cust_txt_col('Variation')), 
                                       ('count_Variation', CountVectorizer(analyzer=u'char',ngram_range=(1, 3))), 
                                       ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
            ('pi3', Pipeline([('GeneVar', cust_txt_col('GeneVar')), 
                                       ('count_GeneVar', CountVectorizer(analyzer=u'char', ngram_range=(1, 3))), 
                                       ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, 
                                                                            random_state=12))])),
            ('pi4',Pipeline([('Text', cust_txt_col('Text')), 
                                       ('hv', HashingVectorizer(decode_error='ignore', 
                                                                                        n_features=2 ** 16,
                                                                                        non_negative=True, 
                                                                                        ngram_range=(1, 5))),
                                       ('tfidf_Text', TfidfTransformer()), 
                                       ('tsvd3', TruncatedSVD(n_components=300, n_iter=25, 
                                                                            random_state=12))]))

        
        ])
    )])


In [14]:
df_train['Text'] = [text_to_wordlist(w) for w in df_train['Text'].values]

In [15]:
df_test['Text'] = [text_to_wordlist(w) for w in df_test['Text'].values]

In [16]:
df_train.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,FAM58A,0,cyclin - depend kinas cdks regul varieti funda...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,1,39672,6105,3460,20,2,1654,27,3,1
1,CBL,1,abstract background non - small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,1,36691,5783,3748,5,1,941,9,2,2
2,CBL,2,abstract background non - small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,3,recent evid demonstr acquir uniparent disomi a...,N454D,CBL N454D,1,1,2,3,5,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,4,oncogen mutat monomer casita b - lineag lympho...,L399V,CBL L399V,1,1,2,3,5,...,1,41308,6248,1854,5,1,927,9,2,4


In [17]:
y = df_train['Class'].values

In [18]:
tr = df_train
te = df_test

In [19]:
# y = tr['Class'].values

In [20]:
set(te.columns) - set(tr.columns)

set()

In [21]:
df_train = df_train.drop('Class',axis=1)

In [22]:
# df_train = df_train.drop('ID', axis=1)
# df_test = df_test.drop('ID', axis=1)

In [23]:
# df_train = tr

In [24]:
df_train = fp.fit_transform(df_train)
print (df_train.shape)




(3689, 3945)


In [25]:
df_test = fp.fit_transform(df_test)
print (df_test.shape)



(986, 3945)


In [26]:
y = y - 1 #fix for zero bound array

In [27]:
 

denom = 0
fold = 10 
for i in range(fold):
    params = {
        'eta': 0.03333,
        'max_depth': 6,
        'subsample' : 0.8,
        'colsample_bytree':0.8,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 9,
        'seed': i,
        'silent': True
    }
    x1, x2, y1, y2 = train_test_split(df_train, y, test_size=0.2, random_state=i)
    watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
    model = xgb.train(params, xgb.DMatrix(x1, y1), 1000,  watchlist, verbose_eval=50, early_stopping_rounds=100)
    score1 = log_loss(y2, model.predict(xgb.DMatrix(x2), 
                                                ntree_limit=model.best_ntree_limit), labels = list(range(9)))
    print(score1)
    #if score < 0.9:
    if denom != 0:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds += pred
    else:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds = pred.copy()
    denom += 1
    submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    submission['ID'] = pid
    submission.to_csv('../submissions/submission1_2_2_xgb_fold_'  + str(i) + '.csv', index=False)


[0]	train-mlogloss:2.12645	valid-mlogloss:2.14067
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.814166	valid-mlogloss:1.15291
[100]	train-mlogloss:0.452439	valid-mlogloss:0.960481
[150]	train-mlogloss:0.280237	valid-mlogloss:0.901279
[200]	train-mlogloss:0.181032	valid-mlogloss:0.890509
[250]	train-mlogloss:0.121059	valid-mlogloss:0.894627
Stopping. Best iteration:
[184]	train-mlogloss:0.208369	valid-mlogloss:0.890166

0.89016613473
[0]	train-mlogloss:2.12807	valid-mlogloss:2.13738
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.812295	valid-mlogloss:1.09682
[100]	train-mlogloss:0.455408	valid-mlogloss:0.883845
[150]	train-mlogloss:0.284691	valid-mlogloss:0.814417
[200]	train-mlogloss:0.185225	valid-mlogloss:0.795225
[250]	train-mlo

In [28]:
submission = pd.DataFrame(preds/denom, columns=['class'+str(c+1) for c in range(9)])
submission['ID'] = pid
submission.to_csv('../submissions/submission_all_2_2_xgb.csv', index=False)


In [31]:
len(y)

3689

In [33]:
y

array([0, 1, 1, ..., 5, 3, 0])

In [29]:
y_pseudo = preds/denom

In [32]:
len(y_pseudo)

986

In [34]:
y_pseudo

array([[ 0.36826095,  0.03778919,  0.02021655, ...,  0.03634918,
         0.00809619,  0.00777789],
       [ 0.86791885,  0.02721421,  0.00515629, ...,  0.02514629,
         0.0041679 ,  0.0027892 ],
       [ 0.07454448,  0.09149207,  0.01409115, ...,  0.35999787,
         0.01568171,  0.01071824],
       ..., 
       [ 0.16088469,  0.2002444 ,  0.06238246, ...,  0.09231327,
         0.01163716,  0.00876702],
       [ 0.09382014,  0.17505959,  0.01044971, ...,  0.25275773,
         0.0062432 ,  0.00909903],
       [ 0.27108249,  0.24305496,  0.01503516, ...,  0.14290585,
         0.01782902,  0.00903406]], dtype=float32)

In [37]:
df_preds = pd.read_csv('../submissions/submission_all_2_2_xgb.csv',index_col=False)
df_preds = df_preds.drop('ID', axis=1)
df_preds['class'] = df_preds.idxmax(axis=1)
df_preds['class'] = df_preds['class'].str[-1].astype(int)
df_preds['class'] = df_preds['class'] -1
df_preds = df_preds.drop(['class1','class2','class3','class4','class5','class6',
                          'class7','class8','class9'],axis=1)
y_pseudo = df_preds['class'].values

In [38]:
Y = np.concatenate((y, y_pseudo), axis=0)

In [39]:
X = np.concatenate((df_train, df_test), axis=0)

In [41]:
denom = 0
fold = 10 #Change to 5, 1 for Kaggle Limits
for i in range(fold):
    params = {
        'eta': 0.03333,
        'max_depth': 6,
        'subsample' : 0.8,
        'colsample_bytree':0.8,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 9,
        'seed': i,
        'silent': True,
        'tree_method': 'gpu_hist'
    }
    x1, x2, y1, y2 = train_test_split(X, Y, test_size=0.2, random_state=i)
    watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
    model = xgb.train(params, xgb.DMatrix(x1, y1), 1000,  watchlist, verbose_eval=50, early_stopping_rounds=100)
    score1 = log_loss(y2, model.predict(xgb.DMatrix(x2), 
                                                ntree_limit=model.best_ntree_limit), labels = list(range(9)))
    print(score1)
    #if score < 0.9:
    if denom != 0:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds += pred
    else:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds = pred.copy()
    denom += 1
    submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    submission['ID'] = pid
    submission.to_csv('../submissions/submission_2_2_2_xgb_fold_'  + str(i) + '.csv', index=False)


[0]	train-mlogloss:2.12674	valid-mlogloss:2.13724
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.829239	valid-mlogloss:1.10881
[100]	train-mlogloss:0.462287	valid-mlogloss:0.891414
[150]	train-mlogloss:0.295943	valid-mlogloss:0.816415
[200]	train-mlogloss:0.195164	valid-mlogloss:0.789973
[250]	train-mlogloss:0.13397	valid-mlogloss:0.782631
[300]	train-mlogloss:0.095242	valid-mlogloss:0.785123
[350]	train-mlogloss:0.069508	valid-mlogloss:0.79311
Stopping. Best iteration:
[251]	train-mlogloss:0.133008	valid-mlogloss:0.782186

0.782186010988
[0]	train-mlogloss:2.12718	valid-mlogloss:2.13406
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.823479	valid-mlogloss:1.07715
[100]	train-mlogloss:0.458378	valid-mlogloss:0.854327
[150]	train-mlog

In [42]:
submission = pd.DataFrame(preds/denom, columns=['class'+str(c+1) for c in range(9)])
submission['ID'] = pid
submission.to_csv('../submissions/submission_all_2_2_2_xgb.csv', index=False)


In [None]:
# score 3.09512 on public leader board