In [1]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather
from sklearn.base import BaseEstimator as be
from sklearn.base import TransformerMixin as tm
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
df_train = feather.read_dataframe('../cache/train_stage1_fe.feather')

In [3]:
df_train.shape

(3321, 3285)

In [4]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,1,7654,20,2,39672,6105,3213,27,3,1
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,1,8255,5,1,36691,5783,1680,9,2,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,1,5191,5,1,36691,5783,1672,9,2,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,1,4572,5,1,36238,5625,1668,9,2,3
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,1,3958,5,1,41308,6248,1666,9,2,4


In [6]:
df_test = feather.read_dataframe('../cache/test_stage1_fe.feather')

In [7]:
df_test.shape

(5668, 3284)

In [8]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...,ACSL4 R570S,0,1,1,5,5,...,5,1,6404,5,1,49829,7495,188,11,2
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...,NAGLU P521L,0,1,1,5,5,...,5,1,5005,5,1,31326,4762,5540,11,2
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...,PAH L333F,0,1,1,3,5,...,3,1,3915,5,1,75282,11191,6023,9,2
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...,ING1 A148D,0,1,1,4,5,...,4,1,85,5,1,53996,8439,4354,10,2
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...,TMEM216 G77A,0,1,1,7,4,...,7,1,2780,4,1,76967,11226,8211,12,2


In [9]:
pid = df_test.ID.values

In [10]:
########################################
## process texts in datasets
########################################


# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    my_stopwords = [
        "fig", "figure", "et", "al", "table",
        "data", "analysis", "analyze", "study",
        "method", "result", "conclusion", "author",
        "find", "found", "show", "perform",
        "demonstrate", "evaluate", "discuss"
    ]
    
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = [w for w in text if not w in my_stopwords]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [11]:
# from https://www.kaggle.com/the1owl/redefining-treatment-0-57456
class cust_regression_vals(be, tm):
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        x = x.drop(['Gene', 'Variation', 'ID','Text', 'GeneVar'],axis=1).values
        return x

class cust_txt_col(be, tm):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x[self.key].apply(str)


In [12]:
# from https://www.kaggle.com/the1owl/redefining-treatment-0-57456
fp = Pipeline([
    ('union', FeatureUnion(
        n_jobs = -1,
        transformer_list = [
            ('standard', cust_regression_vals()),
            ('pi1', Pipeline([('Gene', cust_txt_col('Gene')), 
                                       ('count_Gene', CountVectorizer(analyzer=u'char',ngram_range=(1, 3))), 
                                       ('tsvd1', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
            ('pi2', Pipeline([('Variation', cust_txt_col('Variation')), 
                                       ('count_Variation', CountVectorizer(analyzer=u'char',ngram_range=(1, 3))), 
                                       ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
            ('pi3', Pipeline([('GeneVar', cust_txt_col('GeneVar')), 
                                       ('count_GeneVar', CountVectorizer(analyzer=u'char', ngram_range=(1, 3))), 
                                       ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, 
                                                                            random_state=12))])),
            ('pi4',Pipeline([('Text', cust_txt_col('Text')), 
                                       ('hv', HashingVectorizer(decode_error='ignore', 
                                                                                        n_features=2 ** 16,
                                                                                        non_negative=True, 
                                                                                        ngram_range=(1, 5))),
                                       ('tfidf_Text', TfidfTransformer()), 
                                       ('tsvd3', TruncatedSVD(n_components=300, n_iter=25, 
                                                                            random_state=12))]))

        
        ])
    )])


In [13]:
df_train['Text'] = [text_to_wordlist(w) for w in df_train['Text'].values]

In [14]:
df_test['Text'] = [text_to_wordlist(w) for w in df_test['Text'].values]

In [15]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,0,FAM58A,Truncating Mutations,cyclin - depend kinas cdks regul varieti funda...,FAM58A Truncating Mutations,1,1,2,6,20,...,1,7654,20,2,39672,6105,3213,27,3,1
1,1,CBL,W802*,abstract background non - small cell lung canc...,CBL W802*,1,1,2,3,5,...,1,8255,5,1,36691,5783,1680,9,2,2
2,2,CBL,Q249E,abstract background non - small cell lung canc...,CBL Q249E,1,1,2,3,5,...,1,5191,5,1,36691,5783,1672,9,2,2
3,3,CBL,N454D,recent evid demonstr acquir uniparent disomi a...,CBL N454D,1,1,2,3,5,...,1,4572,5,1,36238,5625,1668,9,2,3
4,4,CBL,L399V,oncogen mutat monomer casita b - lineag lympho...,CBL L399V,1,1,2,3,5,...,1,3958,5,1,41308,6248,1666,9,2,4


In [16]:
y = df_train['Class'].values

In [17]:
tr = df_train
te = df_test

In [18]:
# y = tr['Class'].values

In [19]:
set(te.columns) - set(tr.columns)

set()

In [20]:
df_train = df_train.drop('Class',axis=1)

In [21]:
# df_train = df_train.drop('ID', axis=1)
# df_test = df_test.drop('ID', axis=1)

In [22]:
# df_train = tr

In [23]:
df_train = fp.fit_transform(df_train)
print (df_train.shape)




(3321, 3639)


In [24]:
df_test = fp.fit_transform(df_test)
print (df_test.shape)



(5668, 3639)


In [39]:
np.save('../cache/train_stage1_fe2', df_train)
np.save('../cache/test_stage1_fe2', df_test)

In [26]:
y = y - 1 #fix for zero bound array

In [27]:
 

denom = 0
fold = 10 
for i in range(fold):
    params = {
        'eta': 0.03333,
        'max_depth': 6,
        'subsample' : 0.8,
        'colsample_bytree':0.8,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 9,
        'seed': i,
        'silent': True
    }
    x1, x2, y1, y2 = train_test_split(df_train, y, test_size=0.2, random_state=i)
    watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
    model = xgb.train(params, xgb.DMatrix(x1, y1), 1000,  watchlist, verbose_eval=50, early_stopping_rounds=100)
    score1 = log_loss(y2, model.predict(xgb.DMatrix(x2), 
                                                ntree_limit=model.best_ntree_limit), labels = list(range(9)))
    print(score1)
    #if score < 0.9:
    if denom != 0:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds += pred
    else:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds = pred.copy()
    denom += 1
    submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    submission['ID'] = pid
    submission.to_csv('../submissions/sub_stage1_2_2_xgb_fold_'  + str(i) + '.csv', index=False)


[0]	train-mlogloss:2.12726	valid-mlogloss:2.14125
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.799957	valid-mlogloss:1.19866
[100]	train-mlogloss:0.438798	valid-mlogloss:1.01021
[150]	train-mlogloss:0.266687	valid-mlogloss:0.963871
[200]	train-mlogloss:0.1701	valid-mlogloss:0.953723
[250]	train-mlogloss:0.112619	valid-mlogloss:0.961239
Stopping. Best iteration:
[199]	train-mlogloss:0.17176	valid-mlogloss:0.95365

0.953650406181
[0]	train-mlogloss:2.12734	valid-mlogloss:2.1407
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.797302	valid-mlogloss:1.16073
[100]	train-mlogloss:0.435802	valid-mlogloss:0.969399
[150]	train-mlogloss:0.264644	valid-mlogloss:0.920349
[200]	train-mlogloss:0.167103	valid-mlogloss:0.915376
[250]	train-mlogloss

In [28]:
submission = pd.DataFrame(preds/denom, columns=['class'+str(c+1) for c in range(9)])
submission['ID'] = pid
submission.to_csv('../submissions/sub_stage1_all_2_2_xgb.csv', index=False)


In [29]:
len(y)

3321

In [30]:
y

array([0, 1, 1, ..., 0, 3, 3])

In [31]:
y_pseudo = preds/denom

In [32]:
len(y_pseudo)

5668

In [33]:
y_pseudo

array([[ 0.42293793,  0.04093169,  0.02649179, ...,  0.04608678,
         0.00649906,  0.00520817],
       [ 0.06321549,  0.16644001,  0.01254174, ...,  0.34201264,
         0.00726146,  0.00739465],
       [ 0.02275894,  0.08454458,  0.00732368, ...,  0.72487676,
         0.00386394,  0.00228659],
       ..., 
       [ 0.36186638,  0.03774822,  0.05082253, ...,  0.08273716,
         0.00941261,  0.0176475 ],
       [ 0.12545305,  0.11037054,  0.01478463, ...,  0.11276084,
         0.01581366,  0.01079275],
       [ 0.20397992,  0.18298969,  0.01799526, ...,  0.15102887,
         0.00847349,  0.01424411]], dtype=float32)

In [35]:
df_preds = pd.read_csv('../submissions/sub_stage1_all_2_2_xgb.csv',index_col=False)
df_preds = df_preds.drop('ID', axis=1)
df_preds['class'] = df_preds.idxmax(axis=1)
df_preds['class'] = df_preds['class'].str[-1].astype(int)
df_preds['class'] = df_preds['class'] -1
df_preds = df_preds.drop(['class1','class2','class3','class4','class5','class6',
                          'class7','class8','class9'],axis=1)
y_pseudo = df_preds['class'].values

In [36]:
Y = np.concatenate((y, y_pseudo), axis=0)

In [37]:
X = np.concatenate((df_train, df_test), axis=0)

In [40]:
denom = 0
fold = 10 
for i in range(fold):
    params = {
        'eta': 0.03333,
        'max_depth': 6,
        'subsample' : 0.8,
        'colsample_bytree':0.8,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 9,
        'seed': i,
        'silent': True,
        'tree_method': 'gpu_hist'
    }
    x1, x2, y1, y2 = train_test_split(X, Y, test_size=0.2, random_state=i)
    watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
    model = xgb.train(params, xgb.DMatrix(x1, y1), 1000,  watchlist, verbose_eval=50, early_stopping_rounds=100)
    score1 = log_loss(y2, model.predict(xgb.DMatrix(x2), 
                                                ntree_limit=model.best_ntree_limit), labels = list(range(9)))
    print(score1)
    #if score < 0.9:
    if denom != 0:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds += pred
    else:
        pred = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit+80)
        preds = pred.copy()
    denom += 1
    submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    submission['ID'] = pid
    submission.to_csv('../submissions/sub_stage1_pseudo_2_2_xgb_fold_'  + str(i) + '.csv', index=False)


[0]	train-mlogloss:2.1168	valid-mlogloss:2.12448
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.796193	valid-mlogloss:0.993495


KeyboardInterrupt: 

In [None]:
submission = pd.DataFrame(preds/denom, columns=['class'+str(c+1) for c in range(9)])
submission['ID'] = pid
submission.to_csv('../submissions/sub_stage1_all_2_2_2_xgb.csv', index=False)


In [None]:
# score 0.4571 on stage1 public leader board