In [None]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather
from sklearn.base import BaseEstimator as be
from sklearn.base import TransformerMixin as tm
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [None]:
# Inputs: the stage-2 sample submission, both variant tables and the stage-1 solution file.
stage1_test = pd.read_csv('../data/test_variants')
stage2_test = pd.read_csv('../data/stage2_test_variants.csv')
submission = pd.read_csv('../data/stage2_sample_submission.csv')
stage1_solution = pd.read_csv('../data/stage1_solution_filtered.csv')

# Bring the stage-1 test columns onto every stage-1 solution row via ID.
stage1_solution = pd.merge(stage1_solution, stage1_test, how='left', on='ID')

# Look each stage-2 test row up in the stage-1 solutions by (Gene, Variation).
# Any value still missing after the left join is filled with 1.
stage2_hack = (
    stage2_test
    .merge(stage1_solution.drop('ID', axis=1), how='left', on=['Gene', 'Variation'])
    .drop(['Gene', 'Variation'], axis=1)
    .fillna(1)
)


In [None]:
# Keep the IDs as an array before the ID column is dropped from the frame.
# Note: this name is not read again in the visible cells.
stage2_hack_ids = stage2_hack.ID.values

In [None]:
# Drop the identifier; the following cells treat the remaining columns as
# class1..class9 scores (presumably nothing else is left — verify with .head()).
stage2_hack = stage2_hack.drop('ID', axis=1)

In [None]:
# Per row, store the label of the column holding the largest value.
# NOTE(review): rows whose values were all filled with 1 upstream are tied, and
# idxmax resolves ties to the first column — confirm that label is intended.
stage2_hack['Class'] = stage2_hack.idxmax(axis=1)

In [None]:
# Turn the column label into an integer using its last character
# (e.g. 'class3' -> 3); this relies on the class numbers being single digits.
stage2_hack['Class'] = stage2_hack['Class'].map(lambda x: int(x[-1]))

In [None]:
# Inspect the derived Class column next to the score columns it came from.
stage2_hack.head()

In [None]:
# The per-class score columns are no longer needed once Class has been derived.
stage2_hack = stage2_hack.drop(['class' + str(k) for k in range(1, 10)], axis=1)

In [None]:
# Confirm that the score columns are gone.
stage2_hack.head()

In [None]:
# Row/column count of the pseudo-label frame.
stage2_hack.shape

In [None]:
# Load the cached stage-2 training frame (presumably the feature-engineered
# table, judging by the "_fe" file name — confirm against the notebook that writes it).
df_train = feather.read_dataframe('../cache/train_stage2_fe.feather')

In [None]:
# Keep the training IDs as an array; not read again in the visible cells.
train_ids = df_train.ID.values

In [None]:
# Remove the original ID; a fresh positional ID column is added in a later cell.
df_train = df_train.drop('ID', axis=1)

In [None]:
# Labels of the original training rows.
y = df_train['Class'].values

In [None]:
# Labels derived for the stage-2 test rows from the stage-1 solution lookup.
y_hack = stage2_hack['Class'].values

In [None]:
# Training labels followed by the derived test labels.
# Note: `y` is reassigned from df_train['Class'] in a later cell, after the
# frames have been concatenated.
y = np.concatenate((y, y_hack), axis=0)

In [None]:
# Build a labelled copy of the stage-2 test frame so it can be appended to the
# training data. `pid` keeps the test IDs for the submission files written later.
# Assumes the rows of this file line up one-to-one with stage2_hack — TODO confirm.
df_test = feather.read_dataframe('../cache/test_stage2_fe.feather')
pid = df_test['ID']
df_test = df_test.drop('ID', axis=1).assign(Class=y_hack)

In [None]:
# Append the labelled test rows beneath the training rows. The index is not
# reset, so index labels from the two frames may repeat.
df_train = pd.concat((df_train, df_test), axis=0)

In [None]:
# Size of the combined training frame.
df_train.shape

In [None]:
# Preview the combined training frame.
df_train.head()

In [None]:
# Reload the test frame from disk, discarding the copy that had ID removed and
# Class added; from here on df_test is the unlabelled frame used for prediction.
df_test = feather.read_dataframe('../cache/test_stage2_fe.feather')

In [None]:
# Size of the reloaded test frame.
df_test.shape

In [None]:
# Preview the reloaded test frame.
df_test.head()

In [None]:
########################################
## process texts in datasets
########################################


# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    """Normalise a raw document into one cleaned, space-separated string.

    The text is lower-cased and split on whitespace, optionally stripped of
    stop words, re-joined, passed through a fixed sequence of regex
    substitutions, and optionally stemmed.

    Parameters
    ----------
    text : str
        Raw document. ``None`` and NaN (a missing DataFrame cell) are treated
        as an empty document instead of raising ``AttributeError``.
    remove_stopwords : bool, default True
        Drop NLTK English stop words plus a short list of boilerplate words.
        Filtering runs before punctuation is cleaned, so a token such as
        ``"figure,"`` is not removed.
    stem_words : bool, default True
        Stem every remaining token with the English Snowball stemmer.

    Returns
    -------
    str
        The cleaned text. This is a single string, not a list of tokens. When
        ``stem_words`` is False the string may keep a trailing space.
    """
    # Missing cells show up as None or float NaN; NaN is the only float that
    # compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    # Convert words to lower case and split them
    words = text.lower().split()

    my_stopwords = [
        "fig", "figure", "et", "al", "table",
        "data", "analysis", "analyze", "study",
        "method", "result", "conclusion", "author",
        "find", "found", "show", "perform",
        "demonstrate", "evaluate", "discuss"
    ]

    # Optionally, remove stop words (one pass over a single combined set)
    if remove_stopwords:
        blocked = set(stopwords.words("english"))
        blocked.update(my_stopwords)
        words = [w for w in words if w not in blocked]

    text = " ".join(words)

    # Ordered (pattern, replacement) pairs; the order matters because later
    # patterns rely on the spacing introduced by earlier ones.
    substitutions = (
        # Inside this class "+-=" is a character RANGE ('+' through '='), so
        # ':', ';' and '<' are kept as well as the symbols listed explicitly.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): "\0" is an octal escape for NUL, so this matches NUL
        # followed by 's'. NUL was already replaced by the first pattern, so
        # this looks like a no-op; it is kept unchanged.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

In [None]:
class cust_regression_vals(be, tm):
    """Stateless transformer that drops the text/identifier columns and returns
    the remaining columns as a plain array."""

    def fit(self, x, y=None):
        # Nothing to learn.
        return self

    def transform(self, x):
        # All five columns must be present; drop() raises otherwise.
        excluded = ['Gene', 'Variation', 'ID','Text', 'GeneVar']
        remaining = x.drop(excluded, axis=1)
        return remaining.values

class cust_txt_col(be, tm):
    """Stateless transformer that selects one column and converts every value
    to ``str`` so it can be fed to a text vectorizer."""

    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def fit(self, x, y=None):
        # Nothing to learn.
        return self

    def transform(self, x):
        column = x[self.key]
        return column.apply(str)


In [None]:

# Feature pipeline: one FeatureUnion that places side by side
#   * the non-text columns, passed through as-is,
#   * 20-component SVDs of character 1-3-gram counts for Gene, Variation and GeneVar,
#   * a 300-component SVD of TF-IDF-weighted hashed word 1-5-grams of Text.
# NOTE(review): `non_negative` was removed from HashingVectorizer in later
# scikit-learn releases — confirm the pinned version before upgrading.
fp = Pipeline(steps=[
    ('union', FeatureUnion(
        transformer_list=[
            ('standard', cust_regression_vals()),
            ('pi1', Pipeline(steps=[
                ('Gene', cust_txt_col('Gene')),
                ('count_Gene', CountVectorizer(analyzer=u'char', ngram_range=(1, 3))),
                ('tsvd1', TruncatedSVD(n_components=20, n_iter=25, random_state=12)),
            ])),
            ('pi2', Pipeline(steps=[
                ('Variation', cust_txt_col('Variation')),
                ('count_Variation', CountVectorizer(analyzer=u'char', ngram_range=(1, 3))),
                ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, random_state=12)),
            ])),
            ('pi3', Pipeline(steps=[
                ('GeneVar', cust_txt_col('GeneVar')),
                ('count_GeneVar', CountVectorizer(analyzer=u'char', ngram_range=(1, 3))),
                ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, random_state=12)),
            ])),
            ('pi4', Pipeline(steps=[
                ('Text', cust_txt_col('Text')),
                ('hv', HashingVectorizer(
                    decode_error='ignore',
                    n_features=2 ** 16,
                    non_negative=True,
                    ngram_range=(1, 5),
                )),
                ('tfidf_Text', TfidfTransformer()),
                ('tsvd3', TruncatedSVD(n_components=300, n_iter=25, random_state=12)),
            ])),
        ],
        n_jobs=-1,
    )),
])


In [None]:
# Clean every training document with the default options
# (stop-word removal and stemming both enabled).
df_train['Text'] = [text_to_wordlist(w) for w in df_train['Text'].values]

In [None]:
# Apply the same cleaning to the test documents.
df_test['Text'] = [text_to_wordlist(w) for w in df_test['Text'].values]

In [None]:
# Check the cleaned Text column.
df_train.head()

In [None]:
# Re-derive the labels from the combined frame (training rows plus labelled
# test rows); this replaces the `y` built earlier by concatenation.
y = df_train['Class'].values

In [None]:
# Re-create an ID column as a simple row counter; cust_regression_vals.transform
# drops 'ID' by name, so the column has to exist.
df_train['ID'] = list(range(len(df_train)))

In [None]:
# Extra names for the current frames (no copy is made). Later cells rebind
# df_train / df_test, so tr / te keep pointing at the frames as they are here.
tr = df_train
te = df_test

In [None]:
# y = tr['Class'].values

In [None]:
# Columns present in the test frame but absent from the training frame.
set(te.columns) - set(tr.columns)

In [None]:
# Remove the target so it cannot reach the feature matrix (labels are already in `y`).
df_train = df_train.drop('Class',axis=1)

In [None]:
# df_train = df_train.drop('ID', axis=1)
# df_test = df_test.drop('ID', axis=1)

In [None]:
# df_train = tr

In [None]:
# Final look at the frame that goes into the feature pipeline.
df_train.head()

In [None]:
# Shape before feature extraction.
df_train.shape

In [None]:
# Fit the feature pipeline on the combined training frame and replace df_train
# with the resulting feature matrix (the DataFrame itself is no longer
# reachable through this name).
df_train = fp.fit_transform(df_train)
print (df_train.shape)


In [None]:
# Project the test frame with the transformers that were fitted on the training
# frame. Calling fit_transform here would re-fit the vectorizers, TF-IDF weights
# and SVD components on the test set, so the test features would no longer be
# in the feature space the models are trained on.
df_test = fp.transform(df_test)
print (df_test.shape)

In [None]:
# Shift the labels down by one so they start at 0, matching num_class=9 and
# labels=list(range(9)) used below (presumably the raw classes are 1..9).
# Caution: running this cell twice shifts the labels twice.
y = y - 1 #fix for zero bound array

In [None]:
 

# Round 1: train `fold` XGBoost models, each on a different random 80/20 split of
# the training matrix, and accumulate their test-set probabilities in `preds`
# (averaged with `denom` in the next cell). Each model's own prediction is also
# written to its own submission file.
denom = 0
fold = 10
# The test matrix is the same for every model, so wrap it only once.
dtest = xgb.DMatrix(df_test)
for i in range(fold):
    params = {
        'eta': 0.02,
        'max_depth': 5,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 9,
        'seed': i,
        'silent': True
    }
    x1, x2, y1, y2 = train_test_split(df_train, y, test_size=0.2, random_state=i)
    # One training DMatrix serves both the watchlist and the booster.
    dtrain = xgb.DMatrix(x1, y1)
    watchlist = [(dtrain, 'train'), (xgb.DMatrix(x2, y2), 'valid')]
    model = xgb.train(params, dtrain, 1000, watchlist, verbose_eval=50, early_stopping_rounds=100)
    # Hold-out log-loss at the early-stopping optimum.
    score1 = log_loss(y2, model.predict(xgb.DMatrix(x2),
                                        ntree_limit=model.best_ntree_limit), labels = list(range(9)))
    print(score1)
    # NOTE(review): test predictions use 80 rounds beyond the early-stopping
    # optimum; the reason is not visible in this notebook — confirm it is intended.
    pred = model.predict(dtest, ntree_limit=model.best_ntree_limit+80)
    preds = pred.copy() if denom == 0 else preds + pred
    denom += 1
    submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    submission['ID'] = pid
    submission.to_csv('../submissions/submission2_3_xgb_fold_'  + str(i) + '.csv', index=False)


In [None]:
# Average the accumulated per-model probabilities and write the round-1 submission.
submission = pd.DataFrame(
    preds / denom,
    columns=['class%d' % (c + 1) for c in range(9)],
).assign(ID=pid)
submission.to_csv('../submissions/submission_all_2_3_xgb.csv', index=False)


In [None]:
# Turn the averaged round-1 probabilities into hard, zero-based pseudo-labels:
# take the most probable class column per row, read its trailing digit, subtract 1.
df_preds = pd.read_csv('../submissions/submission_all_2_3_xgb.csv', index_col=False).drop('ID', axis=1)
df_preds = (df_preds.idxmax(axis=1).str[-1].astype(int) - 1).to_frame('class')
y_pseudo = df_preds['class'].values

In [None]:
# Round-2 training set: the round-1 feature matrix and labels, plus the test
# feature matrix labelled with the model's own predictions.
# NOTE(review): df_train already contains the test rows appended earlier with
# labels from the stage-1 lookup, so each test row appears twice in X — confirm
# this is intended.
Y = np.concatenate((y, y_pseudo), axis=0)
X = np.concatenate((df_train, df_test), axis=0)

In [None]:
# Round 2: same bagging scheme as round 1, but trained on the pseudo-labelled
# set (X, Y), with 5 models and the GPU histogram tree method. `preds` and
# `denom` are restarted here and averaged in the next cell.
denom = 0
fold = 5
# The test matrix is the same for every model, so wrap it only once.
dtest = xgb.DMatrix(df_test)
for i in range(fold):
    params = {
        'eta': 0.02,
        'max_depth': 5,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 9,
        'seed': i,
        'silent': True,
        'tree_method': 'gpu_hist'
    }
    x1, x2, y1, y2 = train_test_split(X, Y, test_size=0.2, random_state=i)
    # One training DMatrix serves both the watchlist and the booster.
    dtrain = xgb.DMatrix(x1, y1)
    watchlist = [(dtrain, 'train'), (xgb.DMatrix(x2, y2), 'valid')]
    model = xgb.train(params, dtrain, 1000, watchlist, verbose_eval=50, early_stopping_rounds=100)
    # Hold-out log-loss at the early-stopping optimum.
    score1 = log_loss(y2, model.predict(xgb.DMatrix(x2),
                                        ntree_limit=model.best_ntree_limit), labels = list(range(9)))
    print(score1)
    # NOTE(review): test predictions use 80 rounds beyond the early-stopping
    # optimum; the reason is not visible in this notebook — confirm it is intended.
    pred = model.predict(dtest, ntree_limit=model.best_ntree_limit+80)
    preds = pred.copy() if denom == 0 else preds + pred
    denom += 1
    submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    submission['ID'] = pid
    submission.to_csv('../submissions/submission_2_2_3_xgb_fold_'  + str(i) + '.csv', index=False)


In [None]:
# Average the round-2 per-model probabilities and write the final submission.
submission = pd.DataFrame(
    preds / denom,
    columns=['class%d' % (c + 1) for c in range(9)],
).assign(ID=pid)
submission.to_csv('../submissions/submission_all_2_2_3_xgb.csv', index=False)
