##Import Required Files

In [1]:
import os
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import  metrics, naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorizati on.
drive.mount('/content/drive')

eng_stopwords = nltk.download('stopwords')
pd.options.mode.chained_assignment = None


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


###Functions to train and predict for new dataset .

In [0]:
#Naive Bayes on Word Tfidf Vectorizer:
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    #print(pred_test_y.shape, pred_test_y2.shape)
    return pred_test_y, pred_test_y2, model

def log_reg(train_X, train_y, test_X, test_y, test_X2):
    weights = dict(zip((pd.Series(train_y).value_counts().index).astype(int), (pd.Series(train_y).value_counts(normalize=True).values)))
    #print(weights)
    model = LogisticRegression(C=1.0,multi_class = 'multinomial',
                               solver = 'newton-cg',class_weight=weights)
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    #print(pred_test_y.shape, pred_test_y2.shape)
    return pred_test_y, pred_test_y2, model

def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 10
    param['silent'] = 1
    param['num_class'] = len(np.unique(train_y))
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 10000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

In [0]:
#lets load dataset into the environment
all_data = pd.read_feather('drive/My Drive/datasets/cleaned_data.ft')


In [0]:
#Split into train and test dataframes
train = all_data[~all_data.isna().any(axis=1)]
test = all_data[all_data.isna().any(axis=1)]

#lets upsample the underrepresented class 
for i in train['Labels'].value_counts().index[train['Labels'].value_counts().values <10]:
  while(train[train['Labels'] == i].shape[0]<10):
    train = train.append(train[train['Labels'] == i].reset_index(drop=True))
#train['Script'] = train['Script'].apply(remove_start)


In [0]:
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3), analyzer='word',max_df =.95,\
                            use_idf=1, sublinear_tf=1, min_df=3, max_features=20000, strip_accents='ascii')
full_tfidf = tfidf_vec.fit_transform(train['Script'].values.tolist() + test['Script'].values.tolist())
train_tfidf = tfidf_vec.transform(train['Script'].values.tolist())
test_tfidf = tfidf_vec.transform(test['Script'].values.tolist())


Probabilities of Naive bayes classifier are used as features for ensemble learning at the end

In [93]:
cv_scores = []
pred_full_test = 0
kfold_splits = 3
pred_train = np.zeros([train.shape[0], len(train['Labels'].value_counts())])
kf = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=2017)
for dev_index, val_index in kf.split(train,train['Labels']):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train['Labels'].iloc[dev_index], train['Labels'].iloc[val_index]
    pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
pred_full_test = pred_full_test / kfold_splits 

  

Mean cv score :  3.6411891659365083


In [0]:
final_dftrain = pd.DataFrame()
final_dftest = pd.DataFrame()
# Add the predictions of naivebayes classifier as new features 
for i in range(0,22,1):
  final_dftrain["nb_tfidf_char"+str(i)] = pred_train[:,i]
  final_dftest["nb_tfidf_char"+str(i)] = pred_full_test[:,i]


Probabilities of logistic regression classifier are used as features for ensemble learning at the end 

In [96]:
cv_scores = []
pred_full_test = 0
kfold_splits = 3
pred_train = np.zeros([train.shape[0], len(train['Labels'].value_counts())])
kf = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=2017)
for dev_index, val_index in kf.split(train,train['Labels']):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train['Labels'].iloc[dev_index], train['Labels'].iloc[val_index]
    pred_val_y, pred_test_y, model = log_reg(dev_X, dev_y, val_X, val_y, test_tfidf)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
pred_full_test = pred_full_test / kfold_splits


# add the predictions of logistic regression as  new features to learn at the end#
for i in range(0,22,1):
  final_dftrain["lr_tfidf_char"+str(i)] = pred_train[:,i]
  final_dftest["lr_tfidf_char"+str(i)] = pred_full_test[:,i]


Mean cv score :  2.784481302166791


Create 40 components with help of  SVD ,to use in the final ensemble model

In [97]:
n_comp = 40
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
    
train_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
train_svd.shape,test_svd.shape

((2065, 40), (849, 40))

In [98]:
train_df = pd.concat([train.reset_index(drop=True), train_svd], axis=1)
train_df = pd.concat([train_df, final_dftrain], axis=1)

test_df = pd.concat([test.reset_index(drop=True), test_svd], axis=1)
test_df = pd.concat([test_df, final_dftest], axis=1)

cols_to_drop = ['File_Name', 'Script']
train_X = train_df.drop(cols_to_drop+['Labels'], axis=1)
test_X = test_df.drop(cols_to_drop+['Labels'], axis=1)


((2065, 3), (2065, 40), (849, 87))

###Joined the outputs of  naive bayes & logistic regression probabilities with SVD components.

In [103]:
kfold_splits = 3
kf = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=2017)
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], len(train['Labels'].value_counts())])
for dev_index, val_index in kf.split(train_X,train_df['Labels']):
    dev_X, val_X = train_X.loc[dev_index], train_X.loc[val_index]
    dev_y, val_y = train['Labels'].iloc[dev_index], train['Labels'].iloc[val_index]
    pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, test_X, seed_val=0, colsample=0.7)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    break
    
pred_full_test = pred_full_test / kfold_splits

[0]	train-mlogloss:2.83586	test-mlogloss:2.95544
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[20]	train-mlogloss:1.00221	test-mlogloss:2.31458
[40]	train-mlogloss:0.587272	test-mlogloss:2.25595
[60]	train-mlogloss:0.453294	test-mlogloss:2.30448
[80]	train-mlogloss:0.400465	test-mlogloss:2.37632
Stopping. Best iteration:
[42]	train-mlogloss:0.566899	test-mlogloss:2.25492

cv scores :  [2.2549159187277446]


###lets convert the output dataframe into submission form  as provided in sample.xlsx

In [104]:
test_set_preds = pd.DataFrame(pred_full_test)
test_set_preds.reset_index(inplace=True,drop=True)
final_submission = pd.merge(test['File_Name'],test_set_preds,on=test_set_preds.index).drop('key_0',axis=1)
final_submission.to_excel('test_set_preds_4.xlsx', index=False)
final_submission.head(2)

Unnamed: 0,File_Name,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,file_2300.txt,0.004492,0.003791,0.003594,0.003557,0.023252,0.00576,0.100751,0.003987,0.00383,0.003552,0.003557,0.01129,0.004565,0.003607,0.007332,0.009261,0.00435,0.003565,0.003565,0.115861,0.00552,0.004294
1,file_809.txt,0.002903,0.002979,0.002732,0.002546,0.129413,0.004127,0.03613,0.003763,0.00507,0.002542,0.002545,0.002649,0.002555,0.002935,0.003236,0.10891,0.003436,0.002551,0.002551,0.004643,0.002555,0.002561
