In [1]:
import numpy as np
import pandas as pd
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
from tqdm import tqdm
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import multiprocessing

In [2]:
dtypes = {
    'comment_text'   : np.unicode ,
    'toxic':         np.int16, 
    'severe_toxic': np.int16,
    'obscene': np.int16,
    'threat': np.int16,
    'insult': np.int16,
    'identity_hate': np.int16
}

train = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/train.csv', dtype=dtypes, encoding='utf-8')
test = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/test.csv', dtype=dtypes, encoding='utf-8')

train.comment_text.fillna("unknown", inplace=True)
test.comment_text.fillna("unknown",  inplace=True)

subm = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"].values.astype(str)}, dtype=np.str)

In [3]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],
                                                          train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
                                                          test_size=0.2, random_state=42)

In [4]:
# Tokenization

re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

transform_com = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1).fit(train['comment_text'])

comments_train = transform_com.transform(train_mes)
comments_valid = transform_com.transform(valid_mes)
comments_test = transform_com.transform(test['comment_text'])
gc.collect()

train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)
data = [train_mes, valid_mes, test]



In [5]:
# These features are borrowed from https://www.kaggle.com/eikedehling/feature-engineering

for element in data:
    element['total_length'] = element['comment_text'].apply(len)
    element['capitals'] = element['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    element['caps_vs_length'] = element.apply(lambda row: float(row['capitals'])/float(row['total_length']), axis=1)
    element['num_exclamation_marks'] = element['comment_text'].apply(lambda comment: comment.count('!'))
    element['num_question_marks'] = element['comment_text'].apply(lambda comment: comment.count('?'))
    element['num_punctuation'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    element['num_symbols'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
    element['num_words'] = element['comment_text'].apply(lambda comment: len(comment.split()))
    element['num_unique_words'] = element['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    element['words_vs_unique'] = element['num_unique_words'] / element['num_words']
    element['num_smilies'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    
col = ['total_length', 'capitals', 'caps_vs_length',
       'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
       'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
       'num_smilies']

columns = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

In [6]:
# Combine both the Feature Vectors and the Engineered Features

train_mes = scipy.sparse.csr_matrix(train_mes[col].values)
valid_mes = scipy.sparse.csr_matrix(valid_mes[col].values)
test = scipy.sparse.csr_matrix(test[col].values)


comments_train = scipy.sparse.hstack([train_mes.tocsr(),comments_train.tocsr()])
comments_valid = scipy.sparse.hstack([valid_mes,comments_valid])
comments_test = scipy.sparse.hstack([test,comments_test])

### XGBoost

In [7]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'valid') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=10)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [8]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(comments_train, train_l[j], comments_valid, valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-auc:0.77069	valid-auc:0.76043
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[1]	train-auc:0.81829	valid-auc:0.80742
[2]	train-auc:0.84045	valid-auc:0.83068
[3]	train-auc:0.84214	valid-auc:0.83285
[4]	train-auc:0.84724	valid-auc:0.83778
[5]	train-auc:0.85171	valid-auc:0.84059
[6]	train-auc:0.85736	valid-auc:0.84577
[7]	train-auc:0.85838	valid-auc:0.84684
[8]	train-auc:0.86118	valid-auc:0.85071
[9]	train-auc:0.86550	valid-auc:0.85320
[10]	train-auc:0.86760	valid-auc:0.85520
[11]	train-auc:0.87045	valid-auc:0.85660
[12]	train-auc:0.87269	valid-auc:0.85781
[13]	train-auc:0.87454	valid-auc:0.85902
[14]	train-auc:0.87793	valid-auc:0.86217
[15]	train-auc:0.88193	valid-auc:0.86486
[16]	train-auc:0.88389	valid-auc:0.86649
[17]	train-auc:0.88510	valid-auc:0.86769
[18]	train-auc:0.88764	valid-auc:0.87034
[19]	train-auc:0.89046	valid-auc:0.87304
[20]	train-auc:0.89307	valid-auc:0.87

[195]	train-auc:0.97965	valid-auc:0.93950
[196]	train-auc:0.97972	valid-auc:0.93953
[197]	train-auc:0.97994	valid-auc:0.93955
[198]	train-auc:0.97998	valid-auc:0.93958
[199]	train-auc:0.98011	valid-auc:0.93965
[200]	train-auc:0.98017	valid-auc:0.93963
[201]	train-auc:0.98032	valid-auc:0.93961
[202]	train-auc:0.98045	valid-auc:0.93968
[203]	train-auc:0.98051	valid-auc:0.93961
[204]	train-auc:0.98070	valid-auc:0.93982
[205]	train-auc:0.98083	valid-auc:0.93982
[206]	train-auc:0.98089	valid-auc:0.93981
[207]	train-auc:0.98097	valid-auc:0.93975
[208]	train-auc:0.98108	valid-auc:0.93976
[209]	train-auc:0.98118	valid-auc:0.93979
[210]	train-auc:0.98137	valid-auc:0.93989
[211]	train-auc:0.98145	valid-auc:0.93994
[212]	train-auc:0.98155	valid-auc:0.93996
[213]	train-auc:0.98162	valid-auc:0.94001
[214]	train-auc:0.98174	valid-auc:0.94010
[215]	train-auc:0.98182	valid-auc:0.94022
[216]	train-auc:0.98190	valid-auc:0.94017
[217]	train-auc:0.98209	valid-auc:0.94020
[218]	train-auc:0.98218	valid-auc:

[19]	train-auc:0.94934	valid-auc:0.92375
[20]	train-auc:0.95383	valid-auc:0.93523
[21]	train-auc:0.96298	valid-auc:0.94974
[22]	train-auc:0.96462	valid-auc:0.95050
[23]	train-auc:0.96647	valid-auc:0.95059
[24]	train-auc:0.96670	valid-auc:0.95113
[25]	train-auc:0.96864	valid-auc:0.95241
[26]	train-auc:0.96894	valid-auc:0.95277
[27]	train-auc:0.97371	valid-auc:0.96145
[28]	train-auc:0.97558	valid-auc:0.96274
[29]	train-auc:0.97698	valid-auc:0.96272
[30]	train-auc:0.97773	valid-auc:0.96285
[31]	train-auc:0.97779	valid-auc:0.96273
[32]	train-auc:0.97777	valid-auc:0.96259
[33]	train-auc:0.97859	valid-auc:0.96488
[34]	train-auc:0.97971	valid-auc:0.96478
[35]	train-auc:0.98038	valid-auc:0.96591
[36]	train-auc:0.98074	valid-auc:0.96615
[37]	train-auc:0.98469	valid-auc:0.97003
[38]	train-auc:0.98504	valid-auc:0.97005
[39]	train-auc:0.98619	valid-auc:0.97032
[40]	train-auc:0.98718	valid-auc:0.97019
[41]	train-auc:0.98746	valid-auc:0.96995
[42]	train-auc:0.98803	valid-auc:0.97009
[43]	train-auc:0

[99]	train-auc:0.98824	valid-auc:0.96336
[100]	train-auc:0.98836	valid-auc:0.96341
[101]	train-auc:0.98846	valid-auc:0.96366
[102]	train-auc:0.98866	valid-auc:0.96377
[103]	train-auc:0.98882	valid-auc:0.96396
[104]	train-auc:0.98899	valid-auc:0.96396
[105]	train-auc:0.98904	valid-auc:0.96396
[106]	train-auc:0.98914	valid-auc:0.96394
[107]	train-auc:0.98919	valid-auc:0.96404
[108]	train-auc:0.98931	valid-auc:0.96402
[109]	train-auc:0.98941	valid-auc:0.96401
[110]	train-auc:0.98963	valid-auc:0.96436
[111]	train-auc:0.98976	valid-auc:0.96457
[112]	train-auc:0.98988	valid-auc:0.96461
[113]	train-auc:0.98996	valid-auc:0.96471
[114]	train-auc:0.99003	valid-auc:0.96491
[115]	train-auc:0.99016	valid-auc:0.96496
[116]	train-auc:0.99028	valid-auc:0.96496
[117]	train-auc:0.99031	valid-auc:0.96502
[118]	train-auc:0.99050	valid-auc:0.96526
[119]	train-auc:0.99065	valid-auc:0.96529
[120]	train-auc:0.99074	valid-auc:0.96529
[121]	train-auc:0.99085	valid-auc:0.96528
[122]	train-auc:0.99102	valid-auc:0

[58]	train-auc:0.99542	valid-auc:0.96302
[59]	train-auc:0.99584	valid-auc:0.96060
[60]	train-auc:0.99635	valid-auc:0.95990
[61]	train-auc:0.99646	valid-auc:0.96271
[62]	train-auc:0.99675	valid-auc:0.96337
[63]	train-auc:0.99675	valid-auc:0.96244
[64]	train-auc:0.99707	valid-auc:0.96131
[65]	train-auc:0.99733	valid-auc:0.96454
[66]	train-auc:0.99765	valid-auc:0.96514
[67]	train-auc:0.99783	valid-auc:0.96582
[68]	train-auc:0.99787	valid-auc:0.96522
[69]	train-auc:0.99788	valid-auc:0.96484
[70]	train-auc:0.99814	valid-auc:0.96534
[71]	train-auc:0.99826	valid-auc:0.96645
[72]	train-auc:0.99840	valid-auc:0.96690
[73]	train-auc:0.99846	valid-auc:0.96599
[74]	train-auc:0.99850	valid-auc:0.96688
[75]	train-auc:0.99853	valid-auc:0.96907
[76]	train-auc:0.99872	valid-auc:0.97122
[77]	train-auc:0.99878	valid-auc:0.97155
[78]	train-auc:0.99888	valid-auc:0.97122
[79]	train-auc:0.99889	valid-auc:0.97192
[80]	train-auc:0.99892	valid-auc:0.97303
[81]	train-auc:0.99903	valid-auc:0.97273
[82]	train-auc:0

[155]	train-auc:0.98902	valid-auc:0.95354
[156]	train-auc:0.98913	valid-auc:0.95367
[157]	train-auc:0.98918	valid-auc:0.95371
[158]	train-auc:0.98926	valid-auc:0.95385
[159]	train-auc:0.98938	valid-auc:0.95378
[160]	train-auc:0.98944	valid-auc:0.95376
[161]	train-auc:0.98953	valid-auc:0.95381
[162]	train-auc:0.98964	valid-auc:0.95389
[163]	train-auc:0.98979	valid-auc:0.95405
[164]	train-auc:0.98988	valid-auc:0.95411
[165]	train-auc:0.98997	valid-auc:0.95416
[166]	train-auc:0.99002	valid-auc:0.95417
[167]	train-auc:0.99012	valid-auc:0.95424
[168]	train-auc:0.99020	valid-auc:0.95433
[169]	train-auc:0.99026	valid-auc:0.95431
[170]	train-auc:0.99038	valid-auc:0.95426
[171]	train-auc:0.99047	valid-auc:0.95422
[172]	train-auc:0.99051	valid-auc:0.95424
[173]	train-auc:0.99060	valid-auc:0.95428
[174]	train-auc:0.99069	valid-auc:0.95420
[175]	train-auc:0.99082	valid-auc:0.95436
[176]	train-auc:0.99089	valid-auc:0.95437
[177]	train-auc:0.99100	valid-auc:0.95441
[178]	train-auc:0.99106	valid-auc:

In [9]:
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('sample_submission_xgb.csv', index=False)