In [1]:
import numpy as np
import pandas as pd
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import multiprocessing

In [2]:
dtypes = {
    'comment_text'   : np.unicode ,
    'toxic':         np.int16, 
    'severe_toxic': np.int16,
    'obscene': np.int16,
    'threat': np.int16,
    'insult': np.int16,
    'identity_hate': np.int16
}

train = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/train.csv', dtype=dtypes, encoding='utf-8')
test = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/test.csv', dtype=dtypes, encoding='utf-8')

train.comment_text.fillna("unknown", inplace=True)
test.comment_text.fillna("unknown",  inplace=True)

subm = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"].values.astype(str)}, dtype=np.str)

In [3]:
# Tokenization

re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

In [4]:
# # These features are borrowed from https://www.kaggle.com/eikedehling/feature-engineering

# for element in data:
#     element['total_length'] = element['comment_text'].apply(len)
#     element['capitals'] = element['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
#     element['caps_vs_length'] = element.apply(lambda row: float(row['capitals'])/float(row['total_length']), axis=1)
#     element['num_exclamation_marks'] = element['comment_text'].apply(lambda comment: comment.count('!'))
#     element['num_question_marks'] = element['comment_text'].apply(lambda comment: comment.count('?'))
#     element['num_punctuation'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
#     element['num_symbols'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
#     element['num_words'] = element['comment_text'].apply(lambda comment: len(comment.split()))
#     element['num_unique_words'] = element['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
#     element['words_vs_unique'] = element['num_unique_words'] / element['num_words']
#     element['num_smilies'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    
# col = ['total_length', 'capitals', 'caps_vs_length',
#        'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
#        'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
#        'num_smilies']

# columns = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

### GloVe Word Vectors

In [5]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],
                                                          train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
                                                          test_size=0.2, random_state=42)
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [6]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/glove.6B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [7]:
# Function to create a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower()
    words = tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [8]:
gc.collect()

# Create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x, embeddings_index) for x in train_mes['comment_text']]
xvalid_glove = [sent2vec(x, embeddings_index) for x in valid_mes['comment_text']]

xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

print("xtrain_glove.shape = ", xtrain_glove.shape)
print("xvalid_glove.shape = ", xvalid_glove.shape)

# Generate Word vectors of test data
xtest_glove = [sent2vec(x, embeddings_index) for x in test['comment_text']]
xtest_glove = np.array(xtest_glove)

xtrain_glove.shape =  (127656, 300)
xvalid_glove.shape =  (31915, 300)


### SGD Classifier

In [9]:
scores = []
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SGDClassifier(loss='log', max_iter=1000, epsilon=0.001, n_jobs=-1)

    cv_score = np.mean(cross_val_score(classifier, xtrain_glove, train_l[class_name], cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_glove, train_l[class_name])
    preds[:,i] = classifier.predict_proba(xvalid_glove)[:,1]
    
print('Total CV score is {}'.format(np.mean(scores)))

fit toxic
CV score for class toxic is 0.7774531814868076
fit severe_toxic
CV score for class severe_toxic is 0.9029774888695542
fit obscene
CV score for class obscene is 0.8209661656956003
fit threat
CV score for class threat is 0.8490649348881352
fit insult
CV score for class insult is 0.8179873710268509
fit identity_hate
CV score for class identity_hate is 0.8150334304469109
Total CV score is 0.8305804287356432


In [10]:
testpreds_glove = np.zeros((test.shape[0], len(col)))
for i, class_name in enumerate(col):
    print('fit '+ class_name)
    testpreds_glove[:,i] = classifier.predict_proba(xtest_glove)[:,1]
    
submission = pd.concat([submid, pd.DataFrame(testpreds_glove, columns = col)], axis=1)
submission.to_csv('sample_submission_glove_sgd.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


### Logistic Regression Classifier

In [11]:
scores = []
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = LogisticRegression(C=0.1, solver='sag')
#    classifier = SVC(C=1.0, probability=True)

    cv_score = np.mean(cross_val_score(classifier, xtrain_glove, train_l[class_name], cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_glove, train_l[class_name])
    preds[:,i] = classifier.predict_proba(xvalid_glove)[:,1]
    
print('Total CV score is {}'.format(np.mean(scores)))

fit toxic
CV score for class toxic is 0.7777078113054194
fit severe_toxic
CV score for class severe_toxic is 0.9031895790677194
fit obscene
CV score for class obscene is 0.8212190814446563
fit threat
CV score for class threat is 0.8493295229506594
fit insult
CV score for class insult is 0.81829666476496
fit identity_hate
CV score for class identity_hate is 0.8151084490817322
Total CV score is 0.8308085181025243


In [12]:
testpreds_glove = np.zeros((test.shape[0], len(col)))
for i, class_name in enumerate(col):
    print('fit '+ class_name)
    testpreds_glove[:,i] = classifier.predict_proba(xtest_glove)[:,1]
    
submission = pd.concat([submid, pd.DataFrame(testpreds_glove, columns = col)], axis=1)
submission.to_csv('sample_submission_glove_logit.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


### XGBoost Classifier

In [13]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'valid') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [14]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(xtrain_glove, train_l[j], xvalid_glove, valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(xtest_glove), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-auc:0.77183	valid-auc:0.76191
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.79460	valid-auc:0.78366
[2]	train-auc:0.80376	valid-auc:0.79042
[3]	train-auc:0.80897	valid-auc:0.79461
[4]	train-auc:0.81179	valid-auc:0.79642
[5]	train-auc:0.81391	valid-auc:0.79725
[6]	train-auc:0.81632	valid-auc:0.79908
[7]	train-auc:0.81831	valid-auc:0.80098
[8]	train-auc:0.82029	valid-auc:0.80279
[9]	train-auc:0.82166	valid-auc:0.80347
[10]	train-auc:0.82321	valid-auc:0.80401
[11]	train-auc:0.82420	valid-auc:0.80505
[12]	train-auc:0.82610	valid-auc:0.80594
[13]	train-auc:0.82753	valid-auc:0.80681
[14]	train-auc:0.82893	valid-auc:0.80813
[15]	train-auc:0.83034	valid-auc:0.80803
[16]	train-auc:0.83161	valid-auc:0.80883
[17]	train-auc:0.83289	valid-auc:0.80936
[18]	train-auc:0.83432	valid-auc:0.81016
[19]	train-auc:0.83593	valid-auc:0.81071
[20]	train-auc:0.83703	valid-auc:0.81

[195]	train-auc:0.94733	valid-auc:0.84846
[196]	train-auc:0.94773	valid-auc:0.84848
[197]	train-auc:0.94802	valid-auc:0.84842
[198]	train-auc:0.94833	valid-auc:0.84835
[199]	train-auc:0.94856	valid-auc:0.84844
[200]	train-auc:0.94878	valid-auc:0.84826
[201]	train-auc:0.94918	valid-auc:0.84808
[202]	train-auc:0.94948	valid-auc:0.84811
[203]	train-auc:0.94973	valid-auc:0.84812
[204]	train-auc:0.94995	valid-auc:0.84825
[205]	train-auc:0.95032	valid-auc:0.84843
[206]	train-auc:0.95056	valid-auc:0.84827
[207]	train-auc:0.95089	valid-auc:0.84827
[208]	train-auc:0.95122	valid-auc:0.84850
[209]	train-auc:0.95154	valid-auc:0.84845
[210]	train-auc:0.95189	valid-auc:0.84855
[211]	train-auc:0.95216	valid-auc:0.84839
[212]	train-auc:0.95246	valid-auc:0.84848
[213]	train-auc:0.95274	valid-auc:0.84857
[214]	train-auc:0.95304	valid-auc:0.84856
[215]	train-auc:0.95339	valid-auc:0.84845
[216]	train-auc:0.95364	valid-auc:0.84856
[217]	train-auc:0.95398	valid-auc:0.84865
[218]	train-auc:0.95437	valid-auc:

[21]	train-auc:0.87507	valid-auc:0.84523
[22]	train-auc:0.87655	valid-auc:0.84609
[23]	train-auc:0.87816	valid-auc:0.84740
[24]	train-auc:0.88004	valid-auc:0.84882
[25]	train-auc:0.88144	valid-auc:0.84946
[26]	train-auc:0.88229	valid-auc:0.85014
[27]	train-auc:0.88447	valid-auc:0.85120
[28]	train-auc:0.88583	valid-auc:0.85206
[29]	train-auc:0.88700	valid-auc:0.85271
[30]	train-auc:0.88911	valid-auc:0.85347
[31]	train-auc:0.89047	valid-auc:0.85407
[32]	train-auc:0.89168	valid-auc:0.85488
[33]	train-auc:0.89324	valid-auc:0.85540
[34]	train-auc:0.89488	valid-auc:0.85640
[35]	train-auc:0.89636	valid-auc:0.85750
[36]	train-auc:0.89811	valid-auc:0.85833
[37]	train-auc:0.89965	valid-auc:0.85883
[38]	train-auc:0.90105	valid-auc:0.85967
[39]	train-auc:0.90231	valid-auc:0.86042
[40]	train-auc:0.90318	valid-auc:0.86082
[41]	train-auc:0.90470	valid-auc:0.86207
[42]	train-auc:0.90604	valid-auc:0.86281
[43]	train-auc:0.90751	valid-auc:0.86381
[44]	train-auc:0.90896	valid-auc:0.86458
[45]	train-auc:0

[16]	train-auc:0.87148	valid-auc:0.83898
[17]	train-auc:0.87260	valid-auc:0.83905
[18]	train-auc:0.87393	valid-auc:0.84082
[19]	train-auc:0.87495	valid-auc:0.84126
[20]	train-auc:0.87606	valid-auc:0.84149
[21]	train-auc:0.87802	valid-auc:0.84197
[22]	train-auc:0.88009	valid-auc:0.84177
[23]	train-auc:0.88166	valid-auc:0.84274
[24]	train-auc:0.88300	valid-auc:0.84391
[25]	train-auc:0.88446	valid-auc:0.84575
[26]	train-auc:0.88570	valid-auc:0.84657
[27]	train-auc:0.88694	valid-auc:0.84705
[28]	train-auc:0.88763	valid-auc:0.84730
[29]	train-auc:0.88905	valid-auc:0.84791
[30]	train-auc:0.89063	valid-auc:0.84878
[31]	train-auc:0.89203	valid-auc:0.85027
[32]	train-auc:0.89328	valid-auc:0.85113
[33]	train-auc:0.89448	valid-auc:0.85198
[34]	train-auc:0.89589	valid-auc:0.85285
[35]	train-auc:0.89746	valid-auc:0.85318
[36]	train-auc:0.89909	valid-auc:0.85388
[37]	train-auc:0.90029	valid-auc:0.85443
[38]	train-auc:0.90171	valid-auc:0.85533
[39]	train-auc:0.90307	valid-auc:0.85584
[40]	train-auc:0

[214]	train-auc:0.98314	valid-auc:0.87649
[215]	train-auc:0.98336	valid-auc:0.87635
[216]	train-auc:0.98356	valid-auc:0.87634
[217]	train-auc:0.98370	valid-auc:0.87624
[218]	train-auc:0.98384	valid-auc:0.87601
[219]	train-auc:0.98403	valid-auc:0.87584
[220]	train-auc:0.98414	valid-auc:0.87585
[221]	train-auc:0.98431	valid-auc:0.87592
[222]	train-auc:0.98448	valid-auc:0.87577
[223]	train-auc:0.98466	valid-auc:0.87574
[224]	train-auc:0.98490	valid-auc:0.87577
[225]	train-auc:0.98524	valid-auc:0.87583
[226]	train-auc:0.98536	valid-auc:0.87610
Stopping. Best iteration:
[206]	train-auc:0.98156	valid-auc:0.87693

fit identity_hate
[0]	train-auc:0.76559	valid-auc:0.71291
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.80071	valid-auc:0.74973
[2]	train-auc:0.81587	valid-auc:0.75536
[3]	train-auc:0.81795	valid-auc:0.75788
[4]	train-auc:0.82725	valid-auc:0.76088
[5]	train-auc:0.82792	va

In [15]:
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('sample_submission_glove_xgb.csv', index=False)