In [1]:
import numpy as np
import pandas as pd
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import multiprocessing

In [2]:
dtypes = {
    'comment_text'   : np.unicode ,
    'toxic':         np.int16, 
    'severe_toxic': np.int16,
    'obscene': np.int16,
    'threat': np.int16,
    'insult': np.int16,
    'identity_hate': np.int16
}

train = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/train.csv', dtype=dtypes, encoding='utf-8')
test = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/test.csv', dtype=dtypes, encoding='utf-8')

train.comment_text.fillna("unknown", inplace=True)
test.comment_text.fillna("unknown",  inplace=True)

subm = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"].values.astype(str)}, dtype=np.str)

In [3]:
# Tokenization

re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

### Google Word2Vec

In [4]:
# # Convertion of .bin to .txt file
# from gensim.models.keyedvectors import KeyedVectors

# model = KeyedVectors.load_word2vec_format('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Datasets/GoogleNews-vectors-negative300.bin', binary=True)
# model.save_word2vec_format('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/GoogleNews-vectors-negative300.txt', binary=False)

In [5]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],
                                                          train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
                                                          test_size=0.2, random_state=42)
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [6]:
# load the Google word2vec vectors in a dictionary:
from tqdm import tqdm

embeddings_index_word2vec = {}
f = open('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/GoogleNews-vectors-negative300.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index_word2vec[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index_word2vec))

3000001it [12:49, 3898.55it/s]


Found 3000000 word vectors.


In [7]:
# Function to create a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower()
    words = tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [8]:
gc.collect()

# Create sentence vectors using the above function for training and validation set
xtrain_word2vec = [sent2vec(x, embeddings_index_word2vec) for x in train_mes['comment_text']]
xvalid_word2vec = [sent2vec(x, embeddings_index_word2vec) for x in valid_mes['comment_text']]

xtrain_word2vec = np.array(xtrain_word2vec)
xvalid_word2vec = np.array(xvalid_word2vec)

print("xtrain_word2vec.shape = ", xtrain_word2vec.shape)
print("xvalid_word2vec.shape = ", xvalid_word2vec.shape)
 
# Generate Word vectors of test data
xtest_word2vec = [sent2vec(x, embeddings_index_word2vec) for x in test['comment_text']]
xtest_word2vec = np.array(xtest_word2vec)

print("xtest_word2vec.shape = ", xtest_word2vec.shape)

xtrain_word2vec.shape =  (127656, 300)
xvalid_word2vec.shape =  (31915, 300)
xtest_word2vec.shape =  (153164, 300)


### SVM Classifier

In [9]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SGDClassifier(loss='log', max_iter=1000, epsilon=0.001, n_jobs=-1, class_weight='balanced')

    cv_score = np.mean(cross_val_score(classifier, xtrain_word2vec, train_l[class_name], cv=5, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_word2vec, train_l[class_name])
    
    val_score = classifier.score(xvalid_word2vec, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

fit toxic
CV score for class toxic is 0.7929348113394405
Validation score for class toxic is 0.8205545981513395


### Logistic Regression Classifier

In [10]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = LogisticRegression(C=0.1, solver='sag', class_weight='balanced', max_iter=1000)

    cv_score = np.mean(cross_val_score(classifier, xtrain_word2vec, train_l[class_name], cv=5, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_word2vec, train_l[class_name])
    
    val_score = classifier.score(xvalid_word2vec, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

fit toxic
CV score for class toxic is 0.7931395513529272
Validation score for class toxic is 0.7824847250509165


### XGBoost Classifier

In [11]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'valid') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [12]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(xtrain_word2vec, train_l[j], xvalid_word2vec, valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(xtest_word2vec), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-auc:0.77237	valid-auc:0.76232
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.79239	valid-auc:0.77890
[2]	train-auc:0.79949	valid-auc:0.78562
[3]	train-auc:0.80285	valid-auc:0.78797
[4]	train-auc:0.80881	valid-auc:0.79109
[5]	train-auc:0.81184	valid-auc:0.79236
[6]	train-auc:0.81453	valid-auc:0.79402
[7]	train-auc:0.81656	valid-auc:0.79526
[8]	train-auc:0.81886	valid-auc:0.79620
[9]	train-auc:0.82152	valid-auc:0.79765
[10]	train-auc:0.82240	valid-auc:0.79817
[11]	train-auc:0.82398	valid-auc:0.79936
[12]	train-auc:0.82595	valid-auc:0.80060
[13]	train-auc:0.82786	valid-auc:0.80204
[14]	train-auc:0.82911	valid-auc:0.80300
[15]	train-auc:0.83077	valid-auc:0.80427
[16]	train-auc:0.83280	valid-auc:0.80630
[17]	train-auc:0.83425	valid-auc:0.80722
[18]	train-auc:0.83605	valid-auc:0.80855
[19]	train-auc:0.83673	valid-auc:0.80883
[20]	train-auc:0.83820	valid-auc:0.80

[195]	train-auc:0.94898	valid-auc:0.84482
[196]	train-auc:0.94911	valid-auc:0.84494
[197]	train-auc:0.94958	valid-auc:0.84485
[198]	train-auc:0.94998	valid-auc:0.84492
[199]	train-auc:0.95020	valid-auc:0.84493
[200]	train-auc:0.95049	valid-auc:0.84509
[201]	train-auc:0.95071	valid-auc:0.84520
[202]	train-auc:0.95113	valid-auc:0.84506
[203]	train-auc:0.95129	valid-auc:0.84514
[204]	train-auc:0.95166	valid-auc:0.84515
[205]	train-auc:0.95193	valid-auc:0.84533
[206]	train-auc:0.95225	valid-auc:0.84537
[207]	train-auc:0.95253	valid-auc:0.84541
[208]	train-auc:0.95293	valid-auc:0.84544
[209]	train-auc:0.95339	valid-auc:0.84536
[210]	train-auc:0.95356	valid-auc:0.84549
[211]	train-auc:0.95383	valid-auc:0.84545
[212]	train-auc:0.95418	valid-auc:0.84555
[213]	train-auc:0.95452	valid-auc:0.84555
[214]	train-auc:0.95471	valid-auc:0.84547
[215]	train-auc:0.95506	valid-auc:0.84551
[216]	train-auc:0.95531	valid-auc:0.84559
[217]	train-auc:0.95550	valid-auc:0.84567
[218]	train-auc:0.95583	valid-auc:

In [13]:
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('sample_submission_word2vec_xgb.csv', index=False)