In [1]:
import numpy as np
import pandas as pd
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import multiprocessing

In [2]:
dtypes = {
    'comment_text'   : np.unicode ,
    'toxic':         np.int16, 
    'severe_toxic': np.int16,
    'obscene': np.int16,
    'threat': np.int16,
    'insult': np.int16,
    'identity_hate': np.int16
}

train = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/train.csv', dtype=dtypes, encoding='utf-8')
test = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/test.csv', dtype=dtypes, encoding='utf-8')

train.comment_text.fillna("unknown", inplace=True)
test.comment_text.fillna("unknown",  inplace=True)

subm = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"].values.astype(str)}, dtype=np.str)

In [3]:
# Tokenization

re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

### GloVe Word Vectors

In [4]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],
                                                          train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
                                                          test_size=0.2, random_state=42)
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [5]:
# load the GloVe vectors in a dictionary:
from tqdm import tqdm

embeddings_index = {}
f = open('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/glove.6B.300d.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

400000it [01:39, 4007.86it/s]

Found 400000 word vectors.





In [6]:
# Function to create a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower()
    words = tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [7]:
gc.collect()

# Create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x, embeddings_index) for x in train_mes['comment_text']]
xvalid_glove = [sent2vec(x, embeddings_index) for x in valid_mes['comment_text']]

xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

print("xtrain_glove.shape = ", xtrain_glove.shape)
print("xvalid_glove.shape = ", xvalid_glove.shape)

# Generate Word vectors of test data
xtest_glove = [sent2vec(x, embeddings_index) for x in test['comment_text']]
xtest_glove = np.array(xtest_glove)

print("xtest_glove.shape = ", xtest_glove.shape)

xtrain_glove.shape =  (127656, 300)
xvalid_glove.shape =  (31915, 300)
xtest_glove.shape =  (153164, 300)


### SVM Classifier

In [9]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SGDClassifier(loss='hinge', max_iter=1000, epsilon=0.001, n_jobs=-1, class_weight='balanced')

    cv_score = np.mean(cross_val_score(classifier, xtrain_glove, train_l[class_name], cv=5, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_glove, train_l[class_name])
    
    val_score = classifier.score(xvalid_glove, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

fit toxic
CV score for class toxic is 0.8019603686359165
Validation score for class toxic is 0.8407018643271189


### Logistic Regression Classifier

In [10]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']

preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = LogisticRegression(C=0.1, solver='sag', class_weight='balanced', max_iter=1000)

    cv_score = np.mean(cross_val_score(classifier, xtrain_glove, train_l[class_name], cv=5, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_glove, train_l[class_name])
    
    val_score = classifier.score(xvalid_glove, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

fit toxic
CV score for class toxic is 0.7948130943160403
Validation score for class toxic is 0.7858687137709541


### XGBoost Classifier

In [11]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'valid') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [12]:
#col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']

preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(xtrain_glove, train_l[j], xvalid_glove, valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(xtest_glove), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-auc:0.77183	valid-auc:0.76191
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.79460	valid-auc:0.78366
[2]	train-auc:0.80376	valid-auc:0.79042
[3]	train-auc:0.80897	valid-auc:0.79461
[4]	train-auc:0.81179	valid-auc:0.79642
[5]	train-auc:0.81391	valid-auc:0.79725
[6]	train-auc:0.81632	valid-auc:0.79908
[7]	train-auc:0.81831	valid-auc:0.80098
[8]	train-auc:0.82029	valid-auc:0.80279
[9]	train-auc:0.82166	valid-auc:0.80347
[10]	train-auc:0.82321	valid-auc:0.80401
[11]	train-auc:0.82420	valid-auc:0.80505
[12]	train-auc:0.82610	valid-auc:0.80594
[13]	train-auc:0.82753	valid-auc:0.80681
[14]	train-auc:0.82893	valid-auc:0.80813
[15]	train-auc:0.83034	valid-auc:0.80803
[16]	train-auc:0.83161	valid-auc:0.80883
[17]	train-auc:0.83289	valid-auc:0.80936
[18]	train-auc:0.83432	valid-auc:0.81016
[19]	train-auc:0.83593	valid-auc:0.81071
[20]	train-auc:0.83703	valid-auc:0.81

[195]	train-auc:0.94733	valid-auc:0.84846
[196]	train-auc:0.94773	valid-auc:0.84848
[197]	train-auc:0.94802	valid-auc:0.84842
[198]	train-auc:0.94833	valid-auc:0.84835
[199]	train-auc:0.94856	valid-auc:0.84844
[200]	train-auc:0.94878	valid-auc:0.84826
[201]	train-auc:0.94918	valid-auc:0.84808
[202]	train-auc:0.94948	valid-auc:0.84811
[203]	train-auc:0.94973	valid-auc:0.84812
[204]	train-auc:0.94995	valid-auc:0.84825
[205]	train-auc:0.95032	valid-auc:0.84843
[206]	train-auc:0.95056	valid-auc:0.84827
[207]	train-auc:0.95089	valid-auc:0.84827
[208]	train-auc:0.95122	valid-auc:0.84850
[209]	train-auc:0.95154	valid-auc:0.84845
[210]	train-auc:0.95189	valid-auc:0.84855
[211]	train-auc:0.95216	valid-auc:0.84839
[212]	train-auc:0.95246	valid-auc:0.84848
[213]	train-auc:0.95274	valid-auc:0.84857
[214]	train-auc:0.95304	valid-auc:0.84856
[215]	train-auc:0.95339	valid-auc:0.84845
[216]	train-auc:0.95364	valid-auc:0.84856
[217]	train-auc:0.95398	valid-auc:0.84865
[218]	train-auc:0.95437	valid-auc:

In [13]:
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('sample_submission_glove_xgb.csv', index=False)