In [1]:
import numpy as np
import pandas as pd
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import multiprocessing

In [2]:
dtypes = {
    'comment_text'   : np.unicode ,
    'toxic':         np.int16, 
    'severe_toxic': np.int16,
    'obscene': np.int16,
    'threat': np.int16,
    'insult': np.int16,
    'identity_hate': np.int16
}

train = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/train.csv', dtype=dtypes, encoding='utf-8')
test = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/test.csv', dtype=dtypes, encoding='utf-8')

train.comment_text.fillna("unknown", inplace=True)
test.comment_text.fillna("unknown",  inplace=True)

subm = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"].values.astype(str)}, dtype=np.str)

In [3]:
# Tokenization

re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

### FastText

In [4]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],
                                                          train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
                                                          test_size=0.2, random_state=42)
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [5]:
# load the FastText vectors in a dictionary:
from tqdm import tqdm

embeddings_index_FastText = {}
f = open('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Datasets/crawl-300d-2M.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0jhng .]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index_FastText[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index_FastText))

1999996it [07:35, 4387.86it/s]


Found 1999996 word vectors.


In [6]:
# Function to create a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower()
    words = tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [7]:
gc.collect()

# Create sentence vectors using the above function for training and validation set
xtrain_FastText = [sent2vec(x, embeddings_index_FastText) for x in train_mes['comment_text']]
xvalid_FastText = [sent2vec(x, embeddings_index_FastText) for x in valid_mes['comment_text']]

xtrain_FastText = np.array(xtrain_FastText)
xvalid_FastText = np.array(xvalid_FastText)

print("xtrain_FastText.shape = ", xtrain_FastText.shape)
print("xvalid_FastText.shape = ", xvalid_FastText.shape)
 
# Generate Word vectors of test data
xtest_FastText = [sent2vec(x, embeddings_index_FastText) for x in test['comment_text']]
xtest_FastText = np.array(xtest_FastText)

print("xtest_FastText.shape = ", xtest_FastText.shape)

xtrain_FastText.shape =  (127656, 300)
xvalid_FastText.shape =  (31915, 300)
xtest_FastText.shape =  (153164, 300)


### SVM Classifier

In [8]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SGDClassifier(loss='log', max_iter=1000, epsilon=0.001, n_jobs=-1, class_weight='balanced')

    cv_score = np.mean(cross_val_score(classifier, xtrain_FastText, train_l[class_name], cv=5, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_FastText, train_l[class_name])
    
    val_score = classifier.score(xvalid_FastText, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

fit toxic
CV score for class toxic is 0.7967549344738636
Validation score for class toxic is 0.758765470781764


In [8]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SGDClassifier(loss='log', max_iter=1000, epsilon=0.001, n_jobs=-1, class_weight='balanced')

    cv_score = np.mean(cross_val_score(classifier, xtrain_FastText, train_l[class_name], cv=5, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_FastText, train_l[class_name])
    
    val_score = classifier.score(xvalid_FastText, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

fit toxic
CV score for class toxic is 0.7967549344738636
Validation score for class toxic is 0.758765470781764


### Logistic Regression Classifier

In [10]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = LogisticRegression(C=0.1, solver='sag', class_weight='balanced', max_iter=1000)
    
    cv_score = np.mean(cross_val_score(classifier, xtrain_FastText, train_l[class_name], cv=5, scoring='roc_auc'))
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_FastText, train_l[class_name])
    
    val_score = classifier.score(xvalid_FastText, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

fit toxic
CV score for class toxic is 0.7969993287759112
Validation score for class toxic is 0.7860567131442895


### XGBoost Classifier

In [11]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'valid') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [12]:
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
col = ['toxic']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(xtrain_FastText, train_l[j], xvalid_FastText, valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(xtest_FastText), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-auc:0.77959	valid-auc:0.76944
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.79937	valid-auc:0.78615
[2]	train-auc:0.80626	valid-auc:0.79430
[3]	train-auc:0.81165	valid-auc:0.79850
[4]	train-auc:0.81605	valid-auc:0.80091
[5]	train-auc:0.81868	valid-auc:0.80310
[6]	train-auc:0.82023	valid-auc:0.80477
[7]	train-auc:0.82232	valid-auc:0.80581
[8]	train-auc:0.82499	valid-auc:0.80657
[9]	train-auc:0.82614	valid-auc:0.80772
[10]	train-auc:0.82742	valid-auc:0.80840
[11]	train-auc:0.82886	valid-auc:0.80893
[12]	train-auc:0.83044	valid-auc:0.80936
[13]	train-auc:0.83203	valid-auc:0.81109
[14]	train-auc:0.83349	valid-auc:0.81186
[15]	train-auc:0.83542	valid-auc:0.81312
[16]	train-auc:0.83679	valid-auc:0.81397
[17]	train-auc:0.83806	valid-auc:0.81423
[18]	train-auc:0.83949	valid-auc:0.81507
[19]	train-auc:0.84084	valid-auc:0.81635
[20]	train-auc:0.84191	valid-auc:0.81

[195]	train-auc:0.94908	valid-auc:0.84937
[196]	train-auc:0.94937	valid-auc:0.84936
[197]	train-auc:0.94969	valid-auc:0.84937
[198]	train-auc:0.94994	valid-auc:0.84937
[199]	train-auc:0.95011	valid-auc:0.84934
[200]	train-auc:0.95038	valid-auc:0.84957
[201]	train-auc:0.95072	valid-auc:0.84949
[202]	train-auc:0.95110	valid-auc:0.84945
[203]	train-auc:0.95142	valid-auc:0.84944
[204]	train-auc:0.95153	valid-auc:0.84952
[205]	train-auc:0.95182	valid-auc:0.84953
[206]	train-auc:0.95218	valid-auc:0.84962
[207]	train-auc:0.95246	valid-auc:0.84968
[208]	train-auc:0.95281	valid-auc:0.84967
[209]	train-auc:0.95323	valid-auc:0.84962
Stopping. Best iteration:
[189]	train-auc:0.94710	valid-auc:0.84969



In [13]:
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('sample_submission_FastText_xgb.csv', index=False)