In [1]:
import numpy as np
import pandas as pd
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import multiprocessing

In [2]:
dtypes = {
    'comment_text'   : np.unicode ,
    'toxic':         np.int16, 
    'severe_toxic': np.int16,
    'obscene': np.int16,
    'threat': np.int16,
    'insult': np.int16,
    'identity_hate': np.int16
}

train = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/train.csv', dtype=dtypes, encoding='utf-8')
test = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/test.csv', dtype=dtypes, encoding='utf-8')

train.comment_text.fillna("unknown", inplace=True)
test.comment_text.fillna("unknown",  inplace=True)

subm = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"].values.astype(str)}, dtype=np.str)

In [3]:
# Tokenization

re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

### Google Word2Vec

In [4]:
# # Convertion of .bin to .txt file
# from gensim.models.keyedvectors import KeyedVectors

# model = KeyedVectors.load_word2vec_format('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Datasets/GoogleNews-vectors-negative300.bin', binary=True)
# model.save_word2vec_format('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/GoogleNews-vectors-negative300.txt', binary=False)

In [5]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],
                                                          train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
                                                          test_size=0.2, random_state=42)
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [6]:
# load the Google word2vec vectors in a dictionary:
from tqdm import tqdm

embeddings_index_google = {}
f = open('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/GoogleNews-vectors-negative300.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index_google[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index_google))

3000001it [05:55, 8430.19it/s]

Found 3000000 word vectors.





In [7]:
# Function to create a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower()
    words = tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [8]:
gc.collect()

# Create sentence vectors using the above function for training and validation set
xtrain_google = [sent2vec(x, embeddings_index_google) for x in train_mes['comment_text']]
xvalid_google = [sent2vec(x, embeddings_index_google) for x in valid_mes['comment_text']]

xtrain_google = np.array(xtrain_google)
xvalid_google = np.array(xvalid_google)

print("xtrain_google.shape = ", xtrain_google.shape)
print("xvalid_google.shape = ", xvalid_google.shape)
 
# Generate Word vectors of test data
xtest_google = [sent2vec(x, embeddings_index_google) for x in test['comment_text']]
xtest_google = np.array(xtest_google)

xtrain_google.shape =  (127656, 300)
xvalid_google.shape =  (31915, 300)


### SGD Classifier

In [12]:
scores = []
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SGDClassifier(loss='log', max_iter=1000, epsilon=0.001, n_jobs=-1)

    cv_score = np.mean(cross_val_score(classifier, xtrain_google, train_l[class_name], cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_google, train_l[class_name])
    preds[:,i] = classifier.predict_proba(xvalid_google)[:,1]
    
    val_score = classifier.score(xvalid_google, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))
    
print('\nTotal CV score is {}'.format(np.mean(scores)))

fit toxic
CV score for class toxic is 0.7786467063271928
Validation score for class toxic is 0.9083816387278709
fit severe_toxic
CV score for class severe_toxic is 0.9124718212335095
Validation score for class severe_toxic is 0.989942033526555
fit obscene
CV score for class obscene is 0.8248545995381784
Validation score for class obscene is 0.9483941720194266
fit threat
CV score for class threat is 0.866430355253675
Validation score for class threat is 0.9976813410621964
fit insult
CV score for class insult is 0.8221718870514358
Validation score for class insult is 0.9505561648127839
fit identity_hate
CV score for class identity_hate is 0.8154681296369934
Validation score for class identity_hate is 0.9907880307065643

Total CV score is 0.8366739165068308


In [13]:
testpreds_google = np.zeros((test.shape[0], len(col)))
for i, class_name in enumerate(col):
    print('fit '+ class_name)
    testpreds_google[:,i] = classifier.predict_proba(xtest_google)[:,1]
    
submission = pd.concat([submid, pd.DataFrame(testpreds_google, columns = col)], axis=1)
submission.to_csv('sample_submission_google_sgd.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


### Logistic Regression Classifier

In [14]:
scores = []
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = LogisticRegression(C=0.1, solver='sag')
#    classifier = SVC(C=1.0, probability=True)

    cv_score = np.mean(cross_val_score(classifier, xtrain_google, train_l[class_name], cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_google, train_l[class_name])
    preds[:,i] = classifier.predict_proba(xvalid_google)[:,1]
    
    val_score = classifier.score(xvalid_google, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

print('\nTotal CV score is {}'.format(np.mean(scores)))

fit toxic
CV score for class toxic is 0.7788775217940127
Validation score for class toxic is 0.9090396365345449
fit severe_toxic
CV score for class severe_toxic is 0.9125470180252113
Validation score for class severe_toxic is 0.989942033526555
fit obscene
CV score for class obscene is 0.8250653768834013
Validation score for class obscene is 0.9488641704527652
fit threat
CV score for class threat is 0.8665649806444659
Validation score for class threat is 0.9976813410621964
fit insult
CV score for class insult is 0.8224004324908399
Validation score for class insult is 0.9505561648127839
fit identity_hate
CV score for class identity_hate is 0.8155420763991653
Validation score for class identity_hate is 0.9907880307065643

Total CV score is 0.836832901039516


In [15]:
testpreds_google = np.zeros((test.shape[0], len(col)))
for i, class_name in enumerate(col):
    print('fit '+ class_name)
    testpreds_google[:,i] = classifier.predict_proba(xtest_google)[:,1]
    
submission = pd.concat([submid, pd.DataFrame(testpreds_google, columns = col)], axis=1)
submission.to_csv('sample_submission_google_logit.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [None]:
scores = []
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SVC(C=1.0, probability=True)

    cv_score = np.mean(cross_val_score(classifier, xtrain_google, train_l[class_name], cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_google, train_l[class_name])
    preds[:,i] = classifier.predict_proba(xvalid_google)[:,1]
    
    val_score = classifier.score(xvalid_google, valid_l[class_name])
    print('Validation score for class {} is {}'.format(class_name, val_score))

print('Total CV score is {}'.format(np.mean(scores)))

fit toxic


In [None]:
testpreds_google = np.zeros((test.shape[0], len(col)))
for i, class_name in enumerate(col):
    print('fit '+ class_name)
    testpreds_google[:,i] = classifier.predict_proba(xtest_google)[:,1]
    test_score = classifier.score(xtest_google)
    
submission = pd.concat([submid, pd.DataFrame(testpreds_google, columns = col)], axis=1)
submission.to_csv('sample_submission_google_svc.csv', index=False)

### XGBoost Classifier

In [15]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'valid') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [16]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(xtrain_google, train_l[j], xvalid_google, valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(xtest_google), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-auc:0.77237	valid-auc:0.76232
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.79239	valid-auc:0.77890
[2]	train-auc:0.79949	valid-auc:0.78562
[3]	train-auc:0.80285	valid-auc:0.78797
[4]	train-auc:0.80881	valid-auc:0.79109
[5]	train-auc:0.81184	valid-auc:0.79236
[6]	train-auc:0.81453	valid-auc:0.79402
[7]	train-auc:0.81656	valid-auc:0.79526
[8]	train-auc:0.81886	valid-auc:0.79620
[9]	train-auc:0.82152	valid-auc:0.79765
[10]	train-auc:0.82240	valid-auc:0.79817
[11]	train-auc:0.82398	valid-auc:0.79936
[12]	train-auc:0.82595	valid-auc:0.80060
[13]	train-auc:0.82786	valid-auc:0.80204
[14]	train-auc:0.82911	valid-auc:0.80300
[15]	train-auc:0.83077	valid-auc:0.80427
[16]	train-auc:0.83280	valid-auc:0.80630
[17]	train-auc:0.83425	valid-auc:0.80722
[18]	train-auc:0.83605	valid-auc:0.80855
[19]	train-auc:0.83673	valid-auc:0.80883
[20]	train-auc:0.83820	valid-auc:0.80

[195]	train-auc:0.94898	valid-auc:0.84482
[196]	train-auc:0.94911	valid-auc:0.84494
[197]	train-auc:0.94958	valid-auc:0.84485
[198]	train-auc:0.94998	valid-auc:0.84492
[199]	train-auc:0.95020	valid-auc:0.84493
[200]	train-auc:0.95049	valid-auc:0.84509
[201]	train-auc:0.95071	valid-auc:0.84520
[202]	train-auc:0.95113	valid-auc:0.84506
[203]	train-auc:0.95129	valid-auc:0.84514
[204]	train-auc:0.95166	valid-auc:0.84515
[205]	train-auc:0.95193	valid-auc:0.84533
[206]	train-auc:0.95225	valid-auc:0.84537
[207]	train-auc:0.95253	valid-auc:0.84541
[208]	train-auc:0.95293	valid-auc:0.84544
[209]	train-auc:0.95339	valid-auc:0.84536
[210]	train-auc:0.95356	valid-auc:0.84549
[211]	train-auc:0.95383	valid-auc:0.84545
[212]	train-auc:0.95418	valid-auc:0.84555
[213]	train-auc:0.95452	valid-auc:0.84555
[214]	train-auc:0.95471	valid-auc:0.84547
[215]	train-auc:0.95506	valid-auc:0.84551
[216]	train-auc:0.95531	valid-auc:0.84559
[217]	train-auc:0.95550	valid-auc:0.84567
[218]	train-auc:0.95583	valid-auc:

[4]	train-auc:0.85168	valid-auc:0.84024
[5]	train-auc:0.85398	valid-auc:0.84140
[6]	train-auc:0.85449	valid-auc:0.84181
[7]	train-auc:0.85572	valid-auc:0.84276
[8]	train-auc:0.85944	valid-auc:0.84526
[9]	train-auc:0.86047	valid-auc:0.84620
[10]	train-auc:0.86273	valid-auc:0.84693
[11]	train-auc:0.86461	valid-auc:0.84869
[12]	train-auc:0.86594	valid-auc:0.84888
[13]	train-auc:0.86808	valid-auc:0.85109
[14]	train-auc:0.86881	valid-auc:0.85138
[15]	train-auc:0.87016	valid-auc:0.85238
[16]	train-auc:0.87224	valid-auc:0.85353
[17]	train-auc:0.87355	valid-auc:0.85430
[18]	train-auc:0.87466	valid-auc:0.85477
[19]	train-auc:0.87496	valid-auc:0.85492
[20]	train-auc:0.87635	valid-auc:0.85531
[21]	train-auc:0.87754	valid-auc:0.85544
[22]	train-auc:0.87891	valid-auc:0.85609
[23]	train-auc:0.88053	valid-auc:0.85628
[24]	train-auc:0.88216	valid-auc:0.85758
[25]	train-auc:0.88394	valid-auc:0.85852
[26]	train-auc:0.88513	valid-auc:0.85957
[27]	train-auc:0.88675	valid-auc:0.85996
[28]	train-auc:0.88857

[72]	train-auc:0.99508	valid-auc:0.92803
[73]	train-auc:0.99531	valid-auc:0.92951
[74]	train-auc:0.99548	valid-auc:0.92932
[75]	train-auc:0.99581	valid-auc:0.93168
[76]	train-auc:0.99616	valid-auc:0.93243
[77]	train-auc:0.99673	valid-auc:0.93255
[78]	train-auc:0.99730	valid-auc:0.93187
[79]	train-auc:0.99737	valid-auc:0.93268
[80]	train-auc:0.99747	valid-auc:0.93340
[81]	train-auc:0.99784	valid-auc:0.93317
[82]	train-auc:0.99788	valid-auc:0.93389
[83]	train-auc:0.99801	valid-auc:0.93355
[84]	train-auc:0.99820	valid-auc:0.93256
[85]	train-auc:0.99829	valid-auc:0.93267
[86]	train-auc:0.99837	valid-auc:0.93317
[87]	train-auc:0.99852	valid-auc:0.93346
[88]	train-auc:0.99864	valid-auc:0.93481
[89]	train-auc:0.99880	valid-auc:0.93477
[90]	train-auc:0.99888	valid-auc:0.93572
[91]	train-auc:0.99890	valid-auc:0.93622
[92]	train-auc:0.99894	valid-auc:0.93672
[93]	train-auc:0.99896	valid-auc:0.93768
[94]	train-auc:0.99905	valid-auc:0.93620
[95]	train-auc:0.99914	valid-auc:0.93578
[96]	train-auc:0

[5]	train-auc:0.82876	valid-auc:0.77454
[6]	train-auc:0.82906	valid-auc:0.77335
[7]	train-auc:0.82970	valid-auc:0.77266
[8]	train-auc:0.83369	valid-auc:0.77467
[9]	train-auc:0.83457	valid-auc:0.77613
[10]	train-auc:0.84791	valid-auc:0.78585
[11]	train-auc:0.85170	valid-auc:0.78925
[12]	train-auc:0.85482	valid-auc:0.79432
[13]	train-auc:0.85864	valid-auc:0.79876
[14]	train-auc:0.85992	valid-auc:0.80090
[15]	train-auc:0.86551	valid-auc:0.80253
[16]	train-auc:0.86888	valid-auc:0.80782
[17]	train-auc:0.87239	valid-auc:0.80703
[18]	train-auc:0.87543	valid-auc:0.80949
[19]	train-auc:0.87750	valid-auc:0.81051
[20]	train-auc:0.87870	valid-auc:0.80957
[21]	train-auc:0.88037	valid-auc:0.80897
[22]	train-auc:0.88119	valid-auc:0.80804
[23]	train-auc:0.88292	valid-auc:0.80927
[24]	train-auc:0.88559	valid-auc:0.81046
[25]	train-auc:0.88967	valid-auc:0.81090
[26]	train-auc:0.89264	valid-auc:0.81346
[27]	train-auc:0.89406	valid-auc:0.81594
[28]	train-auc:0.89740	valid-auc:0.81742
[29]	train-auc:0.8984

In [17]:
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('sample_submission_google_xgb.csv', index=False)