In [1]:
import numpy as np
import pandas as pd
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import multiprocessing

In [2]:
dtypes = {
    'comment_text'   : np.unicode ,
    'toxic':         np.int16, 
    'severe_toxic': np.int16,
    'obscene': np.int16,
    'threat': np.int16,
    'insult': np.int16,
    'identity_hate': np.int16
}

train = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/train.csv', dtype=dtypes, encoding='utf-8')
test = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/test.csv', dtype=dtypes, encoding='utf-8')

train.comment_text.fillna("unknown", inplace=True)
test.comment_text.fillna("unknown",  inplace=True)

subm = pd.read_csv('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Data17/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"].values.astype(str)}, dtype=np.str)

In [3]:
# Tokenization

re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

### FastText

In [5]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],
                                                          train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
                                                          test_size=0.2, random_state=42)
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [6]:
# load the FastText vectors in a dictionary:
from tqdm import tqdm

embeddings_index_FastText = {}
f = open('C:/Users/abandyop/Desktop/Personal/Data Mining/Project/Datasets/crawl-300d-2M.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index_FastText[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index_FastText))

1999996it [03:19, 10014.87it/s]

Found 1999996 word vectors.





In [7]:
# Function to create a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower()
    words = tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [8]:
gc.collect()

# Create sentence vectors using the above function for training and validation set
xtrain_FastText = [sent2vec(x, embeddings_index_FastText) for x in train_mes['comment_text']]
xvalid_FastText = [sent2vec(x, embeddings_index_FastText) for x in valid_mes['comment_text']]

xtrain_FastText = np.array(xtrain_FastText)
xvalid_FastText = np.array(xvalid_FastText)

print("xtrain_FastText.shape = ", xtrain_FastText.shape)
print("xvalid_FastText.shape = ", xvalid_FastText.shape)
 
# Generate Word vectors of test data
xtest_FastText = [sent2vec(x, embeddings_index_FastText) for x in test['comment_text']]
xtest_FastText = np.array(xtest_FastText)

xtrain_FastText.shape =  (127656, 300)
xvalid_FastText.shape =  (31915, 300)


### SGD Classifier

In [9]:
scores = []
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = SGDClassifier(loss='log', max_iter=1000, epsilon=0.001, n_jobs=-1)

    cv_score = np.mean(cross_val_score(classifier, xtrain_FastText, train_l[class_name], cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_FastText, train_l[class_name])
    preds[:,i] = classifier.predict_proba(xvalid_FastText)[:,1]
    
print('Total CV score is {}'.format(np.mean(scores)))

fit toxic
CV score for class toxic is 0.778880261178883
fit severe_toxic
CV score for class severe_toxic is 0.9014068553955387
fit obscene
CV score for class obscene is 0.818231838029696
fit threat
CV score for class threat is 0.8586549413107658
fit insult
CV score for class insult is 0.8165125034213844
fit identity_hate
CV score for class identity_hate is 0.8151732642551238
Total CV score is 0.8314766105985653


In [10]:
testpreds_FastText = np.zeros((test.shape[0], len(col)))
for i, class_name in enumerate(col):
    print('fit '+ class_name)
    testpreds_FastText[:,i] = classifier.predict_proba(xtest_FastText)[:,1]
    
submission = pd.concat([submid, pd.DataFrame(testpreds_FastText, columns = col)], axis=1)
submission.to_csv('sample_submission_FastText_sgd.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


### Logistic Regression Classifier

In [11]:
scores = []
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((valid_mes.shape[0], len(col))).astype(object)

for i, class_name in enumerate(col):
    print('fit '+ class_name)
    classifier = LogisticRegression(C=0.1, solver='sag')
#    classifier = SVC(C=1.0, probability=True)

    cv_score = np.mean(cross_val_score(classifier, xtrain_FastText, train_l[class_name], cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_FastText, train_l[class_name])
    preds[:,i] = classifier.predict_proba(xvalid_FastText)[:,1]
    
print('Total CV score is {}'.format(np.mean(scores)))

fit toxic
CV score for class toxic is 0.7792949071732895
fit severe_toxic
CV score for class severe_toxic is 0.9015847215229795
fit obscene
CV score for class obscene is 0.8186233127456995
fit threat
CV score for class threat is 0.8587920060063053
fit insult
CV score for class insult is 0.8167887840205079
fit identity_hate
CV score for class identity_hate is 0.8153501588383577
Total CV score is 0.8317389817178565


In [12]:
testpreds_FastText = np.zeros((test.shape[0], len(col)))
for i, class_name in enumerate(col):
    print('fit '+ class_name)
    testpreds_FastText[:,i] = classifier.predict_proba(xtest_FastText)[:,1]
    
submission = pd.concat([submid, pd.DataFrame(testpreds_FastText, columns = col)], axis=1)
submission.to_csv('sample_submission_FastText_logit.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


### XGBoost Classifier

In [13]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'valid') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [14]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(xtrain_FastText, train_l[j], xvalid_FastText, valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(xtest_FastText), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-auc:0.77959	valid-auc:0.76944
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.79937	valid-auc:0.78615
[2]	train-auc:0.80626	valid-auc:0.79430
[3]	train-auc:0.81165	valid-auc:0.79850
[4]	train-auc:0.81605	valid-auc:0.80091
[5]	train-auc:0.81868	valid-auc:0.80310
[6]	train-auc:0.82023	valid-auc:0.80477
[7]	train-auc:0.82232	valid-auc:0.80581
[8]	train-auc:0.82499	valid-auc:0.80657
[9]	train-auc:0.82614	valid-auc:0.80772
[10]	train-auc:0.82742	valid-auc:0.80840
[11]	train-auc:0.82886	valid-auc:0.80893
[12]	train-auc:0.83044	valid-auc:0.80936
[13]	train-auc:0.83203	valid-auc:0.81109
[14]	train-auc:0.83349	valid-auc:0.81186
[15]	train-auc:0.83542	valid-auc:0.81312
[16]	train-auc:0.83679	valid-auc:0.81397
[17]	train-auc:0.83806	valid-auc:0.81423
[18]	train-auc:0.83949	valid-auc:0.81507
[19]	train-auc:0.84084	valid-auc:0.81635
[20]	train-auc:0.84191	valid-auc:0.81

[195]	train-auc:0.94908	valid-auc:0.84937
[196]	train-auc:0.94937	valid-auc:0.84936
[197]	train-auc:0.94969	valid-auc:0.84937
[198]	train-auc:0.94994	valid-auc:0.84937
[199]	train-auc:0.95011	valid-auc:0.84934
[200]	train-auc:0.95038	valid-auc:0.84957
[201]	train-auc:0.95072	valid-auc:0.84949
[202]	train-auc:0.95110	valid-auc:0.84945
[203]	train-auc:0.95142	valid-auc:0.84944
[204]	train-auc:0.95153	valid-auc:0.84952
[205]	train-auc:0.95182	valid-auc:0.84953
[206]	train-auc:0.95218	valid-auc:0.84962
[207]	train-auc:0.95246	valid-auc:0.84968
[208]	train-auc:0.95281	valid-auc:0.84967
[209]	train-auc:0.95323	valid-auc:0.84962
Stopping. Best iteration:
[189]	train-auc:0.94710	valid-auc:0.84969

fit severe_toxic
[0]	train-auc:0.85766	valid-auc:0.83768
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.89979	valid-auc:0.88532
[2]	train-auc:0.91025	valid-auc:0.89689
[3]	train-auc:0.91253

[73]	train-auc:0.93511	valid-auc:0.88040
[74]	train-auc:0.93593	valid-auc:0.88046
[75]	train-auc:0.93690	valid-auc:0.88091
[76]	train-auc:0.93778	valid-auc:0.88087
[77]	train-auc:0.93826	valid-auc:0.88090
[78]	train-auc:0.93899	valid-auc:0.88093
[79]	train-auc:0.93970	valid-auc:0.88098
[80]	train-auc:0.94049	valid-auc:0.88102
[81]	train-auc:0.94107	valid-auc:0.88123
[82]	train-auc:0.94171	valid-auc:0.88147
[83]	train-auc:0.94210	valid-auc:0.88144
[84]	train-auc:0.94245	valid-auc:0.88132
[85]	train-auc:0.94290	valid-auc:0.88159
[86]	train-auc:0.94342	valid-auc:0.88208
[87]	train-auc:0.94392	valid-auc:0.88250
[88]	train-auc:0.94443	valid-auc:0.88259
[89]	train-auc:0.94494	valid-auc:0.88303
[90]	train-auc:0.94540	valid-auc:0.88296
[91]	train-auc:0.94577	valid-auc:0.88276
[92]	train-auc:0.94638	valid-auc:0.88279
[93]	train-auc:0.94673	valid-auc:0.88284
[94]	train-auc:0.94701	valid-auc:0.88292
[95]	train-auc:0.94753	valid-auc:0.88315
[96]	train-auc:0.94795	valid-auc:0.88336
[97]	train-auc:0

[9]	train-auc:0.86525	valid-auc:0.83874
[10]	train-auc:0.86661	valid-auc:0.84013
[11]	train-auc:0.86838	valid-auc:0.84093
[12]	train-auc:0.86969	valid-auc:0.84281
[13]	train-auc:0.87121	valid-auc:0.84389
[14]	train-auc:0.87209	valid-auc:0.84345
[15]	train-auc:0.87354	valid-auc:0.84408
[16]	train-auc:0.87505	valid-auc:0.84507
[17]	train-auc:0.87692	valid-auc:0.84600
[18]	train-auc:0.87932	valid-auc:0.84746
[19]	train-auc:0.88145	valid-auc:0.84911
[20]	train-auc:0.88242	valid-auc:0.85005
[21]	train-auc:0.88385	valid-auc:0.85054
[22]	train-auc:0.88520	valid-auc:0.85092
[23]	train-auc:0.88665	valid-auc:0.85225
[24]	train-auc:0.88758	valid-auc:0.85298
[25]	train-auc:0.88888	valid-auc:0.85402
[26]	train-auc:0.89007	valid-auc:0.85414
[27]	train-auc:0.89166	valid-auc:0.85448
[28]	train-auc:0.89286	valid-auc:0.85468
[29]	train-auc:0.89431	valid-auc:0.85542
[30]	train-auc:0.89568	valid-auc:0.85607
[31]	train-auc:0.89652	valid-auc:0.85703
[32]	train-auc:0.89756	valid-auc:0.85739
[33]	train-auc:0.

[8]	train-auc:0.86070	valid-auc:0.79721
[9]	train-auc:0.86059	valid-auc:0.79745
[10]	train-auc:0.86176	valid-auc:0.79769
[11]	train-auc:0.86817	valid-auc:0.79932
[12]	train-auc:0.86894	valid-auc:0.79829
[13]	train-auc:0.87374	valid-auc:0.80427
[14]	train-auc:0.87626	valid-auc:0.80722
[15]	train-auc:0.87652	valid-auc:0.80829
[16]	train-auc:0.88046	valid-auc:0.81161
[17]	train-auc:0.88180	valid-auc:0.81309
[18]	train-auc:0.88419	valid-auc:0.81728
[19]	train-auc:0.88458	valid-auc:0.81865
[20]	train-auc:0.88523	valid-auc:0.82039
[21]	train-auc:0.89000	valid-auc:0.82568
[22]	train-auc:0.89182	valid-auc:0.82853
[23]	train-auc:0.89565	valid-auc:0.82918
[24]	train-auc:0.89905	valid-auc:0.83205
[25]	train-auc:0.89995	valid-auc:0.83546
[26]	train-auc:0.90253	valid-auc:0.83430
[27]	train-auc:0.90592	valid-auc:0.83389
[28]	train-auc:0.90788	valid-auc:0.83542
[29]	train-auc:0.91049	valid-auc:0.83652
[30]	train-auc:0.91192	valid-auc:0.83557
[31]	train-auc:0.91454	valid-auc:0.83557
[32]	train-auc:0.9

In [15]:
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('sample_submission_FastText_xgb.csv', index=False)