### Kaggle Toxic Comment Classification Challenge - Baseline Classifier

The competition can be found at the following URL:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

#### 1. Preparing the data

In [1]:
import pandas as pd
import os

# Project root. Set the TOXIC_COMMENT_DIR environment variable to run the
# notebook on another machine; the original local path remains the default.
project_dir = os.environ.get('TOXIC_COMMENT_DIR',
                             'D://Analytics/Kaggle/toxic_comment_challenge/')
os.chdir(project_dir)
print(os.getcwd())

# `dev` holds the labelled training comments; `val` holds the comments that
# are scored for the Kaggle submission. All paths below are relative to the
# project root set above.
dev = pd.read_csv('data/raw/train.csv')
val = pd.read_csv('data/raw/test.csv')
print(dev.shape)
print(val.shape)

D:\Analytics\Kaggle\toxic_comment_challenge
(95851, 8)
(226998, 2)


In [2]:
# Preview the first five labelled comments
dev.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [3]:
# Identifying the target columns: everything except the row id and the raw text
y_cols = [c for c in dev.columns if c not in ['id','comment_text']]
# `.values` is used instead of DataFrame.as_matrix(), which was deprecated and
# later removed from pandas; it yields the same 2-D label array.
y_vals = dev[y_cols].values

# Flagging the validation ids (re-attached to the predictions at submission time)
vid = val['id'].values

# Concatenating the dev and val datasets so both share one text-feature space;
# dev rows come first, so the first `nrows` rows of the result are the dev rows.
df_txt = pd.concat([dev['comment_text'], val['comment_text']], axis=0)
# Missing comments are replaced by a placeholder token
df_txt = df_txt.fillna("unknown")

# Number of rows in the dev sample (used to split the combined matrix back apart)
nrows = dev.shape[0]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the combined dev + val text into a TF-IDF matrix:
# English stop words are removed and at most 50,000 features are kept.
tfidf_params = {'stop_words': 'english', 'max_features': 50000}
tfidf = TfidfVectorizer(**tfidf_params)
data = tfidf.fit_transform(df_txt)
print(data.shape)

(322849, 50000)


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled (dev) rows; the split is seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    data[:nrows],
    y_vals,
    test_size=.2,
    random_state=52,
)

#### 2. Baseline XGBoost - training one model per target class

In [14]:
from xgboost import XGBClassifier
import numpy as np

# Predicted probabilities: one row per val comment, one column per target label
preds = np.zeros((val.shape[0], len(y_cols)))

# Feature rows of the val comments (everything after the dev rows); sliced
# once here because it is the same for every label.
x_val = data[nrows:]

# Fit an independent binary classifier for each target label
for i, c in enumerate(y_cols):
    print('Fitting %s' % c)

    # Initialize the model parameters
    # n_estimators is only an upper bound: early stopping below ends training sooner.
    xgb = XGBClassifier(learning_rate=0.1,
                        max_depth=3,
                        n_estimators=6000,
                        objective='binary:logistic',
                        eval_metric='logloss',
                        n_jobs=-1
                        )

    # Train the model
    # The last eval_set entry (the held-out split) drives early stopping;
    # the training split is listed first only so its loss is logged too.
    xgb.fit(x_train,
            y_train[:,i],
            verbose=10,
            early_stopping_rounds=50,
            eval_set=[(x_train, y_train[:,i]), (x_test, y_test[:,i])]
            )

    # Best iteration
    # best_iteration is a 0-based round index, whereas ntree_limit is a tree
    # COUNT, so add 1 to include the tree built at the best round itself.
    num_trees = xgb.get_booster().best_iteration + 1

    # Predictions: probability of the positive class for this label
    preds[:,i] = xgb.predict_proba(x_val, ntree_limit=num_trees)[:,1]

Fitting toxic
[0]	validation_0-logloss:0.622879	validation_1-logloss:0.6236
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[10]	validation_0-logloss:0.322206	validation_1-logloss:0.327597
[20]	validation_0-logloss:0.251479	validation_1-logloss:0.25971
[30]	validation_0-logloss:0.22831	validation_1-logloss:0.238403
[40]	validation_0-logloss:0.216274	validation_1-logloss:0.227646
[50]	validation_0-logloss:0.207794	validation_1-logloss:0.22027
[60]	validation_0-logloss:0.201162	validation_1-logloss:0.214037
[70]	validation_0-logloss:0.195404	validation_1-logloss:0.208922
[80]	validation_0-logloss:0.190674	validation_1-logloss:0.20438
[90]	validation_0-logloss:0.186505	validation_1-logloss:0.200875
[100]	validation_0-logloss:0.182777	validation_1-logloss:0.197412
[110]	validation_0-logloss:0.179418	validation_1-logloss:0.194202
[120]	validation_0-logloss:0.176315	validation

[1220]	validation_0-logloss:0.102483	validation_1-logloss:0.137337
[1230]	validation_0-logloss:0.102238	validation_1-logloss:0.137229
[1240]	validation_0-logloss:0.101988	validation_1-logloss:0.13713
[1250]	validation_0-logloss:0.101747	validation_1-logloss:0.136989
[1260]	validation_0-logloss:0.101513	validation_1-logloss:0.136831
[1270]	validation_0-logloss:0.101276	validation_1-logloss:0.136773
[1280]	validation_0-logloss:0.101046	validation_1-logloss:0.136692
[1290]	validation_0-logloss:0.100805	validation_1-logloss:0.136608
[1300]	validation_0-logloss:0.100588	validation_1-logloss:0.136616
[1310]	validation_0-logloss:0.10036	validation_1-logloss:0.136552
[1320]	validation_0-logloss:0.100132	validation_1-logloss:0.136468
[1330]	validation_0-logloss:0.099913	validation_1-logloss:0.136362
[1340]	validation_0-logloss:0.099689	validation_1-logloss:0.136331
[1350]	validation_0-logloss:0.099464	validation_1-logloss:0.136236
[1360]	validation_0-logloss:0.099248	validation_1-logloss:0.1361

[2450]	validation_0-logloss:0.081385	validation_1-logloss:0.13117
[2460]	validation_0-logloss:0.081266	validation_1-logloss:0.131139
[2470]	validation_0-logloss:0.081141	validation_1-logloss:0.131126
[2480]	validation_0-logloss:0.08103	validation_1-logloss:0.131083
[2490]	validation_0-logloss:0.080912	validation_1-logloss:0.131068
[2500]	validation_0-logloss:0.080798	validation_1-logloss:0.131048
[2510]	validation_0-logloss:0.080667	validation_1-logloss:0.131058
[2520]	validation_0-logloss:0.080533	validation_1-logloss:0.131065
[2530]	validation_0-logloss:0.080406	validation_1-logloss:0.130999
[2540]	validation_0-logloss:0.080287	validation_1-logloss:0.131001
[2550]	validation_0-logloss:0.080177	validation_1-logloss:0.130988
[2560]	validation_0-logloss:0.08007	validation_1-logloss:0.130957
[2570]	validation_0-logloss:0.079954	validation_1-logloss:0.13098
[2580]	validation_0-logloss:0.079835	validation_1-logloss:0.13098
[2590]	validation_0-logloss:0.079704	validation_1-logloss:0.130995


[430]	validation_0-logloss:0.056598	validation_1-logloss:0.067532
[440]	validation_0-logloss:0.056244	validation_1-logloss:0.067322
[450]	validation_0-logloss:0.055878	validation_1-logloss:0.06714
[460]	validation_0-logloss:0.055572	validation_1-logloss:0.066934
[470]	validation_0-logloss:0.055274	validation_1-logloss:0.066738
[480]	validation_0-logloss:0.054947	validation_1-logloss:0.066627
[490]	validation_0-logloss:0.05465	validation_1-logloss:0.066501
[500]	validation_0-logloss:0.05433	validation_1-logloss:0.066302
[510]	validation_0-logloss:0.054063	validation_1-logloss:0.066113
[520]	validation_0-logloss:0.053778	validation_1-logloss:0.065968
[530]	validation_0-logloss:0.053487	validation_1-logloss:0.065835
[540]	validation_0-logloss:0.053211	validation_1-logloss:0.065694
[550]	validation_0-logloss:0.05296	validation_1-logloss:0.065555
[560]	validation_0-logloss:0.0527	validation_1-logloss:0.065443
[570]	validation_0-logloss:0.052471	validation_1-logloss:0.065317
[580]	validation

[350]	validation_0-logloss:0.007065	validation_1-logloss:0.013276
Stopping. Best iteration:
[309]	validation_0-logloss:0.007333	validation_1-logloss:0.013266

Fitting insult
[0]	validation_0-logloss:0.611245	validation_1-logloss:0.611574
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[10]	validation_0-logloss:0.254458	validation_1-logloss:0.257144
[20]	validation_0-logloss:0.164254	validation_1-logloss:0.168654
[30]	validation_0-logloss:0.136677	validation_1-logloss:0.142487
[40]	validation_0-logloss:0.125667	validation_1-logloss:0.132364
[50]	validation_0-logloss:0.119169	validation_1-logloss:0.126495
[60]	validation_0-logloss:0.114474	validation_1-logloss:0.122601
[70]	validation_0-logloss:0.110869	validation_1-logloss:0.119446
[80]	validation_0-logloss:0.107818	validation_1-logloss:0.116876
[90]	validation_0-logloss:0.105282	validation_1-logloss:0.114899
[100]	valida

[1200]	validation_0-logloss:0.058538	validation_1-logloss:0.086502
[1210]	validation_0-logloss:0.058395	validation_1-logloss:0.086454
[1220]	validation_0-logloss:0.058262	validation_1-logloss:0.086414
[1230]	validation_0-logloss:0.058128	validation_1-logloss:0.086433
[1240]	validation_0-logloss:0.058	validation_1-logloss:0.08637
[1250]	validation_0-logloss:0.057865	validation_1-logloss:0.086383
[1260]	validation_0-logloss:0.057739	validation_1-logloss:0.086342
[1270]	validation_0-logloss:0.057604	validation_1-logloss:0.086342
[1280]	validation_0-logloss:0.057477	validation_1-logloss:0.086294
[1290]	validation_0-logloss:0.057358	validation_1-logloss:0.086309
[1300]	validation_0-logloss:0.057228	validation_1-logloss:0.08627
[1310]	validation_0-logloss:0.057103	validation_1-logloss:0.086289
[1320]	validation_0-logloss:0.056963	validation_1-logloss:0.086257
[1330]	validation_0-logloss:0.056836	validation_1-logloss:0.086203
[1340]	validation_0-logloss:0.056713	validation_1-logloss:0.086178


[680]	validation_0-logloss:0.015684	validation_1-logloss:0.028225
[690]	validation_0-logloss:0.015613	validation_1-logloss:0.028247
[700]	validation_0-logloss:0.015543	validation_1-logloss:0.02822
Stopping. Best iteration:
[655]	validation_0-logloss:0.015857	validation_1-logloss:0.0282



#### 3. Kaggle submission

In [15]:
# Assemble the submission: the id column first, then one probability column per label
submid = pd.DataFrame({'id': vid})
pred_frame = pd.DataFrame(preds, columns=y_cols)
submission = submid.join(pred_frame)

# Write without the index so the file contains only id + label columns
submission.to_csv('data/submissions/baseline.csv', index=False)