## NV-SVM Baseline + XGBoost

##### Load the data

In [1]:
import pandas as pd
import os

# Set the working directory for the project
os.chdir('C://Users/dane.arnesen/Documents/Projects/kaggle/toxic_comments_challenge/')

# Development sample
dev = pd.read_csv('data/raw/train.csv')

# Validation sample
val = pd.read_csv('data/raw/test.csv')

print(dev.shape)
print(val.shape)
print(dev.columns)

(95851, 8)
(226998, 2)
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


##### Get the target attribute

In [2]:
y_cols = [c for c in dev.columns if c not in ['id','comment_text']]
y_vals = dev[y_cols].values
print(y_vals.shape)
print()
print(dev[y_cols].sum())

(95851, 6)

toxic            9237
severe_toxic      965
obscene          5109
threat            305
insult           4765
identity_hate     814
dtype: int64


##### Truncate the length of the validation comments to 5k characters

In [3]:
val = val.fillna('unknown')
val['comment_text'] = val['comment_text'].apply(lambda x: x[:5000])

##### Concatenate the comments from the dev and val samples

In [4]:
# Number of rows in the dev sample
nrows = dev.shape[0]

# IDs in the val sample
vids = val['id'].values

# Combine the text from both the dev and val samples
df_txt = pd.concat([dev['comment_text'], val['comment_text']], axis=0)
print(df_txt.shape)

(322849,)


##### Cleanse the comments text

In [5]:
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter

# Function that turns a doc into clean tokens
def clean_doc(doc, stemmer, stop_words):
    # Split into individual tokens by white space
    tokens = doc.split()
    # Remove punctuation and set to lowercase
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table).lower() for w in tokens]
    # Remove words that are not entirely alphabetical
    tokens = [w for w in tokens if w.isalpha()]
    # Removing all known stop words
    tokens = [w for w in tokens if not w in stop_words]
    # Remove tokens that aren't at least two characters in length
    tokens = [w for w in tokens if len(w) > 1]
    # Stem the remaining tokens
    tokens = [stemmer.stem(w) for w in tokens]
    return(tokens)

# Get a distinct list of stop words
stop_words = set(stopwords.words('english'))

# Initialize a stemmer
stemmer = SnowballStemmer('english')

# Define vocab
vocab = Counter()
for text in df_txt:
    # Create a list of tokens
    tokens = clean_doc(text, stemmer, stop_words)
    # Add tokens to vocab
    vocab.update(tokens)
    
# Cleanse the comments
lines = list()
for text in df_txt:
    # Create a list of tokens
    tokens = clean_doc(text, stemmer, stop_words)
    # Filter the words in the document by our defined vocabulary
    tokens = [w for w in tokens if w in vocab]
    # Concatentate each word in the document by a single space and append to our lines container
    lines.append(' '.join(tokens))

##### Vectorize the text

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,4), min_df=5, strip_accents='unicode', sublinear_tf=True)

# Vectorize the text using Tfidf
data = vectorizer.fit_transform(lines)
print(data.shape)

(322849, 113796)


##### Split sample into train and test sets

In [31]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(data[:nrows], y_vals, test_size=0.2, random_state=1986)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(76680, 113796)
(76680, 6)
(19171, 113796)
(19171, 6)


##### Create the NB feature equations

In [26]:
def nb(X, y_i, y):
    p = X[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def model(X, y):
    r = np.log(nb(X,1,y) / nb(X,0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = X.multiply(r)
    return m.fit(x_nb, y), r

##### Train the NB-SVM models one at a time and chain the predictions

In [32]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import log_loss

x_val = data[nrows:]
x_dev = data[:nrows]

# Initialize an empty container for our final predictions
preds = np.zeros((x_val.shape[0], len(y_cols)))
preds_dev = np.zeros((x_dev.shape[0], len(y_cols)))

# Iterate over each comment type and train a model
for i, j in enumerate(y_cols):
    print('Fitting', j)
    # Fit the model
    m, r = model(x_train, y_train[:,i])
    # Predict on the train
    y_prob_train = m.predict_proba(x_train.multiply(r))[:,1]
    # Predict on the test
    y_prob_test = m.predict_proba(x_test.multiply(r))[:,1]
    # Predict on the validation
    y_prob_val = m.predict_proba(x_val.multiply(r))[:,1]
    preds[:,i] = y_prob_val
    # Predict on the dev
    y_prob_dev = m.predict_proba(x_dev.multiply(r))[:,1]
    preds_dev[:,i] = y_prob_dev
    # Chain the predictions
    x_train = hstack([csr_matrix(y_prob_train).T, x_train], 'csr')
    x_test = hstack([csr_matrix(y_prob_test).T, x_test], 'csr')
    x_val = hstack([csr_matrix(y_prob_val).T, x_val], 'csr')
    x_dev = hstack([csr_matrix(y_prob_dev).T, x_dev], 'csr')
    # Show the logloss on the test sample
    print('Logloss: %0.5f' % log_loss(y_test[:,i], y_prob_test))
    print()

Fitting toxic
Logloss: 0.11131

Fitting severe_toxic
Logloss: 0.02814

Fitting obscene
Logloss: 0.05854

Fitting threat
Logloss: 0.01284

Fitting insult
Logloss: 0.07531

Fitting identity_hate
Logloss: 0.02546



##### NB-SVM predictions to csv files

In [33]:
ids = pd.DataFrame({'id': vids})
sub1 = pd.concat([ids, pd.DataFrame(preds, columns=y_cols)], axis=1)
sub1.to_csv('data/submissions/nb_svm.csv', index=False)
print(sub1.shape)

(226998, 7)


In [34]:
pd.DataFrame(preds_dev, columns=y_cols).to_csv('data/raw/nb_svm_preds.csv')

##### Training an XGBoost with prediction chaining

In [37]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,4), min_df=5, max_df=0.9, strip_accents='unicode')

# Vectorize the text using Tfidf
data = vectorizer.fit_transform(lines)
print(data.shape)

(322849, 113788)


In [38]:
# Split the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(data[:nrows], y_vals, test_size=0.3, random_state=1986)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(67095, 113788)
(67095, 6)
(28756, 113788)
(28756, 6)


In [40]:
from xgboost.sklearn import XGBClassifier

x_val = data[nrows:]
x_dev = data[:nrows]

# Initialize an empty container for our final predictions
xg_preds = np.zeros((x_val.shape[0], len(y_cols)))
xg_preds_dev = np.zeros((x_dev.shape[0], len(y_cols)))

# Initialize the model
xg = XGBClassifier(learning_rate=0.1,
                   max_depth=4, 
                   subsample=0.5, 
                   colsample_bytree=0.8, 
                   n_estimators=6000, 
                   objective= 'binary:logistic',
                   eval_metric='logloss', 
                   n_jobs=-1
                   )

# Iterate over each comment type and train a model
for i, j in enumerate(y_cols):
    print('Fitting', j)
    # Fitting the model
    xg.fit(X=x_train, 
           y=y_train[:,i], 
           verbose=10, 
           early_stopping_rounds=50, 
           eval_set=[(x_train, y_train[:,i]), (x_test, y_test[:,i])]
          )
    # Best iteration
    num_trees = xg.get_booster().best_iteration
    # Predict on the train
    y_prob_train = xg.predict_proba(x_train, ntree_limit=num_trees)[:,1]
    # Predict on the test
    y_prob_test = xg.predict_proba(x_test, ntree_limit=num_trees)[:,1]
    # Predict on the validation
    y_prob_val = xg.predict_proba(x_val, ntree_limit=num_trees)[:,1]
    xg_preds[:,i] = y_prob_val
    # Predict on the dev
    y_prob_dev = xg.predict_proba(x_dev, ntree_limit=num_trees)[:,1]
    xg_preds_dev[:,i] = y_prob_dev
    # Chain the predictions
    x_train = hstack([csr_matrix(y_prob_train).T, x_train], 'csr')
    x_test = hstack([csr_matrix(y_prob_test).T, x_test], 'csr')
    x_val = hstack([csr_matrix(y_prob_val).T, x_val], 'csr')
    x_dev = hstack([csr_matrix(y_prob_dev).T, x_dev], 'csr')
    # Show the logloss on the test sample
    print('Logloss: %0.5f' % log_loss(y_test[:,i], y_prob_test))
    print()

Fitting toxic
[0]	validation_0-logloss:0.619376	validation_1-logloss:0.619889
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[10]	validation_0-logloss:0.300305	validation_1-logloss:0.303584
[20]	validation_0-logloss:0.220299	validation_1-logloss:0.225829
[30]	validation_0-logloss:0.191706	validation_1-logloss:0.199682
[40]	validation_0-logloss:0.176657	validation_1-logloss:0.186388
[50]	validation_0-logloss:0.166483	validation_1-logloss:0.177794
[60]	validation_0-logloss:0.158541	validation_1-logloss:0.171064
[70]	validation_0-logloss:0.151698	validation_1-logloss:0.165426
[80]	validation_0-logloss:0.146018	validation_1-logloss:0.161026
[90]	validation_0-logloss:0.14127	validation_1-logloss:0.156977
[100]	validation_0-logloss:0.136931	validation_1-logloss:0.153727
[110]	validation_0-logloss:0.133311	validation_1-logloss:0.150925
[120]	validation_0-logloss:0.129652	valid

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


Logloss: nan

Fitting severe_toxic
[0]	validation_0-logloss:0.60081	validation_1-logloss:0.600929
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[10]	validation_0-logloss:0.191536	validation_1-logloss:0.19247
[20]	validation_0-logloss:0.079437	validation_1-logloss:0.081252
[30]	validation_0-logloss:0.04147	validation_1-logloss:0.044182
[40]	validation_0-logloss:0.027463	validation_1-logloss:0.031079
[50]	validation_0-logloss:0.021864	validation_1-logloss:0.026519
[60]	validation_0-logloss:0.01929	validation_1-logloss:0.024816
[70]	validation_0-logloss:0.017886	validation_1-logloss:0.024305
[80]	validation_0-logloss:0.016971	validation_1-logloss:0.023953
[90]	validation_0-logloss:0.016155	validation_1-logloss:0.023936
[100]	validation_0-logloss:0.015521	validation_1-logloss:0.023909
[110]	validation_0-logloss:0.014953	validation_1-logloss:0.02402
[120]	validation_0-loglo

##### XGBoost predictions to file

In [41]:
ids = pd.DataFrame({'id': vids})
sub1 = pd.concat([ids, pd.DataFrame(xg_preds, columns=y_cols)], axis=1)
sub1.to_csv('data/submissions/xg_chained.csv', index=False)
print(sub1.shape)

(226998, 7)


In [42]:
pd.DataFrame(xg_preds_dev, columns=y_cols).to_csv('data/raw/xg_chain_preds.csv')