##### Read the data from csv

In [None]:
import pandas as pd
import os

# Set the working directory for the project. The relative data paths below are
# resolved against it. The location defaults to the original author's checkout
# and can be overridden with the TOXIC_COMMENTS_DIR environment variable so the
# notebook also runs on other machines.
project_dir = os.environ.get(
    'TOXIC_COMMENTS_DIR',
    'C://Users/dane.arnesen/Documents/Projects/kaggle/toxic_comments_challenge/'
)
os.chdir(project_dir)

# Development sample
dev = pd.read_csv('data/raw/train.csv')

# Validation sample
val = pd.read_csv('data/raw/test.csv')

# Report (rows, columns) of each sample as a quick sanity check
print(dev.shape)
print(val.shape)

##### Get the target attributes and columns

In [None]:
# Every column other than the row identifier and the raw text is a target
non_target_cols = ('id', 'comment_text')
y_cols = [col for col in dev.columns if col not in non_target_cols]

# Target values as a 2-D array, one column per target
y_vals = dev[y_cols].values

# Show the array shape, a blank line, then the per-target column sums
print(y_vals.shape, '', dev[y_cols].sum(), sep='\n')

##### Truncate the validation comments to 5k characters

In [None]:
# Replace missing values with the placeholder 'unknown', then keep only the
# first 5000 characters of each validation comment
val = (
    val
    .fillna('unknown')
    .assign(comment_text=lambda frame: frame['comment_text'].map(lambda comment: comment[:5000]))
)

##### Concatenate the dev and val comments into a single sample

In [None]:
# Row count of the dev sample
nrows = len(dev)

# Identifiers of the val sample rows
vids = val['id'].values

# Stack the comment text of both samples, dev rows first and val rows after
df_txt = pd.concat([frame['comment_text'] for frame in (dev, val)], axis=0)
print(df_txt.shape)

##### Cleanse the text and identify the tokens

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter

def clean_doc(doc, stemmer, stop_words):
    """Turn a raw document into a list of cleaned, stemmed tokens.

    The document is split on whitespace. Each piece has its ASCII punctuation
    removed and is lower-cased. A piece is then kept only if it is entirely
    alphabetic, is not in ``stop_words`` and is longer than one character.
    Surviving pieces are passed through ``stemmer.stem``.

    Parameters:
        doc: the text to tokenize (a string).
        stemmer: any object exposing a ``stem(word)`` method.
        stop_words: container of lower-case words to discard.

    Returns:
        A list of stemmed tokens in their original order.
    """
    # Translation table that deletes every character in string.punctuation
    strip_punct = str.maketrans('', '', string.punctuation)

    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punct).lower()
        # Drop anything containing digits or other non-letters (or left empty)
        if not word.isalpha():
            continue
        # Drop known stop words
        if word in stop_words:
            continue
        # Drop single-character leftovers
        if len(word) <= 1:
            continue
        cleaned.append(stemmer.stem(word))
    return cleaned

# Get a distinct list of stop words
stop_words = set(stopwords.words('english'))

# Initialize a stemmer
stemmer = SnowballStemmer('english')

# Tokenize every comment exactly once, building the vocabulary (token -> count)
# and the cleansed comments in the same pass. The vocabulary is never pruned,
# so every token produced here is in it by construction and the cleansed
# comments need no further filtering against it.
vocab = Counter()
lines = list()
for text in df_txt:
    # Create a list of tokens
    tokens = clean_doc(text, stemmer, stop_words)
    # Add tokens to vocab
    vocab.update(tokens)
    # Join the tokens with single spaces and append to the lines container
    lines.append(' '.join(tokens))

##### Vectorize the text using character n-grams (1 to 4 characters)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: character-level n-grams of length 1 to 4, with min_df=5 and
# unicode accent stripping
tfidf_options = dict(
    analyzer='char',
    ngram_range=(1, 4),
    min_df=5,
    strip_accents='unicode',
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the n-gram vocabulary from the cleansed comments and vectorize them
data = vectorizer.fit_transform(lines)
print(data.shape)

##### Split the dev sample into training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# Only the first nrows rows of the matrix are paired with the targets here.
# Hold out 20% of them for testing, with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data[:nrows],
    y_vals,
    test_size=0.2,
    random_state=1986,
)

# Report the shape of each piece: train features/targets, then test features/targets
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

##### Train model on toxic comments

First create the baseline estimator.

In [None]:
from xgboost.sklearn import XGBClassifier

# Initialize the model.
# early_stopping_rounds is set on the estimator, next to eval_metric, rather
# than passed to fit(): the fit() keyword was deprecated in xgboost 1.6 and
# removed in 2.0.
xg_toxic = XGBClassifier(learning_rate=0.1,
                         max_depth=20, 
                         subsample=0.5, 
                         colsample_bytree=0.5, 
                         n_estimators=6000, 
                         objective= 'binary:logistic',
                         eval_metric='logloss', 
                         early_stopping_rounds=50, 
                         n_jobs=-1
                        )

# Fit the model on the first target column (presumably 'toxic', per the section
# title - confirm against y_cols). Both splits are evaluated every 10 rounds;
# early stopping is driven by the last entry in eval_set, i.e. the test split.
xg_toxic.fit(X=x_train, 
             y=y_train[:,0], 
             verbose=10, 
             eval_set=[(x_train, y_train[:,0]), (x_test, y_test[:,0])]
            )

Trim the dataset down to only the important predictors.

In [None]:
# Gain-based importance for every feature the booster reports a score for
gain_by_feature = xg_toxic.get_booster().get_score(importance_type='gain')

# Two-column table (Feature, Importance), most important feature first
features = (
    pd.Series(gain_by_feature)
    .sort_values(ascending=False)
    .rename_axis('Feature')
    .reset_index(name='Importance')
)

# Drop the first character of each feature name and parse the rest as the
# integer column index (assumes names of the form 'f<index>')
features['ColInd'] = features['Feature'].str.slice(1).astype(int)
f_ind = features['ColInd'].values
print(f_ind.shape)

Perform hyperparameter tuning to see if we can get better results.

In [None]:
from sklearn.model_selection import GridSearchCV

# Setting the params to search
params = {'max_depth': [2, 3, 4, 5, 6]}

# Base estimator; max_depth here is overridden by each grid candidate.
# early_stopping_rounds is set on the estimator rather than passed through
# fit(): the fit() keyword was deprecated in xgboost 1.6 and removed in 2.0.
estimator = XGBClassifier(learning_rate=0.1,
                          max_depth=3, 
                          subsample=0.7, 
                          colsample_bytree=0.7, 
                          n_estimators=10000, 
                          objective= 'binary:logistic',
                          eval_metric='logloss', 
                          early_stopping_rounds=50, 
                          n_jobs=-1
                         )

# Initialize the grid search object (3-fold CV scored by negative log loss)
gs = GridSearchCV(estimator=estimator, param_grid=params, scoring='neg_log_loss', n_jobs=1, cv=3, verbose=3)

# Fitting the grid search object on the selected feature columns only.
# verbose and eval_set are forwarded to XGBClassifier.fit for every candidate;
# early stopping is driven by the last eval_set entry, i.e. the test split.
gs.fit(X=x_train[:,f_ind], 
       y=y_train[:,0], 
       verbose=10, 
       eval_set=[(x_train[:,f_ind], y_train[:,0]), (x_test[:,f_ind], y_test[:,0])]
      )

# Report the winning parameters and score from the fitted search object
print(gs.best_params_)
print(gs.best_score_)