#### 1. Load the data

In [1]:
import pandas as pd
import os

os.chdir('D://Analytics/Kaggle/toxic_comment_challenge/')
print(os.getcwd())

dev = pd.read_csv('data/raw/train.csv')
val = pd.read_csv('data/raw/test.csv')
print(dev.shape)
print(val.shape)
print(dev.head())

D:\Analytics\Kaggle\toxic_comment_challenge
(95851, 8)
(226998, 2)
         id                                       comment_text  toxic  \
0  22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1  27450690  "\n\n Please do not vandalize pages, as you di...      0   
2  54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3  77493077  Asking some his nationality is a Racial offenc...      0   
4  79357270  The reader here is not going by my say so for ...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [2]:
# Identifying the target columns
y_cols = [c for c in dev.columns if c not in ['id','comment_text']]
y_vals = dev[y_cols].as_matrix()

# Flagging the validation ids
vid = val['id'].values

# Concatenating the dev and val datasets
df_txt = pd.concat([dev['comment_text'], val['comment_text']], axis=0)
df_txt = df_txt.fillna("unknown")

# Number of rows in the dev sample
nrows = dev.shape[0]

#### 2. LSTM Feeder Model

In [3]:
import string
from nltk.corpus import stopwords

# Function that turns a doc into clean tokens
def clean_doc(doc, stop_words):
    # Split into individual tokens by white space
    tokens = doc.split()
    # Remove punctuation and set to lowercase
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table).lower() for w in tokens]
    # Remove words that are not entirely alphabetical
    #tokens = [w for w in tokens if w.isalpha()]
    # Removing all known stop words
    tokens = [w for w in tokens if not w in stop_words]
    # Remove tokens that aren't at least two characters in length
    tokens = [w for w in tokens if len(w) > 1]
    return(tokens)

In [4]:
from collections import Counter

# Define vocab
vocab = Counter()

# Get a distinct list of stop words
stop_words = set(stopwords.words('english'))

# Iterate over each of the texts in our training sample
for text in df_txt:
    # Create a list of tokens
    tokens = clean_doc(text, stop_words)
    # Add tokens to vocab
    vocab.update(tokens)

In [5]:
# A container object that will hold the words of each individual document
lines = list()

# Iterate over each of the texts in our training sample
for text in df_txt:
    # Create a list of tokens
    tokens = clean_doc(text, stop_words)
    # Filter the words in the document by our defined vocabulary
    tokens = [w for w in tokens if w in vocab]
    # Concatentate each word in the document by a single space and append to our lines container
    lines.append(' '.join(tokens))

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

num_words = 50000
max_length = 500

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
data = pad_sequences(sequences, maxlen=max_length)
print(data.shape)

Using TensorFlow backend.


(322849, 500)


In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data[:nrows], y_vals, test_size=.5, random_state=52)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(47925, 500)
(47925, 6)
(47926, 500)
(47926, 6)


In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create the model
model = Sequential()
model.add(Embedding(num_words, 128, input_length=max_length))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128, recurrent_dropout=0.15))
model.add(Dense(6, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=True)

# Checkpoint - used to get the best weights during the model training process
checkpoint = ModelCheckpoint(filepath='models/feeder_best_weights.h5', monitor='val_loss', save_best_only=True)

# Train the model
model.fit(x_train, 
          y_train, 
          validation_data=(x_test, y_test), 
          epochs=20, 
          batch_size=256,
          callbacks=[early_stopping, checkpoint],
          verbose=2
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 128)          6400000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 496, 64)           41024     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 124, 64)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 774       
Total params: 6,540,614
Trainable params: 6,540,614
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x2ad7d43ccc0>

In [10]:
from keras.models import load_model

model = load_model('models/feeder_best_weights.h5')

preds = model.predict(data[:nrows])
print(preds.shape)

(95851, 6)


In [11]:
preds_val = model.predict(data[nrows:])
print(preds_val.shape)

(226998, 6)


#### 3. Preping the data for XGBoost

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the text to Tfidf format
tfidf = TfidfVectorizer(stop_words='english', max_features=50000)
data = tfidf.fit_transform(df_txt)
print(data.shape)

(322849, 50000)


In [21]:
import numpy as np
from scipy import sparse

data_res = sparse.hstack((preds, data[:nrows]))
data_val_res = sparse.hstack((preds_val, data[nrows:]))
print(data_res.shape)
print(data_val_res.shape)

(95851, 50006)
(226998, 50006)


In [25]:
x_test, x_train, y_test, y_train = train_test_split(data_res, y_vals, test_size=.2, random_state=52)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(19171, 50006)
(19171, 6)
(76680, 50006)
(76680, 6)


In [26]:
from xgboost import XGBClassifier

preds = np.zeros((val.shape[0], len(y_cols)))

for i, c in enumerate(y_cols):
    print('Fitting %s' % c)
    
    # Initialize the model parameters
    xgb = XGBClassifier(learning_rate=0.05,
                        max_depth=4,
                        n_estimators=6000,
                        objective='binary:logistic',
                        eval_metric='logloss',
                        n_jobs=-1
                        )

    # Train the model
    xgb.fit(x_train,
            y_train[:,i],
            verbose=10,
            early_stopping_rounds=50,
            eval_set=[(x_train, y_train[:,i]), (x_test, y_test[:,i])]
            )
    
    # Best iteration
    num_trees = xgb.get_booster().best_iteration
    
    # Predictions
    preds[:,i] = xgb.predict_proba(data_val_res, ntree_limit=num_trees)[:,1]    

Fitting toxic
[0]	validation_0-logloss:0.650971	validation_1-logloss:0.649982
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[10]	validation_0-logloss:0.385574	validation_1-logloss:0.377688
[20]	validation_0-logloss:0.261263	validation_1-logloss:0.249082
[30]	validation_0-logloss:0.196292	validation_1-logloss:0.181296
[40]	validation_0-logloss:0.160668	validation_1-logloss:0.143843
[50]	validation_0-logloss:0.140505	validation_1-logloss:0.122577
[60]	validation_0-logloss:0.12866	validation_1-logloss:0.110514
[70]	validation_0-logloss:0.121588	validation_1-logloss:0.103544
[80]	validation_0-logloss:0.117366	validation_1-logloss:0.099645
[90]	validation_0-logloss:0.114565	validation_1-logloss:0.09748
[100]	validation_0-logloss:0.112472	validation_1-logloss:0.096286
[110]	validation_0-logloss:0.110782	validation_1-logloss:0.095537
[120]	validation_0-logloss:0.109468	valida

[180]	validation_0-logloss:0.007285	validation_1-logloss:0.01199
[190]	validation_0-logloss:0.007083	validation_1-logloss:0.012018
[200]	validation_0-logloss:0.006925	validation_1-logloss:0.012018
[210]	validation_0-logloss:0.006787	validation_1-logloss:0.012037
[220]	validation_0-logloss:0.006654	validation_1-logloss:0.012083
[230]	validation_0-logloss:0.006526	validation_1-logloss:0.012089
Stopping. Best iteration:
[180]	validation_0-logloss:0.007285	validation_1-logloss:0.01199

Fitting insult
[0]	validation_0-logloss:0.648734	validation_1-logloss:0.64855
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[10]	validation_0-logloss:0.368623	validation_1-logloss:0.367403
[20]	validation_0-logloss:0.235494	validation_1-logloss:0.233884
[30]	validation_0-logloss:0.164446	validation_1-logloss:0.162648
[40]	validation_0-logloss:0.124485	validation_1-logloss:0.12249
[50]	valida

#### 4. Submission

In [27]:
submid = pd.DataFrame({'id': vid})
submission = pd.concat([submid, pd.DataFrame(preds, columns=y_cols)], axis=1)
submission.to_csv('data/submissions/xgb_feeder.csv', index=False)

#### 5. Weighted Avg Submission

In [32]:
sub1 = pd.read_csv('data/submissions/wtd_avg_2.csv')
sub3 = (sub1.as_matrix()[:,1:] * .5) + (submission.as_matrix()[:,1:] * .5)
print(sub3.shape)

(226998, 6)


In [33]:
sub3 = pd.concat([submid, pd.DataFrame(sub3, columns=y_cols)], axis=1)
sub3.to_csv('data/submissions/wtd_avg_4.csv', index=False)