In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, f1_score,recall_score, roc_auc_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

import re

Using TensorFlow backend.


In [2]:
np.random.seed(100)

In [3]:
data = pd.read_csv('../../Dataset/jigsaw-toxic-comment-classification-challenge/train.csv')
X_test = pd.read_csv('../../Dataset/jigsaw-toxic-comment-classification-challenge/test.csv')
y_test = pd.read_csv('../../Dataset/jigsaw-toxic-comment-classification-challenge/test_labels.csv')

### Take a look at the data

In [4]:
data.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [6]:
combine = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)

In [7]:
print('There are %.2f%% data has labels.' %(sum(combine>0)/data.shape[0]*100))
print('%.2f%% of data has label as toxic.' %(sum(data['toxic']==1)/data.shape[0]*100))
print('%.2f%% of data has label as severe toxic.' %(sum(data['severe_toxic']==1)/data.shape[0]*100))
print('%.2f%% of data has label as obscene.' %(sum(data['obscene']==1)/data.shape[0]*100))
print('%.2f%% of data has label as threat.' %(sum(data['threat']==1)/data.shape[0]*100))
print('%.2f%% of data has label as insult.' %(sum(data['insult']==1)/data.shape[0]*100))
print('%.2f%% of data has label as identity hate.' %(sum(data['identity_hate']==1)/data.shape[0]*100))

There are 10.17% data has labels.
9.58% of data has label as toxic.
1.00% of data has label as severe toxic.
5.29% of data has label as obscene.
0.30% of data has label as threat.
4.94% of data has label as insult.
0.88% of data has label as identity hate.


### Imbalanced Classification Problem
#### Start with one label, treat it as a binary classification problem to test some ideas
1. try models with normal data
2. try models with downsample data
3. try models with upsample data

In [4]:
X_train = data['comment_text']
y_train = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

### Feature Engineer
- going to try tokenize, remove un-English words, then stem
- go with tfidf features

In [5]:
# Stem
snowballStemmer = SnowballStemmer('english')

In [6]:
# Build a function to combine 3 into 1
def token_stem(text):
    word = [w for sent in sent_tokenize(text) for w in word_tokenize(sent)]
    filtered = [filt for filt in word if re.search('[A-Za-z]', filt)]
    stemmed = [snowballStemmer.stem(word) for word in filtered]
    
    return stemmed

### Tfidf

In [7]:
# Deal with stopwords warnings
stop_w_list = stopwords.words('english')

In [8]:
preprocess_stop = [snowballStemmer.stem(word) for word in stop_w_list]
preprocess_stop = preprocess_stop + ["'d", 'could', 'might', 'must', "n't", 'need', 'sha', 'wo', 'would']

In [9]:
# Should not use lower case cuz toxic comment seems like having many upper case words
# Toxic words seems combine into 2 together, so use bi-gram
# Consider imbalanced labels, min_df should not be too small
# toxic comment is not that comman so max_df should not be too large
# toxic word should be so many. There are really just a few words can be used toxically!!!
# max_features should not be too large (also avoid curse of dimenality)
tfidf = TfidfVectorizer(max_df=0.7, max_features=5000, lowercase=False,
                                 min_df=10, stop_words=preprocess_stop,
                                 use_idf=True, tokenizer=token_stem,
                                 ngram_range=(1,2))

In [10]:
tfidf_features = tfidf.fit_transform(X_train)

  'stop_words.' % sorted(inconsistent))


### Baseline Model
- logistic regression, SVM, and Multinomial NB
- Try them with parameter tuning but no sampling method first

In [170]:
def tune_and_report(classifier, param_grid, X, y):
    gsc = GridSearchCV(estimator=classifier, param_grid=param_grid, 
                       scoring='f1', n_jobs=-1, cv=5, verbose=2)
    gsc.fit(X, y)
    
    return [f1_score(y, gsc.best_estimator_.predict(X)),
            recall_score(y, gsc.best_estimator_.predict(X)),
           gsc.best_estimator_]

In [183]:
lr_param = {'penalty': ['l1', 'l2'],
           'C': [0.01, 0.1, 1, 5],
           'class_weight': [None, 'balanced']}

In [185]:
lr_result = tune_and_report(LogisticRegression(random_state=100), lr_param, tfidf_features, y_train['toxic'])

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   13.7s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [159]:
nb_param = {'alpha': [1.0, 0.01]}

In [172]:
nb_result = tune_and_report(MultinomialNB(), nb_param, tfidf_features, y_train['toxic'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [188]:
svm_param = {'C': [0.01, 0.1, 1, 5],
            'penalty': ['l1', 'l2'],
            'class_weight': [None, 'balaned']}

In [189]:
svm_result = tune_and_report(LinearSVC(random_state=100), svm_param, tfidf_features, y_train['toxic'])

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   10.0s finished


In [190]:
result = pd.DataFrame([lr_result[:2], nb_result[:2], svm_result[:2]])
result.index = ['Logistic Regression', 'Multinomial NB', 'SVC']
result.columns = ['f1', 'recall']

In [191]:
result

Unnamed: 0,f1,recall
Logistic Regression,0.78452,0.697855
Multinomial NB,0.689611,0.56205
SVC,0.783696,0.69269


- Logistic Regression is winner.
- Class weight are both None selected by grid search for Linear Regression and SVC: means normal balanced doesn't improve performance.
- What about ensemble method? It may take longer to train.

### Try better model

In [205]:
rf_param = {"n_estimators": [10, 50, 100, 200],
             "max_depth": [2,4,8,15],
             "max_features": ['auto', 'sqrt']}

In [206]:
rf_result = tune_and_report(RandomForestClassifier(random_state=100), rf_param, tfidf_features, y_train['toxic'])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  3.1min finished


In [226]:
result = result.append({'f1': rf_result[0], 'recall': rf_result[1]}, ignore_index=True)
result.index = ['Logistic Regression', 'Multinomial NB', 'SVC', 'rf']
result

Unnamed: 0,f1,recall
Logistic Regression,0.78452,0.697855
Multinomial NB,0.689611,0.56205
SVC,0.783696,0.69269
rf,0.170869,0.093501


### Try SMOTE oversampling method to see if we can improve our performance
- Random Forest performs worse which makes sense because random forest is not good at nlp types of tasks (especially with subsample of features)
- Let us try better sample method to see if performance can be improved.

In [234]:
weights = np.linspace(0.1, 0.90, 10)

param_grid = {"smote__sampling_strategy": weights,
             "smote__k_neighbors": [3,5,8]}

In [236]:
potential_models = [lr_result[2], svm_result[2]]

In [261]:
holder = []
model = []
for m in potential_models:
    os_model = make_pipeline(SMOTE(), m)

    os_result = tune_and_report(os_model, param_grid, tfidf_features, y_train['toxic'])
    holder.append(os_result[:2])
    model.append(os_result[2])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.2min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.1min finished


In [262]:
li = [lr_result[:2], nb_result[:2], svm_result[:2]]
li.extend(holder)
result = pd.DataFrame(li)
result.index = ['Logistic Regression', 'Multinomial NB', 'SVC', 'Logistic Regression US', 'SVC US']
result.columns = ['f1', 'recall']

In [263]:
result

Unnamed: 0,f1,recall
Logistic Regression,0.78452,0.697855
Multinomial NB,0.689611,0.56205
SVC,0.783696,0.69269
Logistic Regression US,0.796291,0.757944
SVC US,0.798436,0.754479


#### Upsampling Result
- SMOTE upsamling method works
- SVC performance is better than Logistic Regression

### Extend it to multi-class classificaiton
- use provided labels to evaluate models
- need to eliminate -1 in provided labels to get actual labels
- Documentation says they use average AUC_ROC so we will include AUC_ROC here
- We need to param tunes for each labels

In [20]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate']

In [21]:
y_test.head(2)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1


In [22]:
true_y_test = y_test[y_test['toxic'] != -1]
true_X_test = X_test[X_test['id'].isin(true_y_test['id'])]
true_y_test_no_id = true_y_test.drop('id', axis=1)

In [23]:
test_tfidf_features = tfidf.transform(true_X_test['comment_text'])

### Logistic Regression Tune and Train Approach

In [316]:
def train_one_label(classifier, param_grid, X_train, y_train, X_test, y_test):
    gsc = GridSearchCV(estimator=classifier, param_grid=param_grid, 
                       scoring='roc_auc', n_jobs=2, cv=5, verbose=2)
    gsc.fit(X_train, y_train)
    
    return roc_auc_score(y_test, gsc.predict(X_test))

In [317]:
pipe = make_pipeline(SMOTE(random_state=100), LogisticRegression(random_state=100))

In [319]:
weights = [0.3, 0.7, 0.9]##np.linspace(0.1, 0.90, 10)

param_grid = {"smote__sampling_strategy": weights,
             "smote__k_neighbors": [3,5,8],
             "logisticregression__C": [0.01, 0.1, 1, 5]}

In [320]:
lr_auc_list = [train_one_label(pipe, param_grid, tfidf_features, y_train[label], test_tfidf_features, true_y_test[label]) for label in labels]

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  7.4min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  8.7min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   19.1s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  2.7min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  3.3min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   41.8s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  4.1min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  4.9min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   19.9s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  3.4min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   39.3s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  3.8min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  4.6min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   17.1s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  2.7min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  3.3min finished


In [333]:
print("Logistic Regression average AUC for all labels are %.2f" %(np.array(lr_auc_list).mean()))

Logistic Regression average AUC for all labels are 0.87


### Support Vector Machine Tune and Train Approach

In [336]:
pipe = make_pipeline(SMOTE(random_state=100), LinearSVC(random_state=100))

In [338]:
param_grid = {"smote__sampling_strategy": weights,
             "smote__k_neighbors": [3,5,8],
             "linearsvc__C": [0.01, 0.1, 1, 5]}

In [341]:
svc_auc_list = \
    [train_one_label(pipe, param_grid, tfidf_features, y_train[label], test_tfidf_features, true_y_test[label]) for label in labels]    
    

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  8.3min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed: 10.3min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   17.8s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  3.2min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   39.5s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  4.2min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  5.6min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   16.7s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  2.4min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   36.7s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  4.2min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  5.8min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   18.6s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  3.7min finished


In [342]:
print("Support Vector Classifier average AUC for all labels are %.2f" %(np.array(svc_auc_list).mean()))

Support Vector Classifier average AUC for all labels are 0.89


### SVC with SMOTE upsampling outperform Logistic Regression.
- now let us try more complex models
- use Feed Forward Neron Network and treat the problem as a multilablel problem

In [11]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

In [12]:
def create_model(learning_rate=0.01, activation='relu'):
    
    # Create an Adam optimizer with the given learning rate
    opt = Adam(lr=learning_rate)

    # Create your binary classification model  
    model = Sequential()
    model.add(Dense(8000, input_shape=(5000,), activation=activation, kernel_initializer='truncated_normal'))
    model.add(BatchNormalization())
    model.add(Dense(1000, activation=activation))
    model.add(BatchNormalization())
    model.add(Dense(6, activation='sigmoid'))

    # Compile your model with your optimizer, loss, and metrics
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[AUC()])
    return model

#### Parameter tunings took so long with limited sources I have. Pause for now.

In [13]:
# model = KerasClassifier(build_fn=create_model)

# params = {'activation': ['relu', 'tanh'], 'batch_size': [100, 500, 1000, 10000], 
#           'epochs': [20, 50, 100, 200], 'learning_rate': [0.1, 0.01, 0.001]}

# # Create a randomize search cv object passing in the parameters to try
# random_search = RandomizedSearchCV(model, param_distributions = params, cv = 3, 
#                                    n_iter=10, n_jobs=4, verbose=2, random_state=100)

# random_search.fit(tfidf_features, y_train)

In [15]:
model = create_model()

In [17]:
monitor_metric = EarlyStopping(monitor='val_auc', patience=5)

In [18]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8000)              40008000  
_________________________________________________________________
batch_normalization_1 (Batch (None, 8000)              32000     
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              8001000   
_________________________________________________________________
batch_normalization_2 (Batch (None, 1000)              4000      
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 6006      
Total params: 48,051,006
Trainable params: 48,033,006
Non-trainable params: 18,000
_________________________________________________________________


In [24]:
history = model.fit(tfidf_features, y_train, epochs=100, batch_size=500,validation_data=(test_tfidf_features, true_y_test_no_id),
         callbacks=[monitor_metric])

Train on 159571 samples, validate on 63978 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


- if using 2 layers MLN, we can achieve a test average AUC as 0.9764 which is significantly higher than both logistic regression and linear SVC
- Let us try to submite the result to Kaggle.

In [28]:
submission_features = tfidf.transform(X_test['comment_text'])

In [29]:
submit_result = pd.DataFrame(model.predict_proba(submission_features), columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [33]:
submit_result = pd.concat([X_test['id'],submit_result], axis=1)

In [34]:
submit_result.to_csv('../../Dataset/jigsaw-toxic-comment-classification-challenge/submission_result.csv', index=False)

### Kaggle Submission Result
- public socre: 0.95522, private score: 0.95612
### What we can do to improve?
- Try tune parameters for MLN when I have more computing resources
- Try RNN with LSTM models since RNN model suits language scenario better than MLN