In [1]:
import os
import subprocess
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, lil_matrix

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

from tqdm.auto import tqdm
from timeit import default_timer as timer
import copy
from colorama import Fore, Style
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline

from helper_functions import *
# Report how many logical CPU cores the kernel can see.
n_cpu_cores = os.cpu_count()
print("Cpu cores found:", n_cpu_cores)

Cpu cores found: 64


In [2]:
# All competition archives live in one directory; build each path from it.
_jigsaw_dir = Path("/home/23m1521/datasets/jigsaw-toxic-comment")

train_fname = _jigsaw_dir / "train.csv.zip"
test_fname = _jigsaw_dir / "test.csv.zip"
test_labels_fname = _jigsaw_dir / "test_labels.csv.zip"
sample_sub_fname = _jigsaw_dir / "sample_submission.csv.zip"

In [3]:
# train_df_full = pd.read_csv(train_fname)
# test_df = pd.read_csv(test_fname)
# test_labels = pd.read_csv(test_labels_fname)

In [4]:
# Load the sample submission. Later cells read the label names from its
# columns and overwrite those label columns with the model's predictions.
sample_sub_df = pd.read_csv(sample_sub_fname)

In [5]:
# Skip the first column of the sample submission; the remaining column names
# are the target labels, kept as a plain Python list.
comments_classes = sample_sub_df.columns[1:].tolist()
comments_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Conversion to TF-IDF Vectors

In [6]:
# Lazily-filled cache for NLP1 so the stopword set and the stemmer are built
# once per kernel instead of once per document.
_NLP1_RESOURCES = {}


def NLP1(text):
    """Tokenize ``text`` and return its stemmed, alphabetic, non-stopword tokens.

    Steps, in order:
      1. split ``text`` with NLTK's ``word_tokenize``;
      2. keep tokens that are purely alphabetic and whose lowercase form is
         not an English stopword;
      3. stem each remaining token with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    if not _NLP1_RESOURCES:
        # Built on the first call only. A frozenset gives constant-time
        # membership tests where the original scanned a list for every token.
        _NLP1_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _NLP1_RESOURCES['stemmer'] = SnowballStemmer(language='english')
    eng_stopwords = _NLP1_RESOURCES['stopwords']
    stemmer = _NLP1_RESOURCES['stemmer']

    text_tok = word_tokenize(text)
    text_stp = [word for word in text_tok
                if word.isalpha() and word.lower() not in eng_stopwords]
    return [stemmer.stem(word) for word in text_stp]

In [7]:
# Vocabulary cap used for the TF-IDF features and embedded in the cached
# .joblib file names below (the original note allows None as an alternative).
max_features = 5_000

In [8]:
# %%time
# eng_stopwords = stopwords.words('english')

# vectorizer = TfidfVectorizer(lowercase=True, 
#                                tokenizer=NLP1,
#                                stop_words=eng_stopwords,
#                                ngram_range=(1,2),
#                                max_features=max_features).fit(train_df_full.comment_text)

In [9]:
# train_df, val_df = train_test_split(train_df_full, test_size=0.3, random_state=43)
# train_df.shape, val_df.shape

In [10]:
# x_train_full = vectorizer.transform(train_df.comment_text)
# x_val_full = vectorizer.transform(val_df.comment_text)
# x_test_full = vectorizer.transform(test_df.comment_text)

# y_train_full = train_df[comments_classes].to_numpy()
# y_val_full = val_df[comments_classes].to_numpy()

# x_train_full.shape, y_train_full.shape, x_val_full.shape, y_val_full.shape, x_test_full.shape

In [11]:
# joblib.dump(x_train_full, f'x_train_full_{max_features}.joblib')
# joblib.dump(x_val_full, f'x_val_full_{max_features}.joblib')
# joblib.dump(x_test_full, f'x_test_full_{max_features}.joblib')

# joblib.dump(y_train_full, f'y_train_full_{max_features}.joblib')
# joblib.dump(y_val_full, f'y_val_full_{max_features}.joblib')

In [12]:
# Reload the feature matrices for the train / validation / test splits from
# the joblib files keyed by `max_features`.
_x_cache_paths = (
    f'x_train_full_{max_features}.joblib',
    f'x_val_full_{max_features}.joblib',
    f'x_test_full_{max_features}.joblib',
)
x_train_full, x_val_full, x_test_full = map(joblib.load, _x_cache_paths)


((111699, 5000), (111699, 6), (47872, 5000), (47872, 6), (153164, 5000))

In [None]:

# Reload the label matrices for the train / validation splits, then show the
# shape of every loaded array as a sanity check.
_y_cache_paths = (
    f'y_train_full_{max_features}.joblib',
    f'y_val_full_{max_features}.joblib',
)
y_train_full, y_val_full = map(joblib.load, _y_cache_paths)

tuple(
    split.shape
    for split in (x_train_full, y_train_full, x_val_full, y_val_full, x_test_full)
)

In [14]:
# Naming for this experiment's submission file and Kaggle message.
MODEL_NAME = 'MChain_MLP'
SUB_CSV_NAME = f"sub_{MODEL_NAME}.csv"
SUB_CSV_MSG = "Manual Chain"

# Shell commands run by later cells: preview the CSV, then submit it.
cmd1 = f"head {SUB_CSV_NAME}"
cmd2 = f'kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f {SUB_CSV_NAME} -m "{SUB_CSV_MSG} {max_features}"'

# One value per line, exactly as three separate print() calls would emit.
print(SUB_CSV_NAME, cmd1, cmd2, sep="\n")

sub_MChain_MLP.csv
head sub_MChain_MLP.csv
kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_MChain_MLP.csv -m "Manual Chain 5000"


In [15]:
%%time

clf = MLPClassifier(hidden_layer_sizes=(512,256,128), activation="relu", 
                    solver="adam", alpha = 0.001, batch_size=128, learning_rate = "constant",
                    max_iter=1000, early_stopping=True,
                    random_state=42, verbose=1)
x_train = lil_matrix(np.concatenate((x_train_full.toarray(), np.zeros_like(y_train_full)), axis=1))
x_val = lil_matrix(np.concatenate((x_val_full.toarray(), np.zeros_like(y_val_full)), axis=1))
x_test = lil_matrix(np.concatenate((x_test_full.toarray(), np.zeros((x_test_full.shape[0],6))), axis=1))
test_probs = []

for i, class_ in enumerate(comments_classes):
    print('*'*20, i, class_, '*'*20)
    
    y_train, y_val = y_train_full[:,i], y_val_full[:,i]
    print("x_train:", x_train.shape)
    print("x_val:", x_val.shape)
    print("x_test:", x_test.shape)
    print("y_train:", y_train.shape)
    print("y_val:", y_val.shape)
    
    t = timer()
    print(f'Fitting {MODEL_NAME} model on {class_} column...')
    clf.fit(x_train, y_train)
    print(format_time(t,timer()))
    
    train_pred = clf.predict(x_train)
    val_pred = clf.predict(x_val)
    test_pred = clf.predict(x_test)
    test_prob = clf.predict_proba(x_test)
    print("Train ROC:", roc_auc_score(y_train,train_pred))
    print("Val ROC:", roc_auc_score(y_val,val_pred))
    print("Train Acc:", accuracy_score(y_train, train_pred))
    print("Val Acc:", accuracy_score(y_val, val_pred))
    
    x_train[:,5000+i] = train_pred
    x_val[:,5000+i] = val_pred
    x_test[:,5000+i] = test_pred
    print('test_prob:', test_prob.shape)
    test_probs.append(test_prob)
    
test_probs = np.array(test_probs).T
print("test_probs:", test_probs.shape)

******************** 0 toxic ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_MLP model on toxic column...
Iteration 1, loss = 0.15818402
Validation score: 0.956312
Iteration 2, loss = 0.10820200
Validation score: 0.957207
Iteration 3, loss = 0.07875170
Validation score: 0.956222
Iteration 4, loss = 0.03986062
Validation score: 0.951567
Iteration 5, loss = 0.02606242
Validation score: 0.953894
Iteration 6, loss = 0.02157589
Validation score: 0.954432
Iteration 7, loss = 0.01986923
Validation score: 0.955416
Iteration 8, loss = 0.02174548
Validation score: 0.948702
Iteration 9, loss = 0.02081444
Validation score: 0.952462
Iteration 10, loss = 0.01952162
Validation score: 0.948523
Iteration 11, loss = 0.01782561
Validation score: 0.948791
Iteration 12, loss = 0.01731503
Validation score: 0.953357
Iteration 13, loss = 0.01777675
Validation score: 0.951388
Validation score did not improve more than to

In [16]:
# Sanity check on the stacked, transposed per-label predict_proba outputs:
# the leading axis has length 2 (one entry per predict_proba column), followed
# by test rows and then labels.
test_probs.shape

(2, 153164, 6)

In [17]:
# Overwrite the label columns of the sample submission in place.
# test_probs[1] is the (rows, labels) slice built from predict_proba's second
# column; presumably the positive-class probability, which depends on the
# order of clf.classes_ (TODO confirm).
sample_sub_df.loc[:, comments_classes] = test_probs[1]

In [18]:
# Display the submission frame to eyeball the values written into the label columns.
sample_sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999445,1.890233e-01,0.926037,6.843566e-02,0.925951,0.195273
1,0000247867823ef7,0.000104,5.830921e-06,0.000912,3.837582e-05,0.002805,0.000051
2,00013b17ad220c46,0.011046,2.265731e-06,0.004048,4.140574e-07,0.003013,0.000186
3,00017563c3f7919a,0.001394,2.418638e-06,0.001132,1.916480e-06,0.000241,0.000011
4,00017695ad8997eb,0.019343,1.686708e-06,0.019196,2.803807e-05,0.011620,0.000133
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.026581,1.321449e-07,0.010192,9.866458e-07,0.002290,0.000011
153160,fffd7a9a6eb32c16,0.148956,3.218708e-06,0.010016,1.522563e-03,0.007031,0.000845
153161,fffda9e8d6fafa9e,0.000074,4.919868e-06,0.002450,5.017074e-06,0.000169,0.000091
153162,fffe8f1340a79fc2,0.002958,2.472550e-06,0.008415,5.393663e-05,0.002281,0.002143


In [19]:
# Write the submission without the DataFrame index, then preview the first
# lines of the file with the `head` command prepared in `cmd1`.
sample_sub_df.to_csv(SUB_CSV_NAME, index=False)
# check=True raises CalledProcessError if the command exits non-zero, so a
# missing or unreadable file cannot go unnoticed.
subprocess.run(cmd1, shell=True, check=True)

id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.9994453322563468,0.18902329686979952,0.9260369917257686,0.06843565781470484,0.9259510193429243,0.1952726405140076
0000247867823ef7,0.00010376301991249772,5.830921002787536e-06,0.0009122053529318921,3.8375817338616084e-05,0.0028054953264986704,5.1018790807783026e-05
00013b17ad220c46,0.011045855700524682,2.2657309115743063e-06,0.0040477238674169035,4.140574429288222e-07,0.0030133194566428012,0.0001855103048161947
00017563c3f7919a,0.0013935085373870495,2.418637613059557e-06,0.0011317354367468948,1.9164796559192376e-06,0.00024061190759940834,1.1087338553423825e-05
00017695ad8997eb,0.01934322818588369,1.6867080823994408e-06,0.019195520387795605,2.8038070393344008e-05,0.011620431396047388,0.00013318998337354918
0001ea8717f6de06,0.0005090831743457273,3.4166044337475747e-07,0.0019799700070688613,1.1219290212100511e-05,0.002371109944795833,1.4832475165249281e-05
00024115d4cbde0f,0.0009483229672622675,3.6385004154584955

CompletedProcess(args='head sub_MChain_MLP.csv', returncode=0)

In [20]:
# WARNING: this cell uploads the CSV to Kaggle every time it runs, including
# under "Restart & Run All".
# check=True raises CalledProcessError when the kaggle CLI exits non-zero, so a
# failed submission is not mistaken for a successful one.
subprocess.run(cmd2, shell=True, check=True)

100%|██████████| 21.3M/21.3M [00:14<00:00, 1.57MB/s]  


Successfully submitted to Toxic Comment Classification Challenge

CompletedProcess(args='kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_MChain_MLP.csv -m "Manual Chain 5000"', returncode=0)