In [1]:
import copy
import os
import shlex
import subprocess
from pathlib import Path

import joblib
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, hstack, lil_matrix

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

from tqdm.auto import tqdm
from colorama import Fore, Style
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline

# NOTE(review): wildcard import kept as-is; names the notebook relies on
# (e.g. subprocess) are now imported explicitly above instead of implicitly.
from helper_functions import *
# Report the CPU count seen by Python; the random forest below is built with n_jobs=-1.
print("Cpu cores found:", os.cpu_count())

Cpu cores found: 64


In [2]:
# Root folder of the Jigsaw toxic-comment archives. Override it with the
# JIGSAW_DATA_DIR environment variable to run the notebook on another machine;
# the default is the original author's location, so existing runs are unchanged.
DATA_DIR = Path(os.environ.get("JIGSAW_DATA_DIR", "/home/23m1521/datasets/jigsaw-toxic-comment"))

# Zipped CSVs of the competition (pandas reads the .zip archives directly).
train_fname = DATA_DIR / "train.csv.zip"
test_fname = DATA_DIR / "test.csv.zip"
test_labels_fname = DATA_DIR / "test_labels.csv.zip"
sample_sub_fname = DATA_DIR / "sample_submission.csv.zip"

In [3]:
# train_df_full = pd.read_csv(train_fname)
# test_df = pd.read_csv(test_fname)
# test_labels = pd.read_csv(test_labels_fname)

In [4]:
# Load the sample submission; it provides the label column names and is later
# filled with the predicted probabilities and written out as the submission.
sample_sub_df = pd.read_csv(sample_sub_fname)

In [5]:
# Every column after the first (the id column) is a target label.
label_columns = sample_sub_df.columns[1:]
comments_classes = label_columns.tolist()
comments_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Conversion to TF-IDF Vectors

In [6]:
# Lazily-built resources shared by every NLP1 call (filled on first use).
_NLP1_RESOURCES = {}


def _nlp1_resources():
    """Return the (stop-word set, stemmer) pair used by NLP1, building it once."""
    if not _NLP1_RESOURCES:
        # A frozenset gives O(1) membership tests; the original list was scanned per token.
        _NLP1_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _NLP1_RESOURCES['stemmer'] = SnowballStemmer(language='english')
    return _NLP1_RESOURCES['stopwords'], _NLP1_RESOURCES['stemmer']


def NLP1(text):
    """Tokenize a comment, drop noise tokens and stem what is left.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Snowball-stemmed tokens, in order of appearance, keeping only tokens
        that are purely alphabetic and whose lowercase form is not an English
        stop word.
    """
    eng_stopwords, stemmer = _nlp1_resources()
    return [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word.isalpha() and word.lower() not in eng_stopwords
    ]

In [7]:
# Value passed as `max_features` to the (commented-out) TfidfVectorizer below.
# It is also embedded in the names of the cached .joblib files loaded later,
# so changing it requires regenerating those caches.
max_features = 5000 # or None

In [8]:
# %%time
# eng_stopwords = stopwords.words('english')

# vectorizer = TfidfVectorizer(lowercase=True, 
#                                tokenizer=NLP1,
#                                stop_words=eng_stopwords,
#                                ngram_range=(1,2),
#                                max_features=max_features).fit(train_df_full.comment_text)

In [9]:
# train_df, val_df = train_test_split(train_df_full, test_size=0.3, random_state=43)
# train_df.shape, val_df.shape

In [10]:
# x_train_full = vectorizer.transform(train_df.comment_text)
# x_val_full = vectorizer.transform(val_df.comment_text)
# x_test_full = vectorizer.transform(test_df.comment_text)

# y_train_full = train_df[comments_classes].to_numpy()
# y_val_full = val_df[comments_classes].to_numpy()

# x_train_full.shape, y_train_full.shape, x_val_full.shape, y_val_full.shape, x_test_full.shape

In [11]:
# joblib.dump(x_train_full, f'x_train_full_{max_features}.joblib')
# joblib.dump(x_val_full, f'x_val_full_{max_features}.joblib')
# joblib.dump(x_test_full, f'x_test_full_{max_features}.joblib')

# joblib.dump(y_train_full, f'y_train_full_{max_features}.joblib')
# joblib.dump(y_val_full, f'y_val_full_{max_features}.joblib')

In [12]:
# Restore the cached feature matrices (train / validation / test) ...
x_train_full, x_val_full, x_test_full = map(joblib.load, (
    f'x_train_full_{max_features}.joblib',
    f'x_val_full_{max_features}.joblib',
    f'x_test_full_{max_features}.joblib',
))

# ... and the matching label arrays (the test split has no cached labels).
y_train_full, y_val_full = map(joblib.load, (
    f'y_train_full_{max_features}.joblib',
    f'y_val_full_{max_features}.joblib',
))

# Sanity check: display every shape in one tuple.
tuple(arr.shape for arr in (x_train_full, y_train_full, x_val_full, y_val_full, x_test_full))

((111699, 5000), (111699, 6), (47872, 5000), (47872, 6), (153164, 5000))

In [13]:
# Identifiers for this experiment and the submission file it produces.
MODEL_NAME = 'MChain_RF'
SUB_CSV_NAME = f"sub_{MODEL_NAME}.csv"
SUB_CSV_MSG = "Manual Chain"

# Shell commands used further down: preview the CSV, then submit it to Kaggle.
cmd1 = f"head {SUB_CSV_NAME}"
cmd2 = f'kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f {SUB_CSV_NAME} -m "{SUB_CSV_MSG} {max_features}"'

# Echo the file name and both commands, one per line.
print(SUB_CSV_NAME, cmd1, cmd2, sep="\n")

sub_MChain_RF.csv
head sub_MChain_RF.csv
kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_MChain_RF.csv -m "Manual Chain 5000"


In [14]:
%%time

clf = RandomForestClassifier(n_estimators=1000, criterion='gini', max_depth=None, random_state=42,verbose=1, n_jobs=-1)
x_train = lil_matrix(np.concatenate((x_train_full.toarray(), np.zeros_like(y_train_full)), axis=1))
x_val = lil_matrix(np.concatenate((x_val_full.toarray(), np.zeros_like(y_val_full)), axis=1))
x_test = lil_matrix(np.concatenate((x_test_full.toarray(), np.zeros((x_test_full.shape[0],6))), axis=1))
test_probs = []

for i, class_ in enumerate(comments_classes):
    print('*'*20, i, class_, '*'*20)
    
    y_train, y_val = y_train_full[:,i], y_val_full[:,i]
    print("x_train:", x_train.shape)
    print("x_val:", x_val.shape)
    print("x_test:", x_test.shape)
    print("y_train:", y_train.shape)
    print("y_val:", y_val.shape)
    
    print(f'Fitting {MODEL_NAME} model on {class_} column...')
    clf.fit(x_train, y_train)
    
    train_pred = clf.predict(x_train)
    val_pred = clf.predict(x_val)
    test_pred = clf.predict(x_test)
    test_prob = clf.predict_proba(x_test)
    print("Train ROC:", roc_auc_score(y_train,train_pred))
    print("Val ROC:", roc_auc_score(y_val,val_pred))
    print("Train Acc:", accuracy_score(y_train, train_pred))
    print("Val Acc:", accuracy_score(y_val, val_pred))
    
    x_train[:,5000+i] = train_pred
    x_val[:,5000+i] = val_pred
    x_test[:,5000+i] = test_pred
    print('test_prob:', test_prob.shape)
    test_probs.append(test_prob)
    
test_probs = np.array(test_probs).T
print("test_probs:", test_probs.shape)

******************** 0 toxic ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_RF model on toxic column...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   53.5s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.2s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.5s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.9s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | elapsed:    1.3s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.2s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.4s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | ela

Train ROC: 0.9929536710561752
Val ROC: 0.8247974392154022
Train Acc: 0.9984332894654384
Val Acc: 0.9558405748663101
test_prob: (153164, 2)
******************** 1 severe_toxic ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_RF model on severe_toxic column...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   23.1s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.2s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.4s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | elapsed:    0.6s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.0s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.2s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | ela

Train ROC: 0.9736764221888249
Val ROC: 0.5402599463943141
Train Acc: 0.9994180789443057
Val Acc: 0.9897852606951871
test_prob: (153164, 2)
******************** 2 obscene ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_RF model on obscene column...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   33.1s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.3s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.6s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | elapsed:    0.8s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.3s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | ela

Train ROC: 0.9932734027167286
Val ROC: 0.8694819336429046
Train Acc: 0.999059973679263
Val Acc: 0.9791945187165776
test_prob: (153164, 2)
******************** 3 threat ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_RF model on threat column...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   19.9s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.2s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.3s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.0s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | ela

Train ROC: 0.9848439952687993
Val ROC: 0.5201759780902351
Train Acc: 0.9999015210521133
Val Acc: 0.9968457553475936
test_prob: (153164, 2)
******************** 4 insult ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_RF model on insult column...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   37.6s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.3s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.6s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | elapsed:    0.8s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.3s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | ela

Train ROC: 0.9912659233147867
Val ROC: 0.8083237218093879
Train Acc: 0.9986212947295857
Val Acc: 0.9683531082887701
test_prob: (153164, 2)
******************** 5 identity_hate ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_RF model on identity_hate column...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   24.1s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.2s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.4s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | elapsed:    0.6s finished
[Parallel(n_jobs=64)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  72 tasks      | elapsed:    0.0s
[Parallel(n_jobs=64)]: Done 322 tasks      | elapsed:    0.1s
[Parallel(n_jobs=64)]: Done 672 tasks      | elapsed:    0.2s
[Parallel(n_jobs=64)]: Done 1000 out of 1000 | ela

Train ROC: 0.9836730659623435
Val ROC: 0.6268368171555905
Train Acc: 0.9996597999982094
Val Acc: 0.9931066176470589
test_prob: (153164, 2)
test_probs: (2, 153164, 6)
CPU times: user 3h 35min 45s, sys: 10.8 s, total: 3h 35min 56s
Wall time: 3min 46s


[Parallel(n_jobs=64)]: Done 1000 out of 1000 | elapsed:    1.0s finished


In [15]:
# Check the stacked probability array's shape before indexing it below.
test_probs.shape

(2, 153164, 6)

In [16]:
# Overwrite the label columns of the sample submission in place.
# Index 1 on the leading axis selects the second predict_proba column for every
# label -- presumably P(label == 1); this relies on clf.classes_ being [0, 1].
sample_sub_df.loc[:, comments_classes] = test_probs[1]

In [17]:
# Show the filled submission frame for a visual check.
sample_sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.899000,0.262967,0.825667,0.029,0.787500,0.197
1,0000247867823ef7,0.001000,0.000000,0.000000,0.000,0.004000,0.000
2,00013b17ad220c46,0.000000,0.000000,0.000000,0.000,0.000000,0.000
3,00017563c3f7919a,0.001000,0.000000,0.000000,0.000,0.000000,0.001
4,00017695ad8997eb,0.002000,0.000000,0.001000,0.000,0.000000,0.017
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.026000,0.000000,0.006000,0.000,0.000000,0.001
153160,fffd7a9a6eb32c16,0.139717,0.000000,0.003000,0.000,0.005000,0.044
153161,fffda9e8d6fafa9e,0.004000,0.000000,0.001000,0.000,0.000000,0.000
153162,fffe8f1340a79fc2,0.018000,0.000000,0.000000,0.000,0.001000,0.000


In [18]:
# Write the submission file (no index column), then preview its first lines.
sample_sub_df.to_csv(SUB_CSV_NAME, index=False)
# Run the preview command as an argument list (no shell involved) and raise
# CalledProcessError if it exits with a non-zero status.
subprocess.run(shlex.split(cmd1), check=True)

id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.899,0.2629666666666667,0.8256666666666668,0.029,0.7875,0.197
0000247867823ef7,0.001,0.0,0.0,0.0,0.004,0.0
00013b17ad220c46,0.0,0.0,0.0,0.0,0.0,0.0
00017563c3f7919a,0.001,0.0,0.0,0.0,0.0,0.001
00017695ad8997eb,0.002,0.0,0.001,0.0,0.0,0.017
0001ea8717f6de06,0.007,0.0,0.002,0.0,0.0,0.0
00024115d4cbde0f,0.001,0.0,0.0,0.0,0.0,0.0
000247e83dcc1211,0.178,0.0,0.0020886319845857416,0.0,0.0013371453071819017,1.879327747700691e-05
00025358d4737918,0.1753,0.001,0.001,0.001,0.015,0.0


CompletedProcess(args='head sub_MChain_RF.csv', returncode=0)

In [19]:
# Submit through the Kaggle CLI. The command string is split into an argument
# list (the quoted -m message stays a single argument), so no shell is involved,
# and check=True makes a failed submission raise instead of passing silently.
subprocess.run(shlex.split(cmd2), check=True)

100%|██████████| 9.04M/9.04M [00:03<00:00, 2.54MB/s]


Successfully submitted to Toxic Comment Classification Challenge

CompletedProcess(args='kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_MChain_RF.csv -m "Manual Chain 5000"', returncode=0)