In [1]:
import copy
import os
import subprocess
from functools import lru_cache
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from colorama import Fore, Style
from scipy.sparse import csr_matrix, hstack, lil_matrix
from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
# Use matplotlib's dark theme for every figure drawn in this notebook.
plt.style.use('dark_background')
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# NOTE(review): the wildcard import hides which names this notebook takes from
# helper_functions; confirm what it provides before relying on any of them.
from helper_functions import *
# Report how many CPUs the operating system exposes to this kernel.
print("Cpu cores found:", os.cpu_count())

Cpu cores found: 64


In [2]:
# Directory that holds the Jigsaw toxic-comment archives. Set the JIGSAW_DATA_DIR
# environment variable to point the notebook at another location; when it is unset
# the original location is used, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get("JIGSAW_DATA_DIR", "/home/23m1521/datasets/jigsaw-toxic-comment"))

train_fname = DATA_DIR / "train.csv.zip"
test_fname = DATA_DIR / "test.csv.zip"
test_labels_fname = DATA_DIR / "test_labels.csv.zip"
sample_sub_fname = DATA_DIR / "sample_submission.csv.zip"

In [3]:
# train_df_full = pd.read_csv(train_fname)

In [4]:
# comments_classes = list(train_df_full.columns[2:])

In [5]:
# test_df = pd.read_csv(test_fname)

In [6]:
# test_labels = pd.read_csv(test_labels_fname)

In [7]:
# Load the sample submission CSV; later cells read its column names and overwrite
# its label columns with the model's test probabilities.
sample_sub_df = pd.read_csv(sample_sub_fname)

In [8]:
# The label names are every submission column after the first one.
comments_classes = sample_sub_df.columns[1:].tolist()
comments_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Conversion to TF-IDF Vectors

In [9]:
@lru_cache(maxsize=1)
def _nlp1_resources():
    """Build the English stop-word set and the Snowball stemmer once and reuse them."""
    return frozenset(stopwords.words('english')), SnowballStemmer(language='english')


def NLP1(text):
    """Tokenize ``text`` and return its stemmed, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Snowball-stemmed tokens, in their original order. A token is kept only if
        it is purely alphabetic and its lowercase form is not an English stop word.
    """
    # The stop-word corpus and the stemmer are built on the first call only. A
    # frozenset also makes each membership test O(1) instead of a list scan.
    eng_stopwords, stemmer = _nlp1_resources()
    return [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word.isalpha() and word.lower() not in eng_stopwords
    ]

In [10]:
# Vocabulary-size setting. It is embedded in the names of the cached .joblib
# matrices loaded below and in the Kaggle submission message.
max_features = 5000 # or None

In [11]:
# %%time
# eng_stopwords = stopwords.words('english')

# vectorizer = TfidfVectorizer(lowercase=True, 
#                                tokenizer=NLP1,
#                                stop_words=eng_stopwords,
#                                ngram_range=(1,2),
#                                max_features=max_features).fit(train_df_full.comment_text)

In [12]:
# train_df, val_df = train_test_split(train_df_full, test_size=0.3, random_state=43)
# train_df.shape, val_df.shape

In [13]:
# x_train_full = vectorizer.transform(train_df.comment_text)
# x_val_full = vectorizer.transform(val_df.comment_text)
# x_test_full = vectorizer.transform(test_df.comment_text)

# y_train_full = train_df[comments_classes].to_numpy()
# y_val_full = val_df[comments_classes].to_numpy()

# x_train_full.shape, y_train_full.shape, x_val_full.shape, y_val_full.shape, x_test_full.shape

In [14]:
# joblib.dump(x_train_full, f'x_train_full_{max_features}.joblib')
# joblib.dump(x_val_full, f'x_val_full_{max_features}.joblib')
# joblib.dump(x_test_full, f'x_test_full_{max_features}.joblib')

# joblib.dump(y_train_full, f'y_train_full_{max_features}.joblib')
# joblib.dump(y_val_full, f'y_val_full_{max_features}.joblib')

In [15]:
# Load the cached feature matrices and label arrays for the current max_features.
# NOTE: joblib.load unpickles its input, so only load files you produced yourself.
x_train_full = joblib.load(f'x_train_full_{max_features}.joblib')
x_val_full = joblib.load(f'x_val_full_{max_features}.joblib')
x_test_full = joblib.load(f'x_test_full_{max_features}.joblib')

y_train_full = joblib.load(f'y_train_full_{max_features}.joblib')
y_val_full = joblib.load(f'y_val_full_{max_features}.joblib')

# Fail fast if a stale or mismatched cache file was picked up; otherwise the
# training loop would silently pair features with the wrong labels.
assert x_train_full.shape[0] == y_train_full.shape[0], "train features/labels row count mismatch"
assert x_val_full.shape[0] == y_val_full.shape[0], "validation features/labels row count mismatch"
assert x_train_full.shape[1] == x_val_full.shape[1] == x_test_full.shape[1], "feature width differs between splits"
assert y_train_full.shape[1] == y_val_full.shape[1] == len(comments_classes), "label columns do not match comments_classes"

x_train_full.shape, y_train_full.shape, x_val_full.shape, y_val_full.shape, x_test_full.shape

((111699, 5000), (111699, 6), (47872, 5000), (47872, 6), (153164, 5000))

In [16]:
# Identifiers for this experiment's submission artefacts.
MODEL_NAME = 'MChain_L2LR'
SUB_CSV_NAME = f"sub_{MODEL_NAME}.csv"
SUB_CSV_MSG = "Manual Chain"

# Shell commands executed by later cells: preview the CSV, then submit it to Kaggle.
cmd1 = f"head {SUB_CSV_NAME}"
cmd2 = f'kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f {SUB_CSV_NAME} -m "{SUB_CSV_MSG} {max_features}"'

# Echo the file name and both commands, one per line.
print(SUB_CSV_NAME, cmd1, cmd2, sep="\n")

sub_MChain_L2LR.csv
head sub_MChain_L2LR.csv
kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_MChain_L2LR.csv -m "Manual Chain 5000"


In [17]:
%%time

clf = LogisticRegression(solver='lbfgs', penalty='l2', max_iter=1000, random_state=42, n_jobs=-1, tol=1e-5)
x_train = lil_matrix(np.concatenate((x_train_full.toarray(), np.zeros_like(y_train_full)), axis=1))
x_val = lil_matrix(np.concatenate((x_val_full.toarray(), np.zeros_like(y_val_full)), axis=1))
x_test = lil_matrix(np.concatenate((x_test_full.toarray(), np.zeros((x_test_full.shape[0],6))), axis=1))
test_probs = []

for i, class_ in enumerate(comments_classes):
    print('*'*20, i, class_, '*'*20)
    
    y_train, y_val = y_train_full[:,i], y_val_full[:,i]
    print("x_train:", x_train.shape)
    print("x_val:", x_val.shape)
    print("x_test:", x_test.shape)
    print("y_train:", y_train.shape)
    print("y_val:", y_val.shape)
    
    print(f'Fitting {MODEL_NAME} model on {class_} column...')
    clf.fit(x_train, y_train)
    
    train_pred = clf.predict(x_train)
    val_pred = clf.predict(x_val)
    test_pred = clf.predict(x_test)
    test_prob = clf.predict_proba(x_test)
    print("Train ROC:", roc_auc_score(y_train,train_pred))
    print("Val ROC:", roc_auc_score(y_val,val_pred))
    print("Train Acc:", accuracy_score(y_train, train_pred))
    print("Val Acc:", accuracy_score(y_val, val_pred))
    
    x_train[:,5000+i] = train_pred
    x_val[:,5000+i] = val_pred
    x_test[:,5000+i] = test_pred
    print('test_prob:', test_prob.shape)
    test_probs.append(test_prob)
    
test_probs = np.array(test_probs).T
print("test_probs:", test_probs.shape)

******************** 0 toxic ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_L2LR model on toxic column...
Train ROC: 0.8184440033650753
Val ROC: 0.8067855065493116
Train Acc: 0.9605099418974208
Val Acc: 0.9576161430481284
test_prob: (153164, 2)
******************** 1 severe_toxic ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_L2LR model on severe_toxic column...
Train ROC: 0.5950198187701174
Val ROC: 0.5764337461863432
Train Acc: 0.9907698367935255
Val Acc: 0.9897225935828877
test_prob: (153164, 2)
******************** 2 obscene ********************
x_train: (111699, 5006)
x_val: (47872, 5006)
x_test: (153164, 5006)
y_train: (111699,)
y_val: (47872,)
Fitting MChain_L2LR model on obscene column...
Train ROC: 0.8733696320033274
Val ROC: 0.8534946685325493
Train Acc: 0.9822469314855101
Val Acc: 0.97919451871

In [18]:
# Sanity check of the stacked layout: the transpose in the training cell
# yields (2, n_test_rows, n_labels).
test_probs.shape

(2, 153164, 6)

In [23]:
# The first axis follows predict_proba's column order: index 0 and index 1 are the
# two class columns (presumably P(label=0) and P(label=1) - confirm via clf.classes_).
test_probs[0], test_probs[1]

(array([[6.52235094e-04, 7.60889330e-01, 6.31458540e-03, 9.25630127e-01,
         5.16348652e-02, 4.92169502e-01],
        [9.90420040e-01, 9.99370873e-01, 9.95339076e-01, 9.99169203e-01,
         9.89349958e-01, 9.98110980e-01],
        [9.88310292e-01, 9.99621989e-01, 9.96020340e-01, 9.99633512e-01,
         9.94997966e-01, 9.99124884e-01],
        ...,
        [9.94689949e-01, 9.99457547e-01, 9.94678861e-01, 9.99337991e-01,
         9.95330716e-01, 9.98323725e-01],
        [9.71533845e-01, 9.99273166e-01, 9.85540447e-01, 9.99057748e-01,
         9.85296010e-01, 9.91818335e-01],
        [9.00463440e-02, 9.90471487e-01, 3.25265716e-01, 9.82035372e-01,
         4.75345891e-01, 9.56291268e-01]]),
 array([[9.99347765e-01, 2.39110670e-01, 9.93685415e-01, 7.43698726e-02,
         9.48365135e-01, 5.07830498e-01],
        [9.57995953e-03, 6.29126769e-04, 4.66092373e-03, 8.30796639e-04,
         1.06500421e-02, 1.88901972e-03],
        [1.16897081e-02, 3.78011017e-04, 3.97965965e-03, 3.664879

In [24]:
# Overwrite the submission's label columns, in place, with slice 1 of test_probs
# (one row per test sample, one column per label).
sample_sub_df.loc[:, comments_classes] = test_probs[1]

In [25]:
# Display the filled-in submission frame.
sample_sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999348,0.239111,0.993685,0.074370,0.948365,0.507830
1,0000247867823ef7,0.009580,0.000629,0.004661,0.000831,0.010650,0.001889
2,00013b17ad220c46,0.011690,0.000378,0.003980,0.000366,0.005002,0.000875
3,00017563c3f7919a,0.008143,0.000679,0.003350,0.000682,0.004379,0.000835
4,00017695ad8997eb,0.035249,0.000491,0.011298,0.000491,0.014144,0.001389
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.060850,0.000297,0.005684,0.000562,0.019215,0.002302
153160,fffd7a9a6eb32c16,0.157200,0.000660,0.011682,0.002037,0.023762,0.003145
153161,fffda9e8d6fafa9e,0.005310,0.000542,0.005321,0.000662,0.004669,0.001676
153162,fffe8f1340a79fc2,0.028466,0.000727,0.014460,0.000942,0.014704,0.008182


In [26]:
# Write the submission file without the DataFrame index.
sample_sub_df.to_csv(SUB_CSV_NAME, index=False)
# Preview the written file. check=True raises CalledProcessError if the command
# exits non-zero, instead of leaving the failure in an unread return code.
subprocess.run(cmd1, shell=True, check=True)

id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.9993477649055471,0.23911066978059017,0.9936854145983333,0.07436987263907294,0.9483651347531701,0.5078304983296601
0000247867823ef7,0.009579959534427087,0.0006291267693257473,0.004660923727601092,0.0008307966386802035,0.010650042098591137,0.0018890197173846706
00013b17ad220c46,0.011689708145305363,0.00037801101715612226,0.003979659645261072,0.00036648794071602914,0.0050020344758732755,0.0008751164614980881
00017563c3f7919a,0.008142734293110354,0.000678927375874587,0.003350131282855457,0.000682414180625841,0.004379246417651049,0.000834902797665698
00017695ad8997eb,0.03524865263551584,0.0004914661351086814,0.011297527974774343,0.0004911124190749183,0.014144131959514708,0.0013885117638588083
0001ea8717f6de06,0.00630672997374889,0.0002873472798044377,0.0023388584451227647,0.00046071744575829754,0.004605945667397367,0.000787349648005488
00024115d4cbde0f,0.008862859579603382,0.00022011128751434222,0.00572691112334475

CompletedProcess(args='head sub_MChain_L2LR.csv', returncode=0)

In [27]:
# Submit the CSV through the Kaggle CLI. check=True makes a failed submission
# raise CalledProcessError rather than pass silently with a non-zero return code.
subprocess.run(cmd2, shell=True, check=True)

100%|██████████| 21.0M/21.0M [00:09<00:00, 2.30MB/s]  


Successfully submitted to Toxic Comment Classification Challenge

CompletedProcess(args='kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_MChain_L2LR.csv -m "Manual Chain 5000"', returncode=0)