# Natural Language Processing with RNNs - Toxic Comment Classification

Outline:

1. Download and explore the data
2. Prepare the data for training
3. Build a recurrent neural network
4. Train & evaluate the model
5. Make predictions & submit to Kaggle

In [1]:
import os
import joblib
from pathlib import Path
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

from tqdm.auto import tqdm
import copy
from colorama import Fore, Style
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline

import xgboost as xgb
from scipy.stats import mode

# import gdown; gdown.download(id="15Gm26M6_tmM7A9HNEqgLXeOS_KUKyXJv")
from helper_functions import *

device = "cuda" if torch.cuda.is_available() else 'cpu'
print("device:", device)
print("Cpu cores found:", os.cpu_count())
print(f"CUDA cores found: {get_cuda_cores()}")

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from colorama import Fore, Style

from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report, roc_auc_score
import optuna

device: cuda
Cpu cores found: 64
CUDA cores found: 4096


## Download & Explore Data

Outline:
1. Download the data from Kaggle
2. Load data into Pandas dataframes
3. Explore the dataset

In [2]:
# def download_Kaggle_data(cmd: str, update:bool=False):
#     data_dir = Path("/home/23m1521/datasets")
#     print(data_dir)
    
#     cmd = f"{cmd} -p {data_dir}"
#     print(cmd)
    
#     name = cmd.split(" ")[4:][0]
#     print(name)
    
#     print(data_dir/name)
#     print(os.path.exists(data_dir/name))
#     if not os.path.exists(data_dir/name):
#         print("done!")
#     elif update:
#         subprocess.run(cmd, shell=True)
#         print("updated!")
    
    
    
# download_Kaggle_data("kaggle competitions download -c jigsaw-toxic-comment-classification-challenge")

In [3]:
train_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/train.csv.zip")
test_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/test.csv.zip")
test_labels_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/test_labels.csv.zip")
sample_sub_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/sample_submission.csv.zip")

In [4]:
# %load_ext cudf.pandas
import pandas as pd
train_df_full = pd.read_csv(train_fname)
train_df_full

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [5]:
train_df_full

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [6]:
comments_classes = list(train_df_full.columns[2:])
comments_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
test_df = pd.read_csv(test_fname)
test_df

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [8]:
test_labels = pd.read_csv(test_labels_fname)
test_labels

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,-1,-1,-1,-1,-1,-1
153160,fffd7a9a6eb32c16,-1,-1,-1,-1,-1,-1
153161,fffda9e8d6fafa9e,-1,-1,-1,-1,-1,-1
153162,fffe8f1340a79fc2,-1,-1,-1,-1,-1,-1


In [9]:
sample_sub_df = pd.read_csv(sample_sub_fname)
sample_sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.5,0.5,0.5,0.5,0.5,0.5
153160,fffd7a9a6eb32c16,0.5,0.5,0.5,0.5,0.5,0.5
153161,fffda9e8d6fafa9e,0.5,0.5,0.5,0.5,0.5,0.5
153162,fffe8f1340a79fc2,0.5,0.5,0.5,0.5,0.5,0.5


### Create Training & Validation Sets

- Define a custom Pytorch Dataset
- Pass raw data into the dataset
- Split the PyTorch Dataset

In [10]:
train_df_full

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


### Conversion to TF-IDF Vectors

In [11]:
def NLP1(text):
    text_tok = word_tokenize(text)
    
    eng_stopwords = stopwords.words('english')
    text_stp = [word for word in text_tok if (word.lower() not in eng_stopwords) and word.isalpha()]
    
    stemmer = SnowballStemmer(language='english')
    text_stm = [stemmer.stem(word) for word in text_stp]
    return text_stm

In [12]:
max_features = 5000 # or None

In [13]:
# %%time
# eng_stopwords = stopwords.words('english')

# vectorizer = TfidfVectorizer(lowercase=True, 
#                              tokenizer=NLP1,
#                              stop_words=eng_stopwords,
#                              ngram_range=(1,2),
#                              max_features=max_features).fit(train_df_full.comment_text)

### Split training and validation set

In [14]:
total_val_fold_auc = []
test_predictions = []

In [15]:
comments_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [16]:
train_df, val_df = train_test_split(train_df_full, test_size=0.3, random_state=43)
train_df.shape, val_df.shape

((111699, 8), (47872, 8))

In [17]:
# x_train = vectorizer.transform(train_df.comment_text)
# x_val = vectorizer.transform(val_df.comment_text)
# x_test = vectorizer.transform(test_df.comment_text)
# y_train = train_df[comments_classes].to_numpy()
# y_val = val_df[comments_classes].to_numpy()

# x_train.shape, y_train.shape, x_val.shape

In [18]:
# joblib.dump(x_train, f'x_train_{max_features}.joblib')
# joblib.dump(x_val, f'x_val_{max_features}.joblib')
# joblib.dump(x_test, f'x_test_{max_features}.joblib')
# joblib.dump(y_train, f'y_train_{max_features}.joblib')
# joblib.dump(y_val, f'y_val_{max_features}.joblib')

In [19]:
x_train = joblib.load(f'x_train_{max_features}.joblib')
x_val = joblib.load(f'x_val_{max_features}.joblib')
x_test = joblib.load(f'x_test_{max_features}.joblib')
y_train = joblib.load(f'y_train_{max_features}.joblib')
y_val = joblib.load(f'y_val_{max_features}.joblib')

x_train.shape, y_train.shape, x_val.shape

((111699, 5000), (111699, 6), (47872, 5000))

In [20]:
np.random.seed(42)
n_samples = 50_000
sample_idx = np.random.randint(0,x_train.shape[0], n_samples)
x_train_subset = x_train[sample_idx,:]
y_train_subset = y_train[sample_idx,:]
x_train_subset.shape, y_train_subset.shape

((50000, 5000), (50000, 6))

In [21]:
# dtrain = xgb.DMatrix(x_train_subset, label=y_train_subset)
# dval = xgb.DMatrix(x_val, label=y_val)

In [22]:
y_train.shape[1]

6

In [23]:
type(x_train), type(y_train), type(x_val), type(y_val)

(scipy.sparse._csr.csr_matrix,
 numpy.ndarray,
 scipy.sparse._csr.csr_matrix,
 numpy.ndarray)

In [24]:
list(range(100, 10_000, 1000))

[100, 1100, 2100, 3100, 4100, 5100, 6100, 7100, 8100, 9100]

In [28]:
def objective(trial):
    param = {
        "verbosity": 0,
        'objective': 'binary:logistic',
        # "num_class": y_train.shape[1],
        "device": "cuda",
        "tree_method": "hist",
        "eval_metric": "auc",
        'n_estimators': 10_000, # Number of boosting rounds.
        
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]), # defines booster, gblinear for linear functions.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),# L2 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),# L1 regularization weight.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),# sampling ratio for training data.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),# sampling according to each tree.
    }

    if param["booster"] in ["gbtree", "dart"]:
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)# maximum depth of the tree, signifies complexity of the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)# minimum child weight, larger the term more conservative the tree.
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)# defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    
    # Add a callback for pruning.
    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    # base_est = xgb.train(param, dtrain, evals=[(dval, "validation")], callbacks=[pruning_callback])
    base_est = xgb.XGBClassifier(**param,n_jobs=-1)
                                #  callbacks=[pruning_callback])
    # chain = ClassifierChain(base_estimator=base_est, cv=StratifiedKFold(n_splits=5, shuffle=True), random_state=42, verbose=False).fit(x_train_subset, y_train_subset)
    MOclf = MultiOutputClassifier(base_est, n_jobs=-1).fit(x_train_subset, y_train_subset)
    # base_est.fit(x_train_subset, y_train_subset, eval_set=dval)
    # preds = chain.predict(x_val)
    preds = MOclf.predict(x_val)
    # preds = base_est.predict(dval)
    # pred_labels = np.rint(preds)
    return roc_auc_score(y_val, preds)

oc_auc_ovo', 'r2', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'f1_macro', 'd2_absolute_error_score', 'adjusted_mutual_info_score', 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'neg_mean_squared_error', 'precision_micro', 'neg_mean_poisson_deviance', 'recall_macro', 'fowlkes_mallows_score', 'normalized_mutual_info_score', 'neg_mean_absolute_percentage_error', 'max_error', 'precision_samples', 'f1_weighted', 'rand_score', 'recall_samples', 'jaccard_samples', 'explained_variance', 'top_k_accuracy', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'v_measure_score', 'matthews_corrcoef', 'neg_mean_squared_log_error', 'precision', 'neg_mean_absolute_error', 'recall_weighted', 'neg_log_loss', 'homogeneity_score', 'accuracy', 'jaccard_weighted', 'neg_root_mean_squared_log_error', 'average_precision', 'jaccard', 'f1_samples', 'precision_weighted', 'neg_mean_gamma_deviance', 'roc_auc', 'jaccard_micro', 'balanced_accuracy', 'neg_brier_score', 'roc_auc_ovr', 'f1_micro', 'completeness_score', 'recall_micro'}, a callable or None. Got 'roc_auc_score' instead.

In [29]:
STUDY_NAME = f"XGB_MOClf_subset{n_samples}_OPT_{max_features}"
STUDY_NAME

'XGB_MOClf_subset50000_OPT_5000'

In [30]:
%%time
# !rm -rf db_XGB_Clf_subset50000_OPT_5000.sqlite3
!rm -rf db_XGB_MOClf_subset50000_OPT_5000.sqlite3
# !rm -rf db_XGB_CCClf_subset50000_OPT_5000.sqlite3


study = optuna.create_study(storage=f"sqlite:///db_{STUDY_NAME}.sqlite3",
                            sampler=optuna.samplers.RandomSampler(seed=42),
                            # pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
                            study_name=STUDY_NAME,
                            load_if_exists=True,
                            direction='maximize')
study.optimize(objective, 
               n_trials=100,
               timeout=None,
               n_jobs = 1,
               gc_after_trial=True,
               show_progress_bar = True)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {Fore.YELLOW}{trial.value}{Style.RESET_ALL}")
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-07-07 22:56:46,325] Trial 1 finished with value: 0.5 and parameters: {'booster': 'gbtree', 'lambda': 1.4610865886287176e-08, 'alpha': 0.574485163632042, 'subsample': 0.8659541126403374, 'colsample_bytree': 0.36987128854262097, 'max_depth': 3, 'min_child_weight': 3, 'eta': 2.716051144654844e-06, 'gamma': 0.00015777981883364995, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.5285365904950203.
[I 2024-07-07 23:04:53,925] Trial 2 finished with value: 0.5 and parameters: {'booster': 'gbtree', 'lambda': 8.528933855762793e-06, 'alpha': 4.452048365748842e-05, 'subsample': 0.8281407691144109, 'colsample_bytree': 0.3597390257266878, 'max_depth': 7, 'min_child_weight': 7, 'eta': 2.3528990899815284e-08, 'gamma': 0.0007250347382396634, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.5285365904950203.
[I 2024-07-07 23:05:50,410] Trial 3 finished with value: 0.7108843183309643 and parameters: {'booster': 'gblinear', 'lambda': 2.734702913886802e-06, 'alpha': 6.0447300703

KeyboardInterrupt: 

In [None]:
#    33.6s optimize =  1, MO = -1
# 1min 13s optimize = -1, MO =  1
# 2min 59s optimize = -1, MO =  6

In [None]:
best_params = study.best_params
best_params

In [None]:
STUDY_NAME

In [None]:
loaded_study = optuna.load_study(study_name=STUDY_NAME, 
                                 storage=f'sqlite:///db_{STUDY_NAME}.sqlite3')
best_params = loaded_study.best_params
best_params

In [None]:
study.best_value

In [None]:
MODEL_NAME = STUDY_NAME
SUB_CSV_NAME = f"sub_{MODEL_NAME}.csv"
SUB_CSV_MSG = "Optuna"

cmd1 = f"head {SUB_CSV_NAME}"
cmd2 = f'kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f {SUB_CSV_NAME} -m "{SUB_CSV_MSG}"'
print(SUB_CSV_NAME)
print(cmd1)
print(cmd2)

In [None]:
%%time
param_dict = {
        "verbosity": 0,
        'objective': 'binary:logistic',
        "device": "cuda",
        "tree_method": "hist",
        "eval_metric": "auc"}
param_dict.update(best_params)
param_dict

In [None]:
base_est = xgb.XGBClassifier(**param_dict,n_jobs=-1)
chain = ClassifierChain(base_estimator=base_est, 
                        cv=StratifiedKFold(n_splits=5, shuffle=True), 
                        random_state=42, verbose=True).fit(x_train_subset, y_train_subset)
# MOclf = MultiOutputClassifier(base_est, n_jobs=-1).fit(x_train, y_train)

In [None]:
train_pred = chain.predict(x_train)
val_pred = chain.predict(x_val)

print("Train ROC:", roc_auc_score(y_train,train_pred))
print("VAl ROC:", roc_auc_score(y_val,val_pred))
print("Train Acc:", accuracy_score(y_train, train_pred))
print("Val Acc:", accuracy_score(y_val, val_pred))

print(classification_report(y_train, train_pred))
print(classification_report(y_val, val_pred))

In [None]:
# sample_sub_df.loc[:, comments_classes] = np.array(MOclf.predict_proba(x_test))[:,:,1].T

In [None]:
chain.predict_proba(x_test).shape

In [None]:
sample_sub_df.loc[:, comments_classes] = chain.predict_proba(x_test)

In [None]:
sample_sub_df

In [None]:
sample_sub_df.to_csv(SUB_CSV_NAME, index=False)
subprocess.run(cmd1, shell=True)

In [None]:
subprocess.run(cmd2, shell=True)