# Natural Language Processing with RNNs - Toxic Comment Classification

Outline:

1. Download and explore the data
2. Prepare the data for training
3. Build a recurrent neural network
4. Train & evaluate the model
5. Make predictions & submit to Kaggle

In [1]:
import os
import joblib
from pathlib import Path
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

from tqdm.auto import tqdm
import copy
from colorama import Fore, Style
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline

import xgboost as xgb
from scipy.stats import mode

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torchinfo import summary

# import gdown; gdown.download(id="15Gm26M6_tmM7A9HNEqgLXeOS_KUKyXJv")
from helper_functions import *

device = "cuda" if torch.cuda.is_available() else 'cpu'
print("device:", device)
print("Cpu cores found:", os.cpu_count())
print(f"CUDA cores found: {get_cuda_cores()}")

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from colorama import Fore, Style

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report

import optuna

device: cuda
Cpu cores found: 64
CUDA cores found: 4096


## Download & Explore Data

Outline:
1. Download the data from Kaggle
2. Load data into Pandas dataframes
3. Explore the dataset

In [2]:
# def download_Kaggle_data(cmd: str, update:bool=False):
#     data_dir = Path("/home/23m1521/datasets")
#     print(data_dir)
    
#     cmd = f"{cmd} -p {data_dir}"
#     print(cmd)
    
#     name = cmd.split(" ")[4:][0]
#     print(name)
    
#     print(data_dir/name)
#     print(os.path.exists(data_dir/name))
#     if not os.path.exists(data_dir/name):
#         print("done!")
#     elif update:
#         subprocess.run(cmd, shell=True)
#         print("updated!")
    
    
    
# download_Kaggle_data("kaggle competitions download -c jigsaw-toxic-comment-classification-challenge")

In [3]:
train_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/train.csv.zip")
test_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/test.csv.zip")
test_labels_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/test_labels.csv.zip")
sample_sub_fname = Path("/home/23m1521/datasets/jigsaw-toxic-comment/sample_submission.csv.zip")

In [4]:
# %load_ext cudf.pandas
import pandas as pd
train_df_full = pd.read_csv(train_fname)
train_df_full

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [5]:
train_df_full

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [6]:
comments_classes = list(train_df_full.columns[2:])
comments_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
for i in comments_classes:
    print(100*train_df_full[i].value_counts(normalize=True)); print()

toxic
0    90.415552
1     9.584448
Name: proportion, dtype: float64

severe_toxic
0    99.000445
1     0.999555
Name: proportion, dtype: float64

obscene
0    94.705178
1     5.294822
Name: proportion, dtype: float64

threat
0    99.700447
1     0.299553
Name: proportion, dtype: float64

insult
0    95.063639
1     4.936361
Name: proportion, dtype: float64

identity_hate
0    99.119514
1     0.880486
Name: proportion, dtype: float64



In [8]:
test_df = pd.read_csv(test_fname)
test_df

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [9]:
test_labels = pd.read_csv(test_labels_fname)
test_labels

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,-1,-1,-1,-1,-1,-1
153160,fffd7a9a6eb32c16,-1,-1,-1,-1,-1,-1
153161,fffda9e8d6fafa9e,-1,-1,-1,-1,-1,-1
153162,fffe8f1340a79fc2,-1,-1,-1,-1,-1,-1


In [10]:
sample_sub_df = pd.read_csv(sample_sub_fname)
sample_sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.5,0.5,0.5,0.5,0.5,0.5
153160,fffd7a9a6eb32c16,0.5,0.5,0.5,0.5,0.5,0.5
153161,fffda9e8d6fafa9e,0.5,0.5,0.5,0.5,0.5,0.5
153162,fffe8f1340a79fc2,0.5,0.5,0.5,0.5,0.5,0.5


### Create Training & Validation Sets

- Define a custom Pytorch Dataset
- Pass raw data into the dataset
- Split the PyTorch Dataset

In [11]:
train_df_full

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


### Conversion to TF-IDF Vectors

In [12]:
def NLP1(text):
    text_tok = word_tokenize(text)
    
    eng_stopwords = stopwords.words('english')
    text_stp = [word for word in text_tok if (word.lower() not in eng_stopwords) and word.isalpha()]
    
    stemmer = SnowballStemmer(language='english')
    text_stm = [stemmer.stem(word) for word in text_stp]
    return text_stm

In [13]:
max_features = 5000 # or None

In [14]:
# %%time
# eng_stopwords = stopwords.words('english')

# vectorizer = TfidfVectorizer(lowercase=True, 
#                              tokenizer=NLP1,
#                              stop_words=eng_stopwords,
#                              ngram_range=(1,2),
#                              max_features=max_features).fit(train_df_full.comment_text)

### Split training and validation set

In [15]:
total_val_fold_auc = []
test_predictions = []

In [16]:
comments_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [17]:
train_df, val_df = train_test_split(train_df_full, test_size=0.3, random_state=43)
train_df.shape, val_df.shape

((111699, 8), (47872, 8))

In [18]:
for i in comments_classes:
    print("class:", i)
    
    value_counts = dict(train_df[i].value_counts())
    print("value_counts:", value_counts)
    
    class_wg = compute_class_weight(class_weight="balanced", classes=np.array(np.unique(train_df[i])), y=train_df[i])
    print("class_wg:", class_wg, type(class_wg))
    
    counts = np.array(list(value_counts.values()))
    print(counts)
    
    print((counts*class_wg).sum()/counts.sum())
    
    print()

class: toxic
value_counts: {0: 100994, 1: 10705}
class_wg: [0.5529982  5.21714152] <class 'numpy.ndarray'>
[100994  10705]
1.0

class: severe_toxic
value_counts: {0: 110596, 1: 1103}
class_wg: [ 0.50498662 50.63417951] <class 'numpy.ndarray'>
[110596   1103]
1.0

class: obscene
value_counts: {0: 105789, 1: 5910}
class_wg: [0.52793296 9.45      ] <class 'numpy.ndarray'>
[105789   5910]
1.0

class: threat
value_counts: {0: 111369, 1: 330}
class_wg: [  0.50148156 169.24090909] <class 'numpy.ndarray'>
[111369    330]
1.0

class: insult
value_counts: {0: 106194, 1: 5505}
class_wg: [ 0.52591954 10.14523161] <class 'numpy.ndarray'>
[106194   5505]
1.0

class: identity_hate
value_counts: {0: 110687, 1: 1012}
class_wg: [ 0.50457145 55.18725296] <class 'numpy.ndarray'>
[110687   1012]
1.0



In [19]:
# x_train = vectorizer.transform(train_df.comment_text)
# x_val = vectorizer.transform(val_df.comment_text)
# x_test = vectorizer.transform(test_df.comment_text)
# y_train = train_df[comments_classes].to_numpy()
# y_val = val_df[comments_classes].to_numpy()

# x_train.shape, y_train.shape, x_val.shape

In [20]:
# joblib.dump(x_train, f'x_train_{max_features}.joblib')
# joblib.dump(x_val, f'x_val_{max_features}.joblib')
# joblib.dump(x_test, f'x_test_{max_features}.joblib')
# joblib.dump(y_train, f'y_train_{max_features}.joblib')
# joblib.dump(y_val, f'y_val_{max_features}.joblib')

In [21]:
x_train = joblib.load(f'x_train_{max_features}.joblib')
x_val = joblib.load(f'x_val_{max_features}.joblib')
x_test = joblib.load(f'x_test_{max_features}.joblib')
y_train = joblib.load(f'y_train_{max_features}.joblib')
y_val = joblib.load(f'y_val_{max_features}.joblib')

x_train.shape, y_train.shape, x_val.shape

((111699, 5000), (111699, 6), (47872, 5000))

In [22]:
np.random.seed(42)
n_samples = 50000
sample_idx = np.random.randint(0,x_train.shape[0], n_samples)
sample_idx

array([ 15795,    860, 103694, ...,  22623,  91471,    148])

In [23]:
x_train_subset = x_train[sample_idx,:]
y_train_subset = y_train[sample_idx,:]
x_train_subset.shape, y_train_subset.shape

((50000, 5000), (50000, 6))

In [41]:
def objective(trial):
    param_dict = {
     'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
     'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
     'max_depth': trial.suggest_int('max_depth', 10, 50),
     'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
     'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
     'bootstrap': trial.suggest_categorical('bootstrap', [True, False])   
    }
    
    base_est = RandomForestClassifier(**param_dict, n_jobs=-1)
    # chain = ClassifierChain(base_estimator=base_est,
    #                         cv=StratifiedKFold(n_splits=5, shuffle=True), 
    #                         random_state=42, 
    #  			              verbose=True)
    MOclf = MultiOutputClassifier(base_est, n_jobs=-1)
    score = cross_val_score(MOclf, 
                            x_train_subset, y_train_subset, 
                            cv=5, n_jobs=-1, 
                            scoring='roc_auc_ovo_weighted').mean()
    
    return score

oc_auc_ovo', 'r2', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'f1_macro', 'd2_absolute_error_score', 'adjusted_mutual_info_score', 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'neg_mean_squared_error', 'precision_micro', 'neg_mean_poisson_deviance', 'recall_macro', 'fowlkes_mallows_score', 'normalized_mutual_info_score', 'neg_mean_absolute_percentage_error', 'max_error', 'precision_samples', 'f1_weighted', 'rand_score', 'recall_samples', 'jaccard_samples', 'explained_variance', 'top_k_accuracy', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'v_measure_score', 'matthews_corrcoef', 'neg_mean_squared_log_error', 'precision', 'neg_mean_absolute_error', 'recall_weighted', 'neg_log_loss', 'homogeneity_score', 'accuracy', 'jaccard_weighted', 'neg_root_mean_squared_log_error', 'average_precision', 'jaccard', 'f1_samples', 'precision_weighted', 'neg_mean_gamma_deviance', 'roc_auc', 'jaccard_micro', 'balanced_accuracy', 'neg_brier_score', 'roc_auc_ovr', 'f1_micro', 'completeness_score', 'recall_micro'}, a callable or None. Got 'roc_auc_score' instead.

In [42]:
STUDY_NAME = f'RFC_MO_subset{n_samples}'
STUDY_NAME

'RFC_MO_subset50000'

In [43]:
%%time
!rm -rf db_RFC_MO_subset50000.sqlite3


study = optuna.create_study(storage=f"sqlite:///db_{STUDY_NAME}.sqlite3",
                            sampler=optuna.samplers.RandomSampler(seed=42),
                            pruner=None,
                            study_name=STUDY_NAME,
                            load_if_exists=False,
                            direction='maximize')
study.optimize(objective, 
               n_trials=50,
               timeout=None,
               n_jobs = -1,
               show_progress_bar = True)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-07-08 01:16:15,698] A new study created in RDB with name: RFC_MO_subset50000


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-07-08 01:19:09,680] Trial 17 finished with value: 0.9519240729841438 and parameters: {'n_estimators': 462, 'criterion': 'log_loss', 'max_depth': 11, 'min_samples_split': 26, 'min_samples_leaf': 31, 'bootstrap': True}. Best is trial 17 with value: 0.9519240729841438.
[I 2024-07-08 01:20:18,815] Trial 26 finished with value: 0.9628209325333055 and parameters: {'n_estimators': 159, 'criterion': 'gini', 'max_depth': 49, 'min_samples_split': 14, 'min_samples_leaf': 31, 'bootstrap': False}. Best is trial 26 with value: 0.9628209325333055.
[I 2024-07-08 01:21:02,798] Trial 18 finished with value: 0.9603242777831446 and parameters: {'n_estimators': 238, 'criterion': 'log_loss', 'max_depth': 39, 'min_samples_split': 23, 'min_samples_leaf': 31, 'bootstrap': False}. Best is trial 26 with value: 0.9628209325333055.
[I 2024-07-08 01:22:04,541] Trial 6 finished with value: 0.9566958897819928 and parameters: {'n_estimators': 785, 'criterion': 'entropy', 'max_depth': 14, 'min_samples_split': 2



[I 2024-07-08 01:35:05,745] Trial 16 finished with value: 0.96984934535855 and parameters: {'n_estimators': 837, 'criterion': 'log_loss', 'max_depth': 33, 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 16 with value: 0.96984934535855.
[I 2024-07-08 01:35:14,719] Trial 4 finished with value: 0.9558986685807935 and parameters: {'n_estimators': 830, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 25, 'min_samples_leaf': 28, 'bootstrap': True}. Best is trial 16 with value: 0.96984934535855.
[I 2024-07-08 01:35:35,940] Trial 35 finished with value: 0.9575626774633594 and parameters: {'n_estimators': 852, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 21, 'min_samples_leaf': 27, 'bootstrap': False}. Best is trial 16 with value: 0.96984934535855.
[I 2024-07-08 01:35:42,937] Trial 9 finished with value: 0.9704119596233092 and parameters: {'n_estimators': 977, 'criterion': 'log_loss', 'max_depth': 50, 'min_samples_split': 8, 'min_samples

In [44]:
# loaded_study = optuna.load_study(study_name=STUDY_NAME, storage=f'sqlite:///db_{STUDY_NAME}.sqlite3')

In [45]:
# loaded_study.best_params

In [46]:
best_params = study.best_params
best_params

{'n_estimators': 644,
 'criterion': 'entropy',
 'max_depth': 45,
 'min_samples_split': 9,
 'min_samples_leaf': 2,
 'bootstrap': False}

In [47]:
print("[3] Training Chassifier Chain MLP...")

MODEL_NAME = f"RF_MO_subset{n_samples}_OPT_{max_features}"
SUB_CSV_NAME = f"sub_{MODEL_NAME}.csv"
SUB_CSV_MSG = "Optuna"

cmd1 = f"head {SUB_CSV_NAME}"
cmd2 = f'kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f {SUB_CSV_NAME} -m "{SUB_CSV_MSG}"'
print(SUB_CSV_NAME)
print(cmd1)
print(cmd2)

[3] Training Chassifier Chain MLP...
sub_RF_MO_subset50000_OPT_5000.csv
head sub_RF_MO_subset50000_OPT_5000.csv
kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_RF_MO_subset50000_OPT_5000.csv -m "Optuna"


In [48]:
%%time
base_est = RandomForestClassifier(**best_params, n_jobs=-1)
MOclf = MultiOutputClassifier(base_est, n_jobs=-1).fit(x_train, y_train)
# chain = ClassifierChain(base_estimator=base_est,
#                         cv=StratifiedKFold(n_splits=5, shuffle=True), 
#                         random_state=42, 
#                         verbose=True).fit(x_train, y_train)

CPU times: user 1.02 s, sys: 1.64 s, total: 2.67 s
Wall time: 1min 11s


In [49]:
print("[4] Evaluating Model...")

train_pred = MOclf.predict(x_train)
val_pred = MOclf.predict(x_val)

print("Train ROC:", roc_auc_score(y_train,train_pred))
print("VAl ROC:", roc_auc_score(y_val,val_pred))
print("Train Acc:", accuracy_score(y_train, train_pred))
print("Val Acc:", accuracy_score(y_val, val_pred))

print(classification_report(y_train, train_pred))
print(classification_report(y_val, val_pred))

[4] Evaluating Model...




Train ROC: 0.6273487588089334
VAl ROC: 0.5870752593506543
Train Acc: 0.9184415258865344
Val Acc: 0.9072735628342246
              precision    recall  f1-score   support

           0       1.00      0.43      0.60     10705
           1       0.99      0.09      0.17      1103
           2       0.99      0.52      0.68      5910
           3       1.00      0.02      0.04       330
           4       0.97      0.39      0.55      5505
           5       0.98      0.08      0.15      1012

   micro avg       0.99      0.41      0.58     24565
   macro avg       0.99      0.25      0.36     24565
weighted avg       0.99      0.41      0.57     24565
 samples avg       0.04      0.03      0.04     24565

              precision    recall  f1-score   support

           0       0.98      0.35      0.52      4589
           1       0.67      0.01      0.02       492
           2       0.95      0.40      0.57      2539
           3       0.00      0.00      0.00       148
           4    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
sample_sub_df.loc[:, comments_classes] = np.array(MOclf.predict_proba(x_test))[:,:,1].T

In [51]:
# sample_sub_df.loc[:, comments_classes] = chain.predict_proba(x_test)

In [52]:
sample_sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.865012,0.265644,0.779454,0.053189,0.661186,0.155013
1,0000247867823ef7,0.078005,0.005845,0.035568,0.001193,0.036261,0.005805
2,00013b17ad220c46,0.064079,0.004110,0.027560,0.000707,0.026273,0.003550
3,00017563c3f7919a,0.037377,0.001953,0.016055,0.000566,0.017824,0.002787
4,00017695ad8997eb,0.057845,0.002990,0.023135,0.000558,0.022983,0.002491
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.083306,0.005087,0.033177,0.001079,0.034459,0.005439
153160,fffd7a9a6eb32c16,0.112989,0.005985,0.042201,0.008761,0.045240,0.008916
153161,fffda9e8d6fafa9e,0.046867,0.002818,0.020173,0.000760,0.015368,0.002627
153162,fffe8f1340a79fc2,0.064738,0.002803,0.026027,0.000981,0.024739,0.008987


In [53]:
sample_sub_df.to_csv(SUB_CSV_NAME, index=False)
subprocess.run(cmd1, shell=True)

id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.8650122147361102,0.2656443311190198,0.7794538691842526,0.053189037727743145,0.6611862266599641,0.15501332977078386
0000247867823ef7,0.07800464136831232,0.0058447637408382686,0.03556810189070701,0.001193278053934944,0.03626111608847509,0.005804692406949774
00013b17ad220c46,0.0640790710502305,0.0041100784541772345,0.027560486179634303,0.0007071473101238555,0.026272650447477078,0.0035503779382297846
00017563c3f7919a,0.03737678847439773,0.0019534143288880883,0.01605495601339796,0.0005655969416818128,0.017824028159291183,0.0027866913060588753
00017695ad8997eb,0.05784539325746123,0.00298971319623878,0.023134574636193495,0.0005575570223554812,0.022982984679109825,0.0024912148408489155
0001ea8717f6de06,0.03231836235379318,0.001118406461621669,0.013164781277338172,0.0004882544097959356,0.014677693675664594,0.0024431578604193135
00024115d4cbde0f,0.011315858256109278,0.0005009334650275258,0.005852557121821651,0.000340160

CompletedProcess(args='head sub_RF_MO_subset50000_OPT_5000.csv', returncode=0)

In [54]:
!iitb status

seem to be logged in


In [55]:
subprocess.run(cmd2, shell=True)

100%|██████████| 20.7M/20.7M [00:14<00:00, 1.52MB/s]  


Successfully submitted to Toxic Comment Classification Challenge

CompletedProcess(args='kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f sub_RF_MO_subset50000_OPT_5000.csv -m "Optuna"', returncode=0)