### Prepare Library Installation

In [None]:
! pip install --user scipy nltk imbalanced_learn

### Download Dataset from Kaggle

In [None]:
! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification
! unzip jigsaw-unintended-bias-in-toxicity-classification.zip

In [None]:
import numpy as np
import pandas as pd
import json, nltk

In [2]:
selected_columns = ['id','comment_text','target']
total_data = pd.read_csv("./train.csv",usecols=selected_columns)

In [3]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1804874 entries, 0 to 1804873
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   id            int64  
 1   target        float64
 2   comment_text  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 41.3+ MB


In [4]:
selected_columns = ['id','comment_text','toxicity']
final_test_data = pd.read_csv("./test_public_expanded.csv",usecols=selected_columns)
final_test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97320 entries, 0 to 97319
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            97320 non-null  int64  
 1   comment_text  97320 non-null  object 
 2   toxicity      97320 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


## Text Preprocessing

##### Import [Contractions](https://stackoverflow.com/a/19794953/8141330) for Preprocessing

In [5]:
with open('./contractions.json', 'r') as f:
    contractions_dict = json.load(f)
contractions = contractions_dict['contractions']

##### Define a function which handles emoji classifications

In [6]:
def emoji(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :') , :O
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O)', ' positiveemoji ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' positiveemoji ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' positiveemoji ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-; , @-)
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;|@-\))', ' positiveemoji ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:, :-/ , :-|
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negativeemoji ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' negativeemoji ', tweet)
    return tweet

##### Define a function which handles all preprocessing

In [7]:
import re


def process_tweet(tweet):
    tweet = str(tweet).lower()  # Lowercases the string
    tweet = re.sub("@[^\s]+", "", tweet)  # Removes usernames
    tweet = re.sub("((www\.[^\s]+)|(https?://[^\s]+))", " ", tweet)  # Remove URLs
    tweet = re.sub(r"\d+", " ", str(tweet))  # Removes all digits

    for word in tweet.split():
        if word.lower() in contractions:
            tweet = tweet.replace(
                word, contractions[word.lower()]
            )  # Replaces contractions

    tweet = re.sub("&quot;", " ", tweet)  # Remove (&quot;)
    tweet = emoji(tweet)  # Replaces Emojis
    tweet = re.sub(r"\b[a-zA-Z]\b", "", str(tweet))  # Removes all single characters

    tweet = re.sub(r"[^\w\s]", " ", str(tweet))  # Removes all punctuations
    tweet = re.sub(
        r"(.)\1+", r"\1\1", tweet
    )  # Convert more than 2 letter repetitions to 2 letter
    tweet = re.sub(r"\s+", " ", str(tweet))  # Replaces double spaces with single space
    return tweet

### Preprocess text using process_tweet function

In [8]:
total_data['processed_text'] = np.vectorize(process_tweet)(total_data["comment_text"])

In [9]:
final_test_data['processed_text'] = np.vectorize(process_tweet)(final_test_data["comment_text"])

### Load Stop Words

In [10]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

english_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = CountVectorizer(ngram_range=(1,1))
vectorizer = CountVectorizer(stop_words=english_stop_words)
# vectorizer = TfidfVectorizer(use_idf=True,ngram_range=(1,1), max_features=20000, stop_words=english_stop_words)
vectorizer.fit(total_data['processed_text'])

processed_text_vector = vectorizer.transform(total_data['processed_text'])

In [11]:
processed_text_vector

<1804874x261309 sparse matrix of type '<class 'numpy.int64'>'
	with 42463570 stored elements in Compressed Sparse Row format>

### Save vectorizer for future use as picke file

In [57]:
import pickle

with open('vectorizer_count_no_stop_words.pkl', 'wb') as fout:
    pickle.dump(vectorizer, fout)

### Load vectorizer from pickle file

In [13]:
import pickle

with open('vectorizer_count_no_stop_words.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [14]:
processed_text_vector = vectorizer.transform(total_data['processed_text'])

In [15]:
processed_text_final_vector = vectorizer.transform(final_test_data['processed_text'])

### Transform target label to binary label, 1=toxic, 0=non-toxic

In [16]:
# Transform target label to binary label, 1=toxic, 0=non-toxic
total_data["label"] = np.where(total_data["target"] > 0.5, 1, 0)

In [17]:
# Transform target label to binary label, 1=toxic, 0=non-toxic
final_test_data["label"] = np.where(final_test_data["toxicity"] > 0.5, 1, 0)

### Split dataset for training and testing

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(processed_text_vector, total_data["label"],
                                                    test_size=0.2, random_state=69)

#### Set dataset for final testing

In [19]:
X_test_final = processed_text_final_vector
y_test_final = final_test_data["label"]

## Split Dataset Using Random Undersample and Oversample Strategy

In [20]:
from imblearn.over_sampling import RandomOverSampler

random_oversample = RandomOverSampler(random_state=69,sampling_strategy='minority')
X_train_oversample, y_train_oversample = random_oversample.fit_resample(X_train, y_train)

In [21]:
from imblearn.under_sampling import RandomUnderSampler
random_undersample = RandomUnderSampler(random_state=69,sampling_strategy='majority')
X_train_undersample, y_train_undersample = random_undersample.fit_resample(X_train, y_train)

## Training Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
import datetime

start = datetime.datetime.now()
model_lr = LogisticRegression(C=1.0, random_state=69, solver='sag', max_iter=1000, n_jobs=-1).fit(X_train, y_train)
predicted_lr = model_lr.predict(X_test)

print("training time",datetime.datetime.now() - start)

### Save model as pickle file

In [None]:
import pickle

with open('model_lr_partial.pkl', 'wb') as fout:
    pickle.dump(model_lr, fout)

### Load model from pickle file

In [22]:
import pickle

with open('model_lr_partial.pkl', 'rb') as f:
    model_lr = pickle.load(f)

In [23]:
predicted_lr = model_lr.predict(X_test)

In [24]:
predicted_lr_final = model_lr.predict(X_test_final)

### Performance Report - Logistic Regression

In [25]:
from sklearn.metrics import classification_report

print("report lr base")
print(classification_report(y_test, predicted_lr))


from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_lr))

report lr base
              precision    recall  f1-score   support

           0       0.97      0.99      0.98    339536
           1       0.76      0.45      0.56     21439

    accuracy                           0.96    360975
   macro avg       0.86      0.72      0.77    360975
weighted avg       0.95      0.96      0.95    360975

balanced_accuracy_score 0.7194135882344745


In [26]:
from sklearn.metrics import classification_report

print("report lr base on final test set")
print(classification_report(y_test_final, predicted_lr_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_lr_final))

report lr base on final test set
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     91671
           1       0.75      0.46      0.57      5649

    accuracy                           0.96     97320
   macro avg       0.86      0.72      0.77     97320
weighted avg       0.96      0.96      0.96     97320

balanced_accuracy_score 0.7241368992475128


## Training Logistic Regression using Oversampled Data

In [None]:
from sklearn.linear_model import LogisticRegression
import datetime

start = datetime.datetime.now()
model_lr_oversample = LogisticRegression(C=1.0, random_state=69, solver='sag', max_iter=1000, n_jobs=-1).fit(X_train_oversample, y_train_oversample)
predicted_lr_oversample = model_lr_oversample.predict(X_test)

print("training time",datetime.datetime.now() - start)

### Save model as pickle file

In [18]:
import pickle

with open('model_lr_partial_oversample.pkl', 'wb') as fout:
    pickle.dump(model_lr_oversample, fout)

### Load model from pickle file

In [27]:
import pickle

with open('model_lr_partial_oversample.pkl', 'rb') as f:
    model_lr_oversample = pickle.load(f)

In [28]:
predicted_lr_oversample = model_lr_oversample.predict(X_test)

In [29]:
predicted_lr_oversample_final = model_lr_oversample.predict(X_test_final)

### Performance Report - Logistic Regression (Oversampled)

In [30]:
from sklearn.metrics import classification_report

print("report lr oversample")
print(classification_report(y_test, predicted_lr_oversample))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_lr_oversample))

report lr oversample
              precision    recall  f1-score   support

           0       0.99      0.93      0.96    339536
           1       0.42      0.83      0.56     21439

    accuracy                           0.92    360975
   macro avg       0.70      0.88      0.76    360975
weighted avg       0.95      0.92      0.93    360975

balanced_accuracy_score 0.880793107815532


In [31]:
from sklearn.metrics import classification_report

print("report lr overersample on final test set")
print(classification_report(y_test_final, predicted_lr_oversample_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_lr_oversample_final))

report lr overersample on final test set
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     91671
           1       0.42      0.84      0.56      5649

    accuracy                           0.92     97320
   macro avg       0.70      0.89      0.76     97320
weighted avg       0.96      0.92      0.93     97320

balanced_accuracy_score 0.8851825792799533


## Training Logistic Regression using Undersampled Data

In [27]:
from sklearn.linear_model import LogisticRegression
import datetime

start = datetime.datetime.now()
model_lr_undersample = LogisticRegression(C=1.0, random_state=69, solver='sag', max_iter=1000, n_jobs=-1).fit(X_train_undersample, y_train_undersample)
predicted_lr_undersample = model_lr_undersample.predict(X_test)

print("training time",datetime.datetime.now() - start)

### Save model as pickle file

In [15]:
import pickle

with open('model_lr_partial_undersample.pkl', 'wb') as fout:
    pickle.dump(model_lr_undersample, fout)

### Load model from pickle file

In [32]:
import pickle

with open('model_lr_partial_undersample.pkl', 'rb') as f:
    model_lr_undersample = pickle.load(f)

In [33]:
predicted_lr_undersample = model_lr_undersample.predict(X_test)

In [34]:
predicted_lr_undersample_final = model_lr_undersample.predict(X_test_final)

### Performance Report - Logistic Regression (Undersampled)

In [35]:
from sklearn.metrics import classification_report

print("report lr undersample")
print(classification_report(y_test, predicted_lr_undersample))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_lr_undersample))

report lr undersample
              precision    recall  f1-score   support

           0       0.99      0.92      0.95    339536
           1       0.39      0.84      0.53     21439

    accuracy                           0.91    360975
   macro avg       0.69      0.88      0.74    360975
weighted avg       0.95      0.91      0.93    360975

balanced_accuracy_score 0.877678680785448


In [36]:
from sklearn.metrics import classification_report

print("report lr undersample on final test set")
print(classification_report(y_test_final, predicted_lr_undersample_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_lr_undersample_final))

report lr undersample on final test set
              precision    recall  f1-score   support

           0       0.99      0.92      0.95     91671
           1       0.39      0.84      0.53      5649

    accuracy                           0.91     97320
   macro avg       0.69      0.88      0.74     97320
weighted avg       0.95      0.91      0.93     97320

balanced_accuracy_score 0.879569566970637


## Training Naive Bayes

In [37]:
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier

model_naive = MultinomialNB().fit(X_train, y_train)
predicted_naive = model_naive.predict(X_test)

### Save model as pickle file

In [38]:
import pickle

with open('model_nb_partial.pkl', 'wb') as fout:
    pickle.dump(model_naive, fout)

### Load model from pickle file

In [39]:
import pickle

with open('model_nb_partial.pkl', 'rb') as f:
    model_naive = pickle.load(f)

In [40]:
predicted_naive = model_naive.predict(X_test)

In [41]:
predicted_naive_final = model_naive.predict(X_test_final)

### Performance Report - Naive Bayes

In [42]:
from sklearn.metrics import classification_report

print("report nb base")
print(classification_report(y_test, predicted_naive))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_naive))

report nb base
              precision    recall  f1-score   support

           0       0.97      0.96      0.96    339536
           1       0.42      0.50      0.46     21439

    accuracy                           0.93    360975
   macro avg       0.70      0.73      0.71    360975
weighted avg       0.94      0.93      0.93    360975

balanced_accuracy_score 0.7268720343393581


In [43]:
from sklearn.metrics import classification_report

print("report nb base on final test set")
print(classification_report(y_test_final, predicted_naive_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_naive_final))

report nb base on final test set
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     91671
           1       0.47      0.48      0.48      5649

    accuracy                           0.94     97320
   macro avg       0.72      0.72      0.72     97320
weighted avg       0.94      0.94      0.94     97320

balanced_accuracy_score 0.7228788097322775


## Training Naive Bayes (Oversampled)

In [44]:
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier

model_naive_oversample = MultinomialNB().fit(X_train_oversample, y_train_oversample)
predicted_naive_oversample = model_naive_oversample.predict(X_test)

### Save model as pickle file

In [45]:
import pickle

with open('model_nb_partial_oversample.pkl', 'wb') as fout:
    pickle.dump(model_naive_oversample, fout)

### Load model from pickle file

In [46]:
import pickle

with open('model_nb_partial_oversample.pkl', 'rb') as f:
    model_naive_oversample = pickle.load(f)

In [47]:
predicted_naive_oversample = model_naive_oversample.predict(X_test)

In [48]:
predicted_naive_oversample_final = model_naive_oversample.predict(X_test_final)

### Performance Report - Naive Bayes (Oversample)

In [49]:
from sklearn.metrics import classification_report

print("report nb random oversample")
print(classification_report(y_test, predicted_naive_oversample))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_naive_oversample))

report nb random oversample
              precision    recall  f1-score   support

           0       0.99      0.73      0.84    339536
           1       0.16      0.86      0.28     21439

    accuracy                           0.73    360975
   macro avg       0.58      0.79      0.56    360975
weighted avg       0.94      0.73      0.80    360975

balanced_accuracy_score 0.7918526157110459


In [50]:
from sklearn.metrics import classification_report

print("report nb oversample on final test set")
print(classification_report(y_test_final, predicted_naive_oversample_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_naive_oversample_final))

report nb oversample on final test set
              precision    recall  f1-score   support

           0       0.99      0.73      0.84     91671
           1       0.17      0.86      0.28      5649

    accuracy                           0.74     97320
   macro avg       0.58      0.80      0.56     97320
weighted avg       0.94      0.74      0.81     97320

balanced_accuracy_score 0.7966972454943804


## Training Naive Bayes (Undersampled)

In [51]:
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier

model_naive_undersample = MultinomialNB().fit(X_train_undersample, y_train_undersample)
predicted_naive_undersample = model_naive_undersample.predict(X_test)

### Save model as pickle file

In [52]:
import pickle

with open('model_nb_partial_undersample.pkl', 'wb') as fout:
    pickle.dump(model_naive_undersample, fout)

### Load model from pickle file

In [53]:
import pickle

with open('model_nb_partial_undersample.pkl', 'rb') as f:
    model_naive_undersample = pickle.load(f)

In [54]:
predicted_naive_undersample = model_naive_undersample.predict(X_test)

In [55]:
predicted_naive_undersample_final = model_naive_undersample.predict(X_test_final)

### Performance Report - Naive Bayes (Undersample)

In [56]:
from sklearn.metrics import classification_report

print("report nb random undersample")
print(classification_report(y_test, predicted_naive_undersample))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_naive_undersample))

report nb random undersample
              precision    recall  f1-score   support

           0       0.99      0.71      0.83    339536
           1       0.16      0.88      0.27     21439

    accuracy                           0.72    360975
   macro avg       0.58      0.80      0.55    360975
weighted avg       0.94      0.72      0.80    360975

balanced_accuracy_score 0.7950775960415505


In [57]:
from sklearn.metrics import classification_report

print("report nb undersample on final test set")
print(classification_report(y_test_final, predicted_naive_undersample_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_naive_undersample_final))

report nb undersample on final test set
              precision    recall  f1-score   support

           0       0.99      0.72      0.83     91671
           1       0.16      0.88      0.27      5649

    accuracy                           0.73     97320
   macro avg       0.58      0.80      0.55     97320
weighted avg       0.94      0.73      0.80     97320

balanced_accuracy_score 0.7998241280455165


## Training Balanced Random Forest

In [22]:
from imblearn.ensemble import BalancedRandomForestClassifier
import datetime

start = datetime.datetime.now()

model_brf = BalancedRandomForestClassifier(
    n_estimators=100, random_state=69, sampling_strategy="all", replacement=True,
    bootstrap=False, n_jobs=-1
)
model_brf.fit(X_train, y_train)

print("training time",datetime.datetime.now() - start)

predicted_brf = model_brf.predict(X_test)

training time 0:15:00.074344


### Save model as pickle file

In [23]:
import pickle

with open('model_brf_partial.pkl', 'wb') as fout:
    pickle.dump(model_brf, fout)

### Load model from pickle file

In [58]:
import pickle

with open('model_brf_partial.pkl', 'rb') as f:
    model_brf = pickle.load(f)

In [59]:
predicted_brf = model_brf.predict(X_test)

In [60]:
predicted_brf_final = model_brf.predict(X_test_final)

### Performance Report - Balanced Random Forest

In [61]:
from sklearn.metrics import classification_report

print("report brf")
print(classification_report(y_test, predicted_brf))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_brf))

report brf
              precision    recall  f1-score   support

           0       0.98      0.94      0.96    339536
           1       0.43      0.73      0.54     21439

    accuracy                           0.93    360975
   macro avg       0.71      0.84      0.75    360975
weighted avg       0.95      0.93      0.93    360975

balanced_accuracy_score 0.8355757192417335


In [62]:
from sklearn.metrics import classification_report

print("report brf on final test set")
print(classification_report(y_test_final, predicted_brf_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_brf_final))

report brf on final test set
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     91671
           1       0.43      0.73      0.54      5649

    accuracy                           0.93     97320
   macro avg       0.71      0.84      0.75     97320
weighted avg       0.95      0.93      0.94     97320

balanced_accuracy_score 0.8369791726680487


## Training Balanced Random Forest (Undersample)

In [20]:
from imblearn.ensemble import BalancedRandomForestClassifier
import datetime

start = datetime.datetime.now()

model_brf_undersample = BalancedRandomForestClassifier(
    n_estimators=100, random_state=69, sampling_strategy="all", replacement=True,
    bootstrap=False, n_jobs=-1
)
model_brf_undersample.fit(X_train_undersample, y_train_undersample)

print("training time",datetime.datetime.now() - start)

predicted_brf_undersample = model_brf_undersample.predict(X_test)

training time 0:13:42.828406


### Save model as pickle file

In [21]:
import pickle

with open('model_brf_partial_undersample.pkl', 'wb') as fout:
    pickle.dump(model_brf_undersample, fout)

### Load model from pickle file

In [50]:
import pickle

with open('model_brf_partial_undersample.pkl', 'rb') as f:
    model_brf_undersample = pickle.load(f)

In [52]:
predicted_brf_undersample = model_brf_undersample.predict(X_test)

### Performance Report - Balanced Random Forest (Undersample)

In [53]:
from sklearn.metrics import classification_report

print("report brf random undersample")
print(classification_report(y_test, predicted_brf_undersample))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_brf_undersample))

report brf random undersample
              precision    recall  f1-score   support

           0       0.99      0.88      0.93    339536
           1       0.30      0.84      0.45     21439

    accuracy                           0.88    360975
   macro avg       0.65      0.86      0.69    360975
weighted avg       0.95      0.88      0.90    360975

balanced_accuracy_score 0.8602299095010775


## Training Balanced Bagging Classifier (Naive Bayes)

In [60]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.naive_bayes import MultinomialNB
model_bbc_nb = BalancedBaggingClassifier(MultinomialNB(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=69)

model_bbc_nb.fit(X_train, y_train)

print("training time",datetime.datetime.now() - start)

predicted_bbc_nb = model_bbc_nb.predict(X_test)

training time 0:38:06.360371


### Save model as pickle file

In [61]:
import pickle

with open('model_bbc_nb_partial.pkl', 'wb') as fout:
    pickle.dump(model_bbc_nb, fout)

### Load model from pickle file

In [63]:
import pickle

with open('model_bbc_nb_partial.pkl', 'rb') as f:
    model_bbc_nb = pickle.load(f)

In [64]:
predicted_bbc_nb = model_bbc_nb.predict(X_test)

In [65]:
predicted_bbc_nb_final = model_bbc_nb.predict(X_test_final)

### Performance Report - Balanced Bagging Classifier (Naive Bayes)

In [66]:
from sklearn.metrics import classification_report

print("report bbc_nb")
print(classification_report(y_test, predicted_bbc_nb))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_bbc_nb))

report bbc_nb
              precision    recall  f1-score   support

           0       0.99      0.73      0.84    339536
           1       0.17      0.87      0.28     21439

    accuracy                           0.73    360975
   macro avg       0.58      0.80      0.56    360975
weighted avg       0.94      0.73      0.80    360975

balanced_accuracy_score 0.798936298461086


In [67]:
from sklearn.metrics import classification_report

print("report bbc_nb on final test set")
print(classification_report(y_test_final, predicted_bbc_nb_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_bbc_nb_final))

report bbc_nb on final test set
              precision    recall  f1-score   support

           0       0.99      0.73      0.84     91671
           1       0.17      0.88      0.28      5649

    accuracy                           0.74     97320
   macro avg       0.58      0.80      0.56     97320
weighted avg       0.94      0.74      0.81     97320

balanced_accuracy_score 0.8037337515598814


## Training Random Undersample Boosted Classifier

In [68]:
from imblearn.ensemble import RUSBoostClassifier
model_rusboost = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R',
                              random_state=69)
model_rusboost.fit(X_train, y_train)

print("training time",datetime.datetime.now() - start)

predicted_rusboost = model_rusboost.predict(X_test)

training time 1:35:31.993989


### Save model as pickle file

In [69]:
import pickle

with open('model_rusboost_partial.pkl', 'wb') as fout:
    pickle.dump(model_rusboost, fout)

### Load model from pickle file

In [68]:
import pickle

with open('model_rusboost_partial.pkl', 'rb') as f:
    model_rusboost = pickle.load(f)

In [69]:
predicted_rusboost = model_rusboost.predict(X_test)

In [70]:
predicted_rusboost_final = model_rusboost.predict(X_test_final)

### Performance Report - Random Undersample Boosted Classifier

In [71]:
from sklearn.metrics import classification_report

print("report rusboost")
print(classification_report(y_test, predicted_rusboost))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_rusboost))

report rusboost
              precision    recall  f1-score   support

           0       0.98      0.91      0.95    339536
           1       0.36      0.76      0.48     21439

    accuracy                           0.90    360975
   macro avg       0.67      0.84      0.72    360975
weighted avg       0.95      0.90      0.92    360975

balanced_accuracy_score 0.8361425444207731


In [72]:
from sklearn.metrics import classification_report

print("report rusboost on final test set")
print(classification_report(y_test_final, predicted_rusboost_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_rusboost_final))

report rusboost on final test set
              precision    recall  f1-score   support

           0       0.98      0.92      0.95     91671
           1       0.36      0.77      0.49      5649

    accuracy                           0.91     97320
   macro avg       0.67      0.84      0.72     97320
weighted avg       0.95      0.91      0.92     97320

balanced_accuracy_score 0.8425753876253297


## Training Voting Classifier

In [73]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.preprocessing import LabelEncoder

model_voting_classifier = VotingClassifier(estimators=[
    # ('model_naive', model_naive), 
    # ('model_naive_oversample', model_naive_oversample), 
    # ('model_naive_undersample', model_naive_undersample),
    ('model_lr', model_lr),
    ('model_lr_oversample', model_lr_oversample),
    ('model_lr_undersample', model_lr_undersample),
], voting='soft')
model_voting_classifier.estimators_ = [
    # model_naive, 
    # model_naive_oversample, 
    # model_naive_undersample, 
    model_lr,
    model_lr_oversample,
    model_lr_undersample,
]

model_voting_classifier.le_ = LabelEncoder().fit(y_train)
model_voting_classifier.classes_ = model_voting_classifier.le_.classes_

predicted_voting = model_voting_classifier.predict(X_test)

### Save model as pickle file

In [74]:
import pickle

with open('model_voting_partial_best.pkl', 'wb') as fout:
    pickle.dump(model_voting_classifier, fout)

### Load model from pickle file

In [75]:
import pickle

with open('model_voting_partial_best.pkl', 'rb') as f:
    model_voting_classifier = pickle.load(f)

In [76]:
predicted_voting = model_voting_classifier.predict(X_test)

In [77]:
predicted_voting_final = model_voting_classifier.predict(X_test_final)

### Performance Report - Voting Classifier (NB, NB Oversample, NB Undersample, LR, LR Oversample, LR Undersample)

In [117]:
from sklearn.metrics import classification_report

print("report voting")
print(classification_report(y_test, predicted_voting))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_voting))

report voting
              precision    recall  f1-score   support

           0       0.99      0.93      0.96    339536
           1       0.41      0.79      0.54     21439

    accuracy                           0.92    360975
   macro avg       0.70      0.86      0.75    360975
weighted avg       0.95      0.92      0.93    360975

balanced_accuracy_score 0.8579705335719856


In [118]:
from sklearn.metrics import classification_report

print("report voting on final test set")
print(classification_report(y_test_final, predicted_voting_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_voting_final))

report voting on final test set
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     91671
           1       0.42      0.79      0.55      5649

    accuracy                           0.92     97320
   macro avg       0.70      0.86      0.75     97320
weighted avg       0.95      0.92      0.94     97320

balanced_accuracy_score 0.8634181381497537


### Performance Report - Voting Classifier (LR, LR Oversample, LR Undersample)

In [78]:
from sklearn.metrics import classification_report

print("report voting lr")
print(classification_report(y_test, predicted_voting))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_voting))

report voting lr
              precision    recall  f1-score   support

           0       0.98      0.96      0.97    339536
           1       0.52      0.74      0.61     21439

    accuracy                           0.94    360975
   macro avg       0.75      0.85      0.79    360975
weighted avg       0.96      0.94      0.95    360975

balanced_accuracy_score 0.8480500151927538


In [79]:
from sklearn.metrics import classification_report

print("report voting lr on final test set")
print(classification_report(y_test_final, predicted_voting_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_voting_final))

report voting lr on final test set
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     91671
           1       0.52      0.75      0.61      5649

    accuracy                           0.95     97320
   macro avg       0.75      0.85      0.79     97320
weighted avg       0.96      0.95      0.95     97320

balanced_accuracy_score 0.8532860327547032


## Evaluation - Detoxify (unbiased-small) model

In [80]:
from detoxify import Detoxify
import torch
import pandas as pd

In [81]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_detoxify = Detoxify("unbiased-small", device=device)

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1.2/unbiased-albert-c8519128.ckpt" to /root/.cache/torch/hub/checkpoints/unbiased-albert-c8519128.ckpt
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44.6M/44.6M [00:01<00:00, 25.8MB/s]


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

### Split Dataset for Detoxify Model Testing

In [82]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(total_data["processed_text"], total_data["label"],
                                                    test_size=0.2, random_state=69)
X_test2_list = X_test2.tolist()

In [83]:
X_test2_final_list = final_test_data['processed_text'].tolist()
y_test2_final = final_test_data['label']

### Predict using Test Set (Without Batch)

In [None]:
detoxify_result_raw = []
print(len(X_test2_list))
for i in range(len(X_test2_list)):
    temp_prediction = model_detoxify.predict(X_test2_list[i])
    df = pd.DataFrame(temp_prediction,index=[0])
    detoxify_result_raw.append(df["toxicity"][0].round(3).astype("float"))
print(len(detoxify_result_raw))

detoxify_result_raw_pd = pd.DataFrame(detoxify_result_raw)
detoxify_result_raw_pd.to_csv('detoxify_result_raw_pd.csv')

### Predict using Final Test Set (Without Batch)

In [None]:
detoxify_result_final_raw = []
print(len(X_test2_list))
for i in range(len(X_test2_list)):
    temp_prediction = model_detoxify.predict(X_test2_list[i])
    df = pd.DataFrame(temp_prediction,index=[0])
    detoxify_result_final_raw.append(df["toxicity"][0].round(3).astype("float"))
print(len(detoxify_result_final_raw))

detoxify_result_final_raw_pd = pd.DataFrame(detoxify_result_final_raw)
detoxify_result_final_raw_pd.to_csv('detoxify_result_final_raw_pd.csv')

In [84]:
detoxify_result_raw_pd = pd.read_csv('detoxify_result_raw_pd.csv')
detoxify_result = np.where(detoxify_result_raw_pd['0'] > 0.5, 1, 0)

### Performance Report - Detoxify (unbiased-small) model

In [85]:
from sklearn.metrics import classification_report

print("report detoxify")
print(classification_report(y_test, detoxify_result))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, detoxify_result))

report detoxify
              precision    recall  f1-score   support

           0       0.99      0.97      0.98    339536
           1       0.63      0.79      0.70     21439

    accuracy                           0.96    360975
   macro avg       0.81      0.88      0.84    360975
weighted avg       0.97      0.96      0.96    360975

balanced_accuracy_score 0.8796666343029867


In [124]:
from sklearn.metrics import classification_report

print("report voting lr")
print(classification_report(y_test, predicted_voting))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test, predicted_voting))

report voting lr
              precision    recall  f1-score   support

           0       0.98      0.96      0.97    339536
           1       0.52      0.74      0.61     21439

    accuracy                           0.94    360975
   macro avg       0.75      0.85      0.79    360975
weighted avg       0.96      0.94      0.95    360975

balanced_accuracy_score 0.8480500151927538


In [86]:
detoxify_result_final_raw_pd = pd.read_csv('detoxify_result_final_raw_pd.csv')
detoxify_result_final = np.where(detoxify_result_final_raw_pd['0'] > 0.5, 1, 0)

In [87]:
from sklearn.metrics import classification_report

print("report detoxify on final test set")
print(classification_report(y_test_final, detoxify_result_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, detoxify_result_final))

report detoxify on final test set
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     91671
           1       0.61      0.77      0.68      5649

    accuracy                           0.96     97320
   macro avg       0.80      0.87      0.83     97320
weighted avg       0.96      0.96      0.96     97320

balanced_accuracy_score 0.8688139637966112


In [125]:
from sklearn.metrics import classification_report

print("report voting lr on final test set")
print(classification_report(y_test_final, predicted_voting_final))

from sklearn.metrics import balanced_accuracy_score
print("balanced_accuracy_score", balanced_accuracy_score(y_test_final, predicted_voting_final))

report voting lr on final test set
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     91671
           1       0.52      0.75      0.61      5649

    accuracy                           0.95     97320
   macro avg       0.75      0.85      0.79     97320
weighted avg       0.96      0.95      0.95     97320

balanced_accuracy_score 0.8532860327547032


## Summary

We have several experiments in toxicity classification using [Kaggle - Jigsaw Unintended Bias in Toxicity Classification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/data) dataset. We have conducted several experiments using traditional Machine Learning model: Logistic Regression, Logistic Regression with Undersampled Data, Logistic Regression with Oversampled Data, Naive Bayes, Naive Bayes with Oversampled Data, Naive Bayes with Undersampled Data, Balanced Random Forest, Bagging Naive Bayes Classifier, RUSBoosted Classifier, and Voting Classifier.

Based on the experiments, best model (Voting Classifier Model using Logistic Regression + Undersample Logistic Regression Variant + Oversample Logistic Regression) can be achieved using traditional machine learning model with comparable performance with state of the art model (Detoxify) result as follows:
- Voting Model achieved **0.95** in accuracy score while Detoxify (unbiased-small) achieve **0.96** in accurace score on final test set data.
- Voting Model achieved **0.853** in balanced_accuracy score while Detoxify (unbiased-small) achieve **0.868** in balanced_accuracy score on final test set data.
- Voting Model achieved **0.79** in macro average F1-score while Detoxify (unbiased-small) achieve **0.83** in macro average F1-score on final test set data.
- Voting Model achieved **0.75** in recall score for toxic class (1) while Detoxify (unbiased-small) achieve **0.77** in balanced_accuracy score on final test set data.
- **Recall score** for toxic class becomes **priority** due to the main focus of toxicity classification.
- Various hyperparameters optimization, other preprocessing methods, and other Machine Learning and/or Deep Learning can be explored in other future experiments.

## References and Related Works

- [Kaggle - Jigsaw Unintended Bias in Toxicity Classification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/data)
- [Github - Detoxify](https://github.com/unitaryai/detoxify)
- [Github (gunjannandy) - Twitter Sentiment Analysis](https://github.com/gunjannandy/twitter-sentiment-analysis)

