In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import scipy

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from collections import Counter

from skmultilearn.model_selection import iterative_train_test_split

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, roc_auc_score, hamming_loss, confusion_matrix, ConfusionMatrixDisplay

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# from sklearnex import patch_sklearn
# patch_sklearn()

In [2]:
# Load the pickled training DataFrame (the file name suggests it was saved
# after the EDA step). pickle.load can execute arbitrary code, so only load
# files produced by this project.
with open('../saved_file/after_eda.pickle', 'rb') as pickle_file:
    df_train = pickle.load(pickle_file)

# Preview the first rows to confirm the expected columns are present.
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,non_toxic,word_count,unique_word_count,upper_case_word_count,stop_word_count,punctuation_count,title_word_count,sentence_count,percent_unique_word_count,percent_upper_case_word_count,percent_punctuation_count,cleaned_comment_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,43,41,2,20,10,11,5,95.348837,4.651163,23.255814,explanation why the edit make under username h...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,17,17,1,3,12,3,4,100.0,5.882353,70.588235,aww match this background colour seemingly sti...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,42,39,0,21,6,2,4,92.857143,0.0,14.285714,hey man really not try edit war just that this...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,113,82,5,58,21,7,6,72.566372,4.424779,18.584071,more cannot make any real suggestions improvem...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,13,13,0,6,5,2,3,100.0,0.0,38.461538,you sir hero any chance you remember what page...


## Function for modeling

In [3]:
# Optional: uncomment to run the grid searches on a 1,000-row sample for a quick smoke test.
# df_train = df_train.sample(1000)

In [4]:
def gs(feature, pipe, pipe_params, filename, scoring="accuracy", cv=3, data=None):
    """Grid-search ``pipe`` for a single label column and pickle the fitted search.

    The text column ``cleaned_comment_text`` is used as the only input. The data
    is split 70/30 (stratified on the label, ``random_state=42``) and only the
    70% training part is seen by the grid search; the 30% hold-out is left
    untouched so it can be regenerated with the same seed for later evaluation.

    Parameters
    ----------
    feature : str
        Name of the label column to predict (e.g. ``"toxic"``).
    pipe : sklearn.pipeline.Pipeline
        Pipeline to tune; it receives raw strings, so it must start with a
        text vectoriser.
    pipe_params : dict
        ``param_grid`` passed to ``GridSearchCV``.
    filename : str
        Stem of the output file; the fitted search is written to
        ``../saved_file/{filename}.pickle``.
    scoring : str, default "accuracy"
        Metric optimised by the search. NOTE(review): the grids in this
        notebook use ``class_weight='balanced'``, which suggests imbalanced
        labels; if so, "f1" or "roc_auc" may be a better choice than the
        default, which is kept for backward compatibility.
    cv : int, default 3
        Number of cross-validation folds.
    data : pandas.DataFrame, optional
        Frame holding ``cleaned_comment_text`` and ``feature``. Defaults to
        the notebook-level ``df_train``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search, so callers can inspect ``best_params_`` /
        ``best_score_`` without unpickling the file.
    """
    frame = df_train if data is None else data

    X = frame["cleaned_comment_text"]
    y = frame[feature]

    # Stratify so the label ratio is the same in the training and hold-out
    # parts. The hold-out part is intentionally unused here.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # A distinct local name avoids shadowing this function inside its own body.
    search = GridSearchCV(
        pipe, param_grid=pipe_params, cv=cv, scoring=scoring, n_jobs=-1
    )
    search.fit(X_train, y_train)

    # Persist the whole search object (best estimator plus cv_results_).
    with open(f"../saved_file/{filename}.pickle", "wb") as f:
        pickle.dump(search, f)

    return search

In [5]:
# Pipelines: a TF-IDF vectoriser feeding each candidate classifier.
#
# scikit-learn validates `stop_words` as 'english', a list, or None, so the
# NLTK stop-word *set* is converted to a list (sorted, for a deterministic
# order) before it is handed to the vectorisers.
tfidf_stop_words = sorted(stop_words)

# liblinear-based estimators use a random number generator while fitting;
# seeding them (same seed as the train/test split in `gs`) keeps reruns
# reproducible.
tfidf_log_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=tfidf_stop_words)),
    ('lg', LogisticRegression(random_state=42))
])

tfidf_svc_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=tfidf_stop_words)),
    ('svc', LinearSVC(random_state=42))
])

# ComplementNB is deterministic, so it takes no seed.
tfidf_nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=tfidf_stop_words)),
    ('nb', ComplementNB())
])

In [6]:
# Hyper-parameter grids, keyed as "<pipeline step>__<parameter name>".

def _tfidf_grid():
    """Return a fresh copy of the TF-IDF search space shared by every pipeline."""
    return {
        'tfidf__max_features': [2000, 3000, 5000],
        'tfidf__min_df': [2, 3],
        'tfidf__max_df': [.9, .95],
        'tfidf__ngram_range': [(1,1), (1,2)],
    }


# Logistic regression: liblinear supports both the l1 and l2 penalties.
tfidf_log_params = {
    **_tfidf_grid(),
    'lg__solver': ['liblinear'],
    'lg__C': [0.01, 0.1, 1, 10, 100],
    'lg__penalty': ['l1', 'l2'],
    'lg__max_iter': [1000,2000],
    'lg__class_weight': ['balanced'],
}

# Linear SVM.
tfidf_svc_params = {
    **_tfidf_grid(),
    'svc__C': [0.01,0.1 ,1, 10, 100],
    'svc__max_iter': [10_000],
    # 'svc__dual':[False],
    'svc__class_weight': ['balanced'],
}

# Complement naive Bayes: only the smoothing strength is tuned.
tfidf_nb_params = {
    **_tfidf_grid(),
    'nb__alpha': [0.01, 0.1, 1, 10, 100],
}

# combine

In [7]:
# Label column -> pickle file stem for the TF-IDF + logistic regression searches.
log_models = {
    label: f'{label}_tfidf_log'
    for label in (
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    )
}

In [8]:
# Label column -> pickle file stem for the TF-IDF + complement naive Bayes searches.
nb_models = {
    label: f'{label}_tfidf_nb'
    for label in (
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    )
}

In [9]:
# Label column -> pickle file stem for the TF-IDF + linear SVM searches.
svc_models = {
    label: f'{label}_tfidf_svc'
    for label in (
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    )
}

In [None]:
# Tune and pickle one linear-SVM grid search per label column.
for label, pickle_stem in svc_models.items():
    gs(
        feature=label,
        pipe=tfidf_svc_pipe,
        pipe_params=tfidf_svc_params,
        filename=pickle_stem,
    )

In [None]:
# Tune and pickle one logistic-regression grid search per label column.
for label, pickle_stem in log_models.items():
    gs(
        feature=label,
        pipe=tfidf_log_pipe,
        pipe_params=tfidf_log_params,
        filename=pickle_stem,
    )

In [None]:
# Tune and pickle one complement-naive-Bayes grid search per label column.
for label, pickle_stem in nb_models.items():
    gs(
        feature=label,
        pipe=tfidf_nb_pipe,
        pipe_params=tfidf_nb_params,
        filename=pickle_stem,
    )

In [None]:
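# # Function to plot the confusion matrix and collect the metrics into a dataframe.
# # NOTE(review): disabled draft. It appears to be carried over from another project
# # (display labels 'learnpython'/'learnmachinelearning', './model_results/' paths) and
# # relies on X_train/X_test/y_train/y_test/results, which are not defined at notebook
# # level in the cells above. Adapt it before re-enabling.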

# def results_extraction(model_name=""):

#     '''
#     Function to extract the pickle-ed model and to conduct predictions with X_test, saving the final results to a separate dataframe
#     '''
    
#     filename = f'./model_results/{model_name}_trained.pickle'
#     with open(filename, 'rb') as f:
#         model = pickle.load(f)

#     y_pred = model.predict(X_test)

#     tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
#     ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred), display_labels=['learnpython', 'learnmachinelearning']).plot(cmap='Blues')

#     # 0 is python, 1 is ml
#     plt.title(f'{model_name}: Confusion Matrix')
#     plt.savefig(f"./confusion_matrix/{model_name}_confusion_matrix.png", bbox_inches='tight', facecolor='w')
#     plt.close()

#     # Different metrics
#     accuracy = (tp + tn) / (tp + tn + fp + fn)
#     misclassification = 1 - accuracy
#     recall = tp / (tp + fn)
#     specificity = tn / (tn + fp)
#     precision = tp / (tp + fp)
#     f1 = 2 * (precision * recall) / (precision + recall)

#     acc_diff = np.abs((model.score(X_train, y_train) - model.score(X_test, y_test))) / model.score(X_train, y_train)

#     # Metrics for roc curve and auc
#     pred_prob = model.predict_proba(X_test)
#     train_prob = model.predict_proba(X_train)

#     # fpr, tpr, thresh = roc_curve(y_test, pred_prob[:,1], pos_label=1)
#     pred_auc_score = roc_auc_score(y_test, pred_prob[:,1])
#     train_auc_score = roc_auc_score(y_train, train_prob[:,1])
#     auc_diff = np.abs((train_auc_score - pred_auc_score)) / train_auc_score

#     # Append all above results to 'results' dictionary
#     results[model_name] = [
#         # model.best_params_,
#         model.score(X_train, y_train),
#         model.score(X_test, y_test),
#         misclassification,
#         recall,
#         specificity,
#         precision,
#         f1,
#         train_auc_score,
#         pred_auc_score,
#         auc_diff,
#         acc_diff
#     ]