In [1]:
import pandas as pd
import numpy as np

# Below option allows us to see the entire comment_text column
pd.set_option('display.max_colwidth', None)
# Read in the dataset
train = pd.read_csv("../../data/kaggle_train.csv")
train = train.drop(columns=['id'])

labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

print("Stats of training set: ", train.shape)
print("Labels:", labels)

Stats of training set:  (159571, 7)
Labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [2]:
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"Explanation\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


# Text Preprocessing

Below I have noticed some inconsistencies in the data and by preprocessing it, we can ensure a clean dataset.

If the text is in the same case, it is easy for a machine to interpret the words, as uppercase and lowercase are treated differently by the same machine.

In [3]:
# Convert comment to lowercase
def to_lowercase(text):
    return text.lower()

train['comment_text'] = train['comment_text'].apply(to_lowercase)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In these comments, I have noticed if someone inserts an image in their comment, we will see an <img> tag, we must remove that as it does not reflect any toxicity.

In [4]:
import re
# Remove HTML tags from the comments
def remove_html(text):
    return re.sub(r"<.*>", "", text, flags=re.MULTILINE)
    
train['comment_text'] = train['comment_text'].apply(remove_html)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


Links are invaluable information for the model to learn.

In [5]:
# Remove links from the comments
def remove_links(text):
    text= re.sub(r"http\S+"," ",text, flags=re.MULTILINE)
    return re.sub(r"www\S+"," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_links)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [6]:
# Remove special characters such as: \n \r \t
def remove_special(text):
    return re.sub(r"[\n\t\\\/\r]"," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_special)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation why the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,""" more i can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know. there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


NLTK's stopwords are the most commonly occuring words in a text that do not provide any valuable information, so we will remove them.

In [7]:
# Remove stopwords using nltk's stopwords package
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

train['comment_text'] = train['comment_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
train.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation edits made username hardcore metallica fan reverted? vandalisms, closure gas voted new york dolls fac. please remove template talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! matches background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really trying edit war. guy constantly removing relevant information talking edits instead talk page. seems care formatting actual info.",0,0,0,0,0,0
3,""" can't make real suggestions improvement - wondered section statistics later on, subsection """"types accidents"""" -i think references may need tidying exact format ie date format etc. later on, no-one else first - preferences formatting style references want please let know. appears backlog articles review guess may delay reviewer turns up. listed relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, hero. chance remember page that's on?",0,0,0,0,0,0


Digits create a problem for machines to understand, it is easier to remove them than to keep them.

In [8]:
# As you can see above, there are numbers and/or dates
# I will remove those as they are not helpful

def remove_numbers(text):
    return re.sub(r'\d'," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_numbers)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation edits made username hardcore metallica fan reverted? vandalisms, closure gas voted new york dolls fac. please remove template talk page since i'm retired now. . . .",0,0,0,0,0,0
1,"d'aww! matches background colour i'm seemingly stuck with. thanks. (talk) : , january , (utc)",0,0,0,0,0,0
2,"hey man, i'm really trying edit war. guy constantly removing relevant information talking edits instead talk page. seems care formatting actual info.",0,0,0,0,0,0
3,""" can't make real suggestions improvement - wondered section statistics later on, subsection """"types accidents"""" -i think references may need tidying exact format ie date format etc. later on, no-one else first - preferences formatting style references want please let know. appears backlog articles review guess may delay reviewer turns up. listed relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, hero. chance remember page that's on?",0,0,0,0,0,0


# Train, Test, and Validation Split

Below I need to split the dataset into train and test datasets.
However, sklearn's `train_test_split` function does not work for
multi-class classification.

Therefore, I will be creating a train, test, and validation split for each label in the classification.

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train[["comment_text"]], train[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]], test_size=0.20)
X_train

Unnamed: 0,comment_text
131470,allen ave ocean side ny need tom cruise drinner eat send money tia nancycruise cruz write book hime much ticke show oprah winefey show whant whant tolet hime see book bout hime much dream bout hime much like meet hime much dream bout hime much sat see hime tom cruise sat funny fanny oprah winfey cut nall side let tom cruise see nall cut tham much call wihte wihe nice care meet tom cruise
120466,"cast vs. character info formatting - aiw commenting address one thing directly related (though somewhat applicable) debated article. disagree adding bullets form break delineation indicates difficulty reading paragraph. something often saw used printed encyclopedias couple decades back, depending flow information details. however, maintain paragraph format avoid lists, feel material need overhauled. currently reads (which may well problem), delineation felt necessary. thus, think entry question could revision production notes character notes completely separate actor character list. two cents."
123718,absolutely out-of-line do. posted also backed something discussion thread. believe normal protocol put warning going level im. absolutely out-of-line posting resent it. allow personal point view skewer article pov punish editor. please cautious future.
60991,also im fat unfunny crosseyed faggot nerd
19363,""" le grand, innnocent. continue act like victim here? twisted many things around me. talked ran talk page you, that's case right about. cases assume (collectarian judgesurreal's talk pages): mention name once. instantly assuming things helping matters. clear avoided either, list ring honor events afd (which believe explained with: """"i'm part wrestling project"""") one example. project members required visit post afds project's scope. could've restraint there, didn't. fact come harass block, also bad faith view. response mcjeff's rfc link provided, see section: wikipedia:requests_for_comment robj #findings. found mcjeff part problem. remember: takes two conflict, evidence provided rfc shows mcjeff innocent either. ignored things, would different story. part: mcjeff harassed me, egged get reaction me. also, le grand's repost ani comments productive view. people see ani post clearly, there's need redundant. """
...,...
107753,""" used last several hours, days, memory biochem correct. carboxylic metabolite hangs around, thc. pure thc means recent usage. autopsy report actually awhile now, one """"bloody knuckles"""", media jump it. maybe milking thing daily news cycle """
44696,""" example neutrality? heads [header, new topic] user borealdreams asked help here. experiencing wp:boomerang. one question; may assume """"my client"""" comment one typical edit summaries indication coi? that's sounded like me. guy macon (talk) client installing opgw purposes, nothing selection materials - presumably client knows picked opgw. mere facts sway wikieditor. besides, i'm well known pay twisty bulb cartel. wtshymanski (talk) comes up, happy attest fact never seen hint coi edits. disagreements you, need repeat here. saw comment talk page makes think coi editing going on. borealdreams wrote """"if 'flat earth' mentality removed, even need mention products directly name"""". going take coin - probably tomorrow, bit busy today. btw, re: """"why talk page subpage user space even exist?"""", pretty sure redirect talk pages main talk page. see next super secret twisty bulb cartel meeting? guy macon (talk) """
135851,"entire page violation npov. bought, paid monitored purposes promoting subject material."
156020,reply deliberately removed reply talk page. screw you.


In [13]:
print("Train shape:",X_train.shape)
print("Test shape:", X_test.shape)


for label in labels:
    count_zero = (y_train[label] == 0).sum()
    count_one = (y_train[label] == 1).sum()
    print(f"------{label}-------")
    print("Count zero:", count_zero)
    print("Count one:", count_one)

Train shape: (127656, 1)
Test shape: (31915, 1)
------toxic-------
Count zero: 115487
Count one: 12169
------severe_toxic-------
Count zero: 126418
Count one: 1238
------obscene-------
Count zero: 120947
Count one: 6709
------threat-------
Count zero: 127277
Count one: 379
------insult-------
Count zero: 121364
Count one: 6292
------identity_hate-------
Count zero: 126557
Count one: 1099


# Vectorizing the Comment Text

*Logistic Regression can't take text values as input*

Since the independent variable I have is only text, we will need to use a vectorizer to convert the text into usable data for Logistic Regression.

```

# Max_features = Build a vocabulary that only consider the top max_features ordered by term frequency

# Analyzer = Whether the feature should be made of word or character n-grams. Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space.

# ngram_range = (1,1) means only unigrams, (1,2) means unigrams and bigrams, (1,3) means unigrams, bigrams, and trigrams

# Further ngrams knowledge = bigrams means it will learn the occurence of every two words, trigrams would be every 3, etc.

# dtype = type of the matrix returned, default is float64
```

We will use a word and char n-grams as some people like to obfuscate words by using multiple characters, by using both we can hope to catch these.
The idea from this came from [here](https://www.kaggle.com/code/tunguz/logistic-regression-with-words-and-char-n-grams/comments) which has one of the best results for Logistic Regression. This user optimized the ngram_range.

We use FeatureUnion (similar to how hstack works in previous non-Pipeline example) to combine the word and char n-ngrams as described in this [post](https://stackoverflow.com/questions/65765954/word-and-char-ngram-with-different-ngram-range-on-tfidfvectorizer-pipeline) into one feature.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

cols_trans = ColumnTransformer([
    ("txt_word", TfidfVectorizer(max_features=10000, analyzer="word", ngram_range=(1,3), dtype=np.float32), 'comment_text'),
    ("txt_char", TfidfVectorizer(max_features=10000, analyzer="char", ngram_range=(3,6), dtype=np.float32), 'comment_text')
])

## Pipeline 

Create a Pipeline for the data to flow through:

TFIDF Vectorize the data

then

Perform Logistic Regression

In [12]:
from sklearn.pipeline import Pipeline
from sklearn import svm
from mlxtend.feature_selection import ColumnSelector

pipe = Pipeline([
    ('trans', cols_trans),
    ('clf', svm.SVC())
],
    memory="tmp/cache")

In [13]:
from sklearn import set_config
set_config(display='diagram')
# with display='diagram', simply use display() to see the diagram
display(pipe)
# if desired, set display back to the default
set_config(display='text')

# Hyperparameter Tuning with GridSearchCV

Below I will build and train the `Logistic Regression` model and check if the model is overfit, underfit, or optimal fit using GridSearch I will find the best hyperparameters.

We create a new model for each label in order to classify the multi-class, example we cross validate the `toxic` label, then the `severe_toxic` and so on. This method is the preferred method based on previous implementations for the Kaggle competition.

By  doing this, we can evaluate the percentage for each label and choose the highest label(s) which we should classify the text as. For example, in the data we have data which may be `toxic` and `obscene` rather than only `toxic` data and only `obscene` data.

In [14]:
# speecify parameter values to search
params = {}
params['clf__C'] = [100, 10, 1.0, 0.1, 0.01, 0.001]
# params['clf__gamma'] = [1, 0.1, 0.01, 0.001]
params['clf__kernel'] = ['linear']

In [15]:
from sklearn.model_selection import RandomizedSearchCV
import time
import warnings
# warnings.filterwarnings('ignore') 

best_results = {}

for label in labels:
    start = time.time()

    grid = RandomizedSearchCV(pipe, params, n_iter=1, cv=5, scoring='accuracy')
    grid.fit(X_train, y_train[label])

    best_results[label] = {
        "score": grid.best_score_,
        "parameters": grid.best_params_,
        "estimator": grid.best_estimator_
    }

    print(f"Time to tune [{label}]: {time.time() - start}")
    print(f"\tBest Score: {grid.best_score_}")
    print(f"\tFinal Model:: {grid.best_estimator_}")


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

Time to tune [toxic]: 18879.89657306671
	Best Score: 0.9327489456852021
	Final Model:: Pipeline(memory='tmp/cache',
         steps=[('trans',
                 ColumnTransformer(transformers=[('txt_word',
                                                  TfidfVectorizer(dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'comment_text'),
                                                 ('txt_char',
                                                  TfidfVectorizer(analyzer='char',
                                                                  dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                    

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

Time to tune [severe_toxic]: 3928.839028596878
	Best Score: 0.9900435547958694
	Final Model:: Pipeline(memory='tmp/cache',
         steps=[('trans',
                 ColumnTransformer(transformers=[('txt_word',
                                                  TfidfVectorizer(dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'comment_text'),
                                                 ('txt_char',
                                                  TfidfVectorizer(analyzer='char',
                                                                  dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                             

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

Time to tune [obscene]: 34358.574857234955
	Best Score: 0.974940467509423
	Final Model:: Pipeline(memory='tmp/cache',
         steps=[('trans',
                 ColumnTransformer(transformers=[('txt_word',
                                                  TfidfVectorizer(dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'comment_text'),
                                                 ('txt_char',
                                                  TfidfVectorizer(analyzer='char',
                                                                  dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                  

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

Time to tune [threat]: 3931.175914287567
	Best Score: 0.9968509156906338
	Final Model:: Pipeline(memory='tmp/cache',
         steps=[('trans',
                 ColumnTransformer(transformers=[('txt_word',
                                                  TfidfVectorizer(dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'comment_text'),
                                                 ('txt_char',
                                                  TfidfVectorizer(analyzer='char',
                                                                  dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                   

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

Time to tune [insult]: 10762.297089338303
	Best Score: 0.9505859498505235
	Final Model:: Pipeline(memory='tmp/cache',
         steps=[('trans',
                 ColumnTransformer(transformers=[('txt_word',
                                                  TfidfVectorizer(dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'comment_text'),
                                                 ('txt_char',
                                                  TfidfVectorizer(analyzer='char',
                                                                  dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                  

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

Time to tune [identity_hate]: 12826.2699944973
	Best Score: 0.9909209125113498
	Final Model:: Pipeline(memory='tmp/cache',
         steps=[('trans',
                 ColumnTransformer(transformers=[('txt_word',
                                                  TfidfVectorizer(dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'comment_text'),
                                                 ('txt_char',
                                                  TfidfVectorizer(analyzer='char',
                                                                  dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                             

In [16]:
final_results = pd.DataFrame.from_dict(best_results)
final_results.head(6)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
score,0.932749,0.990044,0.97494,0.996851,0.950586,0.990921
parameters,"{'clf__kernel': 'linear', 'clf__C': 0.01}","{'clf__kernel': 'linear', 'clf__C': 0.01}","{'clf__kernel': 'linear', 'clf__C': 10}","{'clf__kernel': 'linear', 'clf__C': 10}","{'clf__kernel': 'linear', 'clf__C': 0.001}","{'clf__kernel': 'linear', 'clf__C': 0.01}"
estimator,"(ColumnTransformer(transformers=[('txt_word',\n TfidfVectorizer(dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(1, 3)),\n 'comment_text'),\n ('txt_char',\n TfidfVectorizer(analyzer='char',\n dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(3, 6)),\n 'comment_text')]), SVC(C=0.01, kernel='linear'))","(ColumnTransformer(transformers=[('txt_word',\n TfidfVectorizer(dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(1, 3)),\n 'comment_text'),\n ('txt_char',\n TfidfVectorizer(analyzer='char',\n dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(3, 6)),\n 'comment_text')]), SVC(C=0.01, kernel='linear'))","(ColumnTransformer(transformers=[('txt_word',\n TfidfVectorizer(dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(1, 3)),\n 'comment_text'),\n ('txt_char',\n TfidfVectorizer(analyzer='char',\n dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(3, 6)),\n 'comment_text')]), SVC(C=10, kernel='linear'))","(ColumnTransformer(transformers=[('txt_word',\n TfidfVectorizer(dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(1, 3)),\n 'comment_text'),\n ('txt_char',\n TfidfVectorizer(analyzer='char',\n dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(3, 6)),\n 'comment_text')]), SVC(C=10, kernel='linear'))","(ColumnTransformer(transformers=[('txt_word',\n TfidfVectorizer(dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(1, 3)),\n 'comment_text'),\n ('txt_char',\n TfidfVectorizer(analyzer='char',\n dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(3, 6)),\n 'comment_text')]), SVC(C=0.001, kernel='linear'))","(ColumnTransformer(transformers=[('txt_word',\n TfidfVectorizer(dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(1, 3)),\n 'comment_text'),\n ('txt_char',\n TfidfVectorizer(analyzer='char',\n dtype=<class 'numpy.float32'>,\n max_features=10000,\n ngram_range=(3, 6)),\n 'comment_text')]), SVC(C=0.01, kernel='linear'))"


In [17]:
final_results.iloc[0].mean()

0.972681791007167

# Saving the Best Models

In [18]:
import joblib

for k, v in best_results.items():
    print(k, v['estimator'])
    joblib.dump(v['estimator'], f'F:/Thesis/models/svm/{k}.pkl')

toxic Pipeline(memory='tmp/cache',
         steps=[('trans',
                 ColumnTransformer(transformers=[('txt_word',
                                                  TfidfVectorizer(dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'comment_text'),
                                                 ('txt_char',
                                                  TfidfVectorizer(analyzer='char',
                                                                  dtype=<class 'numpy.float32'>,
                                                                  max_features=10000,
                                                                  ngram_range=(3,
                                                   

# Loading the Saved Models

In [19]:
for label in labels:
    model = joblib.load(open(f'F:/Thesis/models/svm/{label}.pkl', 'rb'))