In [1]:
import pandas as pd
import numpy as np

# Show the comment_text column in full instead of truncating long cells.
pd.set_option('display.max_colwidth', None)

# Load the training data and discard the 'id' column in one chained step.
train = pd.read_csv("../../data/kaggle_train.csv").drop(columns=['id'])

# Names of the six target columns.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

print("Stats of training set: ", train.shape)
print("Labels:", labels)

Stats of training set:  (159571, 7)
Labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [2]:
# Preview the first rows to check that the data loaded as expected.
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"Explanation\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


# Text Preprocessing

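While inspecting the raw comments I noticed several inconsistencies (mixed casing, HTML tags, links, punctuation, control characters, stop words and numbers). The cells below clean these up one step at a time so that the model is trained on a consistent dataset.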

In [3]:
# Convert comment to lowercase
def to_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Lowercase every comment element-wise, then preview the result.
train['comment_text'] = train['comment_text'].map(to_lowercase)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [4]:
import re
# Remove HTML tags from the comments
def remove_html(text):
    """Strip HTML-style tags from ``text`` while keeping the text between them.

    Each tag is matched on its own: a ``<``, an optional ``/``, a letter, and
    then everything up to the next ``>``. The previous greedy ``<.*>`` pattern
    ran from the first ``<`` to the last ``>`` on a line and therefore also
    deleted the content enclosed by the tags.

    Requiring a letter after ``<`` leaves comparisons such as ``3 < 5`` alone.
    No regex flags are needed because the pattern has no ``^``/``$`` anchors.

    Parameters
    ----------
    text : str
        The comment to clean.

    Returns
    -------
    str
        ``text`` with every tag removed.
    """
    return re.sub(r"</?[a-zA-Z][^<>]*>", "", text)
    
# Strip HTML tags from every comment element-wise, then preview the result.
train['comment_text'] = train['comment_text'].map(remove_html)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [5]:
# Remove links from the comments
def remove_links(text):
    """Replace URLs in ``text`` with a single space.

    Two forms are recognised: ``http://`` / ``https://`` URLs and bare
    ``www.`` addresses. Both patterns are anchored at a word boundary and
    require the ``://`` or ``.`` that makes them a link, so ordinary words
    that merely contain the letters (e.g. "awwww", "httpd") are left intact.
    Matching is case-insensitive so the function also works on text that has
    not been lowercased yet.

    Parameters
    ----------
    text : str
        The comment to clean.

    Returns
    -------
    str
        ``text`` with each link replaced by one space.
    """
    text = re.sub(r"\bhttps?://\S+", " ", text, flags=re.IGNORECASE)
    return re.sub(r"\bwww\.\S+", " ", text, flags=re.IGNORECASE)

# Blank out links in every comment element-wise, then preview the result.
train['comment_text'] = train['comment_text'].map(remove_links)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [6]:
import string
# Remove punctuation marks 
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # A translation table with only a deletion set drops all of the
    # punctuation characters in a single pass over the string.
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

# Drop punctuation from every comment element-wise, then preview the result.
train['comment_text'] = train['comment_text'].map(remove_punctuation)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now892053827,0,0,0,0,0,0
1,daww he matches this background colour im seemingly stuck with thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0,0,0,0,0,0
3,\r\nmore\r\ni cant make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up its listed in the relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,you sir are my hero any chance you remember what page thats on,0,0,0,0,0,0


In [7]:
# Remove special characters such as: \n \r \t
def remove_special(text):
    """Replace newlines, tabs, carriage returns, backslashes and forward
    slashes in ``text`` with a single space each."""
    special_chars = r"[\n\t\\\/\r]"
    cleaned = re.sub(special_chars, " ", text, flags=re.MULTILINE)
    return cleaned

# Replace control characters and slashes in every comment, then preview the result.
train['comment_text'] = train['comment_text'].map(remove_special)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now892053827,0,0,0,0,0,0
1,daww he matches this background colour im seemingly stuck with thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0,0,0,0,0,0
3,more i cant make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up its listed in the relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,you sir are my hero any chance you remember what page thats on,0,0,0,0,0,0


In [8]:
# Remove stopwords using nltk's stopwords package
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# NLTK returns the English stop words as a list; it is kept under its
# original name so that anything else relying on `stop_words` still works.
stop_words = stopwords.words('english')
# Membership tests against a set are O(1), whereas the list was scanned
# linearly for every word of every comment.
stop_word_set = set(stop_words)


def remove_stopwords(text):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, tokens found in ``stop_word_set`` are
    dropped, and the remaining tokens are re-joined with single spaces.

    NOTE(review): punctuation has already been stripped at this point, so
    contractions such as "dont" or "werent" survive this step (presumably
    because the stop-word list stores them with apostrophes) -- confirm
    whether that is intended.
    """
    return ' '.join(word for word in text.split() if word not in stop_word_set)


train['comment_text'] = train['comment_text'].apply(remove_stopwords)
train.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired now892053827,0,0,0,0,0,0
1,daww matches background colour im seemingly stuck thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info,0,0,0,0,0,0
3,cant make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later noone else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


In [9]:
# As you can see above, there are numbers and/or dates
# I will remove those as they are not helpful

def remove_numbers(text):
    """Replace every digit in ``text`` with a single space."""
    digit_pattern = r'\d'
    without_digits = re.sub(digit_pattern, " ", text, flags=re.MULTILINE)
    return without_digits

# Blank out digits in every comment element-wise, then preview the result.
train['comment_text'] = train['comment_text'].map(remove_numbers)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired now,0,0,0,0,0,0
1,daww matches background colour im seemingly stuck thanks talk january utc,0,0,0,0,0,0
2,hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info,0,0,0,0,0,0
3,cant make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later noone else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


# Train, Test, and Validation Split

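Below I need to split the dataset into train and test datasets.
However, this is a multi-label problem (a single comment can carry several of the six labels at once), so sklearn's `train_test_split` cannot simply be stratified on one target column
as it would be for ordinary single-label classification.

Therefore, I make one random train/test split that keeps all six label columns together, and later fit and cross-validate a separate model for each label.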

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the comments for testing. X keeps only the cleaned text,
# y keeps all label columns so each label can be modelled separately later.
# A fixed random_state makes the split (and everything trained on it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train[["comment_text"]],
    train[labels],
    test_size=0.20,
    random_state=42,
)
X_train

Unnamed: 0,comment_text
37225,redirect talkjohn carrington cox
46764,welcome hello welcome wikipedia thank contributions hope like place decide stay pages might find helpful five pillars wikipedia edit page help pages tutorial write great article manual style hope enjoy editing wikipedian please sign name talk pages using four tildes automatically produce name date need help check wikipediaquestions ask talk page ask question place helpme question talk page welcome george crockett didnt realize continuing edit noticed malformed image link extremely short sections cheers ≠ wiser
100078,tldr ill take word
43112,become reality nfl grants team los angeles area
48440,northern cyprus country google search northern cyprus country results definition country wp country may independent sovereign state one occupied another state nonsovereign formerly sovereign political division geographic region associated sets previously independent differently associated peoples distinct political characteristics hence definition country exceeds un membership sources un cite northern cyprus different country well world happiness report united nations sustainable development solutions network sdsn ranked northern cyprus th among countries directly republic cyprus ranked th un sdsn world happiness report p country rankings also international statisticsmentioningsreferences northern cyprus regarded different country gallup happiness index countries c nc nc better gallup healthways country wellbeing rankings c nc nc better ifes election guide country profile northern cyprus amazon northern cyprus country coat arms talinn tech university country specific requirements northern cyprus northern cyprus travel new country within country state rankings ideas northern cyprus northern cyprus walking ramblers worldwide holidays north cyprus country great warmth k country kennels northern cyprus citta slow countries turkish republic northern cyprus first visit unrecognized country northern cyprus official government web pages country turkish republic northern cyprus northern cyprus country set repeal sodomy law birminghammail northern cyprus – country gesis country cyprus northern cyprus northern cyprus country set banners northern cyprus transition country country road nw northern cyprus books comparative egovernment christopher g reddic country background information northern cyprus
...,...
121747,hello thanks responding quickly could please forward mail permissionsen wikimediaorg license properly confirmed hope wording explicit way everybody allowed use wikipedia otherwise wed unfortunately forced decline see wpnfc reasons managed get real free license image good job beautiful picture btw standard welcome blurb case welcome hello jaanmatti welcome wikipedia thank contributions hope like place decide stay pages might find helpful five pillars wikipedia tutorial edit page develop articles create first article using article wizard wish manual style hope enjoy editing wikipedian please sign messages discussion pages using four tildes automatically insert username date need help check wikipediaquestions ask talk page ask question page place help question welcome ☼
5129,way would know added links planet kater middle ton would bully removec evreyone links judged every single one links basis helped felt suicidal devastating therte mercy bang destroy even read repaet pages people make intial choice accept page accepted pages wrong say spam looked every page added link removed destructive vindictive cruel
89975,sorry dont see need percentages infobox articles either ive taken talk page
117686,f fing bluray version fantasia coming f leave alone youre fing british guy doesnt get anything goes america


In [11]:
# Report the (rows, columns) of each feature split.
train_dims, test_dims = X_train.shape, X_test.shape
print("Train shape:", train_dims)
print("Test shape:", test_dims)

Train shape: (127656, 1)
Test shape: (31915, 1)


# Vectorizing the Comment Text

*A Support Vector Machine can't take text values as input*

Since the only independent variable I have is text, we need a vectorizer to convert the text into numeric features that the SVM classifier can use.

```

# Max_features = Build a vocabulary that only considers the top max_features terms, ordered by term frequency

# Analyzer = Whether the feature should be made of word or character n-grams. Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space.

# ngram_range = (1,1) means only unigrams, (1,2) means unigrams and bigrams, (1,3) means unigrams, bigrams, and trigrams

# Further n-gram knowledge = bigrams means it will learn the occurrence of every pair of consecutive words, trigrams every three consecutive words, etc.

# dtype = type of the matrix returned, default is float64
```

We will use both word and character n-grams: some people obfuscate words by inserting or repeating characters, and by combining the two kinds of features we can hope to catch these.
The idea from this came from [here](https://www.kaggle.com/code/tunguz/logistic-regression-with-words-and-char-n-grams/comments) which has one of the best results for Logistic Regression. This user optimized the ngram_range.

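We use a `ColumnTransformer` that applies both vectorizers to the `comment_text` column and concatenates their outputs (similar to `FeatureUnion`, or to how hstack works in the previous non-Pipeline example), following the approach described in this [post](https://stackoverflow.com/questions/65765954/word-and-char-ngram-with-different-ngram-range-on-tfidfvectorizer-pipeline), to combine the word and char n-grams into one feature matrix.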

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

# Both vectorizers read the same text column. The column is named with a plain
# string (not a list) so that each TfidfVectorizer receives a 1-D sequence of
# documents rather than a 2-D frame.
cols_trans = ColumnTransformer(
    transformers=[
        # Word-level TF-IDF over unigrams, bigrams and trigrams.
        (
            "txt_word",
            TfidfVectorizer(
                max_features=10000,
                analyzer="word",
                ngram_range=(1, 3),
                dtype=np.float32,
            ),
            'comment_text',
        ),
        # Character-level TF-IDF over 3- to 6-character n-grams.
        (
            "txt_char",
            TfidfVectorizer(
                max_features=10000,
                analyzer="char",
                ngram_range=(3, 6),
                dtype=np.float32,
            ),
            'comment_text',
        ),
    ]
)

## Pipeline 

Create a Pipeline for the data to flow through:

TFIDF Vectorize the data

then

Fit a Support Vector Machine classifier (`svm.SVC`)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn import svm
from mlxtend.feature_selection import ColumnSelector

# Two-stage pipeline: vectorize the text, then classify it.
# The step names ('trans', 'clf') are the prefixes used to address
# hyperparameters, e.g. 'clf__C'.
pipe = Pipeline(
    steps=[
        ('trans', cols_trans),
        ('clf', svm.SVC()),
    ]
)

In [14]:
from sklearn import set_config
# Switch scikit-learn's estimator display to the diagram form for this cell.
set_config(display='diagram')
# With display='diagram', display() renders the pipeline as a diagram.
display(pipe)
# Switch the display back to plain text so later cells print estimators as text.
set_config(display='text')

# Hyperparameter Tuning with GridSearchCV

Below I will build and train the Support Vector Machine (`svm.SVC`) model. Using GridSearchCV I will find the best hyperparameters, and the cross-validated scores help check whether the model is overfit, underfit, or an optimal fit.

We create a new model for each label in order to handle the multi-label classification: for example, we cross-validate the `toxic` label, then the `severe_toxic` label, and so on. This method is the preferred method based on previous implementations for the Kaggle competition.

By  doing this, we can evaluate the percentage for each label and choose the highest label(s) which we should classify the text as. For example, in the data we have data which may be `toxic` and `obscene` rather than only `toxic` data and only `obscene` data.

In [15]:
# Specify the hyperparameter values to search.
#
# The grid is split per kernel because `gamma` is only used by the 'rbf'
# kernel here; with a single flat grid every linear-kernel candidate would be
# refitted once per gamma value even though the resulting model is identical.
# GridSearchCV accepts a list of grids and searches their union.
params = [
    {
        'clf__kernel': ['linear'],
        'clf__C': [100, 10, 1.0, 0.1, 0.01, 0.001],
    },
    {
        'clf__kernel': ['rbf'],
        'clf__C': [100, 10, 1.0, 0.1, 0.01, 0.001],
        'clf__gamma': [1, 0.1, 0.01, 0.001],
    },
]

In [16]:
from sklearn.model_selection import GridSearchCV
import time
import warnings
# warnings.filterwarnings('ignore') 

# Per-label summary of the search: best CV score, best parameters and the
# refitted best pipeline.
best_results = {}

for label in labels:
    # perf_counter is a monotonic clock intended for measuring elapsed time;
    # time.time() can jump if the system clock is adjusted during a long run.
    start = time.perf_counter()

    # One independent binary search per label. n_jobs=-1 evaluates the
    # candidate/fold combinations in parallel on all available cores; the
    # selected model is the same as with a serial search.
    # NOTE(review): if the labels are imbalanced, accuracy can look high for
    # a model that rarely predicts the positive class -- consider 'roc_auc'.
    grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train[label])

    best_results[label] = {
        "score": grid.best_score_,
        "parameters": grid.best_params_,
        "estimator": grid.best_estimator_
    }

    print(f"Time to tune [{label}]: {time.perf_counter() - start}")
    print(f"\tBest Score: {grid.best_score_}")
    print(f"\tFinal Model:: {grid.best_estimator_}")


In [None]:
# One column per label; rows are the keys stored for each label
# ("score", "parameters", "estimator").
final_results = pd.DataFrame(best_results)
final_results.head(6)

In [None]:
# Average the best cross-validation score over all labels. The row is
# selected by its "score" label rather than by position, so the result does
# not depend on the order in which the per-label keys were inserted.
final_results.loc["score"].mean()

# Saving the Best Models

In [None]:
import joblib

import os

# Directory that receives one pickle per label.
model_dir = 'F:/Thesis/models/svm'
# joblib.dump does not create missing directories, so make sure the target
# exists before the (expensive to reproduce) models are written.
os.makedirs(model_dir, exist_ok=True)

for k, v in best_results.items():
    print(k, v['estimator'])
    joblib.dump(v['estimator'], f'{model_dir}/{k}.pkl')

# Loading the Saved Models

In [None]:
# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load model files that come from a trusted source.
# Keep every loaded pipeline, keyed by label.
loaded_models = {}

for label in labels:
    # Passing the path lets joblib open and close the file itself; the
    # previous bare open(...) left the file handle unclosed.
    model = joblib.load(f'F:/Thesis/models/svm/{label}.pkl')
    # `model` is still assigned for backward compatibility, but it only ever
    # holds the most recently loaded label, so also store each one by name.
    loaded_models[label] = model