# Toxic Classification Notebook

## Import Libraries

In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from datetime import datetime
import pickle

## Load Data

In [11]:
train = pd.read_csv('data/original_data/toxic_comments.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Explore Data

In [12]:
x=train.iloc[:,2:].sum()
#marking comments without any tags as "clean"
rowsums=train.iloc[:,2:].sum(axis=1)
train['clean']=(rowsums==0)
#count number of clean entries
#train['clean'].sum()
print("Total comments = ",len(train))
print("Total clean comments = ",train['clean'].sum())
print("Total tags =",x.sum())

Total comments =  159571
Total clean comments =  143346
Total tags = 35098


In [13]:
print("Check for missing values in Train dataset")
null_check=train.isnull().sum()
print(null_check)
# print("Check for missing values in Test dataset")
# null_check=test.isnull().sum()
# print(null_check)
print("filling NA with \"unknown\"")
train["comment_text"].fillna("unknown", inplace=True)

Check for missing values in Train dataset
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
clean            0
dtype: int64
filling NA with "unknown"


## Preprocess

In [14]:
train['dirty'] = train['clean'].replace({False:1, True:0})
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,dirty
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,True,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,True,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,True,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,True,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,True,0


In [15]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text

In [16]:
train['comment_text'] = train['comment_text'].apply(clean_text)



In [17]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,dirty
0,0000997932d777bf,explanationwhy edits made username hardcore me...,0,0,0,0,0,0,True,0
1,000103f0d9cfb60f,daww matches background colour im seemingly st...,0,0,0,0,0,0,True,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0,0,0,0,0,0,True,0
3,0001b41b1c6bb37e,morei cant make real suggestions improvement w...,0,0,0,0,0,0,True,0
4,0001d958c54c6e35,sir hero chance remember page thats,0,0,0,0,0,0,True,0


In [18]:
X = train['comment_text']
y = train['dirty']

### Count Vectorizer

In [19]:
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

In [20]:
# Save Vectorizer File
today = str(datetime.date(datetime.now()))
pickle.dump(vectorizer, open('models/vectorizers/vectorizer' + today, 'wb'))

### Resampling

In [21]:
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_sample(X_vec, y)

### Train Test Split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state = 42)

## Model

### Logistic Regression

In [32]:
logreg = Pipeline([
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=10000))
               ])

#logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=10000)

toxic_model = logreg.fit(X_train, y_train)

%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

CPU times: user 10 µs, sys: 2 µs, total: 12 µs
Wall time: 7.15 µs
accuracy 0.8684129429892141
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4890
           1       0.87      0.87      0.87      4845

    accuracy                           0.87      9735
   macro avg       0.87      0.87      0.87      9735
weighted avg       0.87      0.87      0.87      9735



## Predictions

In [33]:
def predict_toxic(text):
    x = [text]
    x_vec = vectorizer.transform(x)
    pred_sentiment = toxic_model.predict(x_vec)[0]
    prob = np.max(np.round(toxic_model.predict_proba(x_vec), 8))
    
    return pred_sentiment, prob

In [34]:
sample_q = 'fuck you bitch'
predict_toxic(sample_q)

(1, 1.0)

## Save Model

In [35]:
today = str(datetime.date(datetime.now()))

<class 'str'>


In [None]:
print

In [38]:
pickle.dump(toxic_model, open('toxic_classification_model' + today, 'wb'))