# Developing end-to-end classifiers

In [11]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
import warnings
warnings.filterwarnings('ignore')

# Single shared lemmatizer instance; the text-cleaning cell calls
# `lemmatizer.lemmatize` on every retained token.
lemmatizer = WordNetLemmatizer()

In [2]:
# Load the comments dataset, then show its shape and first rows as a sanity check.
data = pd.read_csv('data/train_comment_small.csv')
print(data.shape)
data.head()

(159571, 2)


Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [3]:
# Class balance of the `toxic` label (the target passed to train_test_split later).
data['toxic'].value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

## Cleaning text

In [7]:
# Keep the stop words in a set: the cleaning step tests membership once per
# token, and a set lookup is O(1) instead of a linear scan over a list.
stop_words = set(stopwords.words('english'))

In [8]:
def _clean_comment(raw_text):
    """Normalise one comment into a space-separated string of lemmas.

    Runs of punctuation/underscores are replaced by a space, the result is
    tokenised, tokens whose lower-cased form is a stop word are dropped, and
    every remaining token is lower-cased and lemmatised.
    """
    without_punct = re.sub(r'([^\s\w]|_)+', ' ', str(raw_text))
    kept_lemmas = []
    for token in word_tokenize(without_punct):
        lowered = token.lower()
        if lowered not in stop_words:
            kept_lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept_lemmas)


data['cleaned_text'] = data['comment_text'].apply(_clean_comment)

In [9]:
# Preview the frame again to inspect the newly added `cleaned_text` column.
data.head()

Unnamed: 0,comment_text,toxic,cleaned_text
0,Explanation\nWhy the edits made under my usern...,0,explanation edits made username hardcore metal...
1,D'aww! He matches this background colour I'm s...,0,aww match background colour seemingly stuck th...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man really trying edit war guy constantly ...
3,"""\nMore\nI can't make any real suggestions on ...",0,make real suggestion improvement wondered sect...
4,"You, sir, are my hero. Any chance you remember...",0,sir hero chance remember page


## TF-IDF vectorization

In [10]:
# Vectorise the cleaned comments, capping the vocabulary at 200 terms.
# NOTE(review): the vectoriser is fitted on the whole dataset before the
# train/validation split, so its IDF weights also see the validation rows —
# confirm this is acceptable, or fit on the training portion only.
tfidf_model = TfidfVectorizer(max_features=200)
# `.toarray()` yields a plain ndarray; `.todense()` returns the discouraged
# `np.matrix` type.
tfidf = tfidf_model.fit_transform(data['cleaned_text']).toarray()
# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its
# replacement and fall back to the old name on releases that predate it.
if hasattr(tfidf_model, 'get_feature_names_out'):
    feature_names = tfidf_model.get_feature_names_out()
else:
    feature_names = tfidf_model.get_feature_names()
tfidf_df = pd.DataFrame(tfidf, columns=feature_names)
print(tfidf_df.shape)
tfidf_df.head()

(159571, 200)


Unnamed: 0,account,actually,add,added,agree,already,also,another,anyone,anything,...,wiki,wikipedia,without,word,work,world,would,wp,wrong,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.149468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Dividing into train and validation

In [12]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    data['toxic'],
    test_size=0.1,
    random_state=1,
)
print(X_train.shape, X_test.shape)

(143613, 200) (15958, 200)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,roc_auc_score


def show_performance(model,X_train,y_train,X_test,y_test,roc=False):
    """Fit `model` on the training split and print its scores.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`, plus `predict_proba`
        when `roc` is true. It is fitted in place.
    X_train, y_train : training features and labels (used to fit and score).
    X_test, y_test : held-out features and labels (used to score only).
    roc : bool, default False
        When true, also print the ROC AUC on both splits.

    Returns
    -------
    None. Results are printed.
    """
    model.fit(X_train,y_train)
    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
    # value is unchanged, but this order matches the roc_auc_score calls below.
    print('Training accuracy - ',accuracy_score(y_train,model.predict(X_train)))
    print('Validation accuracy - ',accuracy_score(y_test,model.predict(X_test)))

    if roc:
        print('')
        # Column 1 of predict_proba is used as the score for ROC AUC
        # (presumably the probability of label 1 — verify against model.classes_).
        print('Training ROC - ',roc_auc_score(y_train,model.predict_proba(X_train)[:,1]))
        print('Test ROC - ',roc_auc_score(y_test,model.predict_proba(X_test)[:,1]))
        

def run_classification_models(X_train,y_train,X_test,y_test,roc=True,random_state=1):
    """Fit and report three baseline classifiers on the same split.

    Logistic regression, a random forest and an XGBoost classifier are each
    passed to `show_performance`, framed by separator lines and a heading.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    roc : bool, default True
        Forwarded to `show_performance`; when true ROC AUC is printed too.
    random_state : int or None, default 1
        Seed handed to every model so repeated runs print the same scores.
        Pass None to restore unseeded behaviour.
    """
    # Name/estimator pairs keep each label next to the model it describes.
    candidates = [
        ('Logistic Regression', LogisticRegression(random_state=random_state)),
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
        ('XGBclassifier', XGBClassifier(random_state=random_state)),
    ]

    for model_name, model in candidates:
        print('-------------------------------------------')
        print('For ',model_name,' -')
        print('')
        show_performance(model,X_train,y_train,X_test,y_test,roc)
        print('-------------------------------------------')

In [16]:
# Fit and score all three classifiers on the TF-IDF split (roc defaults to True,
# so ROC AUC is printed alongside accuracy).
run_classification_models(X_train,y_train,X_test,y_test)

-------------------------------------------
For  Logistic Regression  -

Training accuracy -  0.9284674785708815
Validation accuracy -  0.9284371475122196

Training ROC -  0.8666140681082339
Test ROC -  0.8722822502871892
-------------------------------------------
-------------------------------------------
For  Random Forest  -

Training accuracy -  0.9706711787930062
Validation accuracy -  0.927183857626269

Training ROC -  0.9840911721738117
Test ROC -  0.8465498512643417
-------------------------------------------
-------------------------------------------
For  XGBclassifier  -

Training accuracy -  0.9331676101745664
Validation accuracy -  0.9290637924551949

Training ROC -  0.9104520296658722
Test ROC -  0.8713766513380892
-------------------------------------------
