<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Testing-to-see-how-well-metadata-alone-predicts-toxicity" data-toc-modified-id="Testing-to-see-how-well-metadata-alone-predicts-toxicity-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Testing to see how well metadata alone predicts toxicity</a></span><ul class="toc-item"><li><span><a href="#Testing-the-performance-of-XGBoost" data-toc-modified-id="Testing-the-performance-of-XGBoost-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Testing the performance of XGBoost</a></span></li><li><span><a href="#testing-using-tfidf-vectorizer" data-toc-modified-id="testing-using-tfidf-vectorizer-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>testing using tfidf vectorizer</a></span></li></ul></li></ul></div>

In [210]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
import re
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

In [161]:
# Load 'cleaned.csv' into the notebook-level frame `df`.
# The path is relative, so the file must sit in the kernel's working directory.
df = pd.read_csv('cleaned.csv')

In [162]:
def downsample(df, target='is_toxic', random_state=None):
    """Return a new frame with surplus zero-labelled rows randomly removed.

    Rows whose ``target`` value equals 0 are dropped at random until there are
    as many of them as there are rows with any other value. If the zero class
    is already the smaller (or equal) one, nothing is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``target`` column. Rows are removed by index
        label, so the index is expected to be unique.
    target : str, default 'is_toxic'
        Name of the label column.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample`` so the selection can be reproduced.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the randomly selected surplus rows; ``df`` itself is
        not modified.
    """
    negatives = df[df[target] == 0]
    # Count rows rather than summing labels, so the result does not depend on
    # the labels being strictly 0/1.
    n_other = len(df) - len(negatives)
    # Clamp at zero: a negative sample size makes DataFrame.sample raise.
    n_surplus = max(len(negatives) - n_other, 0)
    if n_surplus == 0:
        return df.copy()
    return df.drop(negatives.sample(n_surplus, random_state=random_state).index)

In [163]:
# NOTE(review): `X_train` is not assigned at notebook level in any cell shown
# before this one (it only appears as a local inside functions defined later),
# so this cell appears to rely on leftover kernel state and would presumably
# raise NameError after Restart & Run All. Confirm, then remove the cell or
# point it at the intended frame.
X_train.columns

Index(['comment_text_char_space', 'question', 'exclamation', 'words',
       'avg_word_len', 'caps_percentage'],
      dtype='object')

# Testing to see how well metadata alone predicts toxicity

In [211]:
# Build a function that performs a train/test split and fits simple classifiers on a chosen target variable
def tester(x_values, y_value, data=None, random_state=None):
    """Fit three baseline classifiers on the given predictors and print their results.

    The data is split into train and test sets, the training split is balanced
    by randomly dropping surplus zero-labelled rows, features are standardised
    (scaler fitted on the balanced training split only), and a logistic
    regression, a random forest and an XGBoost classifier are fitted. The
    accuracy of each model, the share of zero labels in the test set, and the
    confusion matrix / classification report of the better of the logistic
    regression and the random forest are printed.

    Parameters
    ----------
    x_values : list of str
        Predictor column names.
    y_value : str
        Target column name.
    data : pandas.DataFrame or None, default None
        Frame to use. When None, 'cleaned.csv' is read from the working
        directory, as the original implementation did on every call. Training
        rows are dropped by index label, so the index is expected to be unique.
    random_state : int or None, default None
        Seeds the train/test split, the downsampling and the random forest so
        a run can be reproduced. None keeps the original unseeded behaviour.

    Returns
    -------
    None
        All results are printed.
    """
    if data is None:
        data = pd.read_csv('cleaned.csv')

    print('Predictors: {}'.format(x_values))
    print("Target: '{}'\n".format(y_value))

    X_train, X_test, y_train, y_test = train_test_split(
        data[x_values], data[y_value], random_state=random_state)

    # Balance the training split only; the test split keeps its natural class mix.
    train = pd.concat([X_train, y_train], axis=1)
    negatives = train[train[y_value] == 0]
    # Count rows instead of summing labels (the sum is only a row count for
    # strictly 0/1 targets) and clamp at zero so sample() never receives a
    # negative size when zeros are already the minority.
    n_surplus = max(len(negatives) - (len(train) - len(negatives)), 0)
    if n_surplus > 0:
        train = train.drop(negatives.sample(n_surplus, random_state=random_state).index)
    X_train = train.drop(y_value, axis=1)
    y_train = train[y_value]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # NOTE(review): later xgboost releases replaced `silent` with `verbosity` —
    # confirm against the installed version before changing it.
    models = [
        ('Logistic Regression', LogisticRegression()),
        ('Random Forest Classifier', RandomForestClassifier(random_state=random_state)),
        ('XGBoost', xgb.XGBClassifier(max_depth=5, silent=False)),
    ]

    fitted = []
    scores = []
    for label, model in models:
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        print('{} score is {}'.format(label, score))
        fitted.append(model)
        scores.append(score)

    # The printed label says "majority class"; the value is the share of 0 labels.
    print('Majority class proportion is {}\n'.format((y_test == 0).mean()))

    # Take the higher scoring of the first two models (logistic regression,
    # random forest) and report on it. max() with a key compares scores only;
    # the previous sorted(zip(scores, models)) compared the estimator objects
    # themselves whenever the scores tied, which raises TypeError.
    best_idx = max(range(2), key=lambda i: scores[i])
    best_pred = fitted[best_idx].predict(X_test)

    print(confusion_matrix(y_test, best_pred), '\n')

    print(classification_report(y_test, best_pred))


In [212]:
# Run the metadata-only baseline once per target column, separated by a rule.
metadata_features = ['question', 'exclamation', 'words', 'avg_word_len', 'caps_percentage']
target_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
    'total_toxic',
    'is_toxic',
    'is_toxic_no_profanity',
]
separator = '-' * 75

for toxicity in target_columns:
    tester(metadata_features, toxicity)
    print(separator)

Predictors: ['question', 'exclamation', 'words', 'avg_word_len', 'caps_percentage']
Target: 'toxic'

Logistic Regression score is 0.769624188332038
Random Forest Classifier score is 0.6795447138165317
XGBoost score is 0.7121869280718028
Majority class proportion is 0.9043548023165442

[[29242  6830]
 [ 2359  1456]] 

             precision    recall  f1-score   support

          0       0.93      0.81      0.86     36072
          1       0.18      0.38      0.24      3815

avg / total       0.85      0.77      0.80     39887

---------------------------------------------------------------------------
Predictors: ['question', 'exclamation', 'words', 'avg_word_len', 'caps_percentage']
Target: 'severe_toxic'

Logistic Regression score is 0.8535111690525735
Random Forest Classifier score is 0.762955348860531
XGBoost score is 0.7821846716975456
Majority class proportion is 0.9902725198686289

[[33871  5628]
 [  215   173]] 

             precision    recall  f1-score   support

          

## testing using tfidf vectorizer

In [199]:
def nlpscorer(nlp, model=None, random_state=None):
    """Score a text vectorizer on 'is_toxic_no_profanity', alone and with metadata.

    Reads 'cleaned.csv', splits it into train and test sets, balances the
    training split by randomly dropping surplus zero-labelled rows, fits
    ``nlp`` on the training text and then prints a classification report,
    confusion matrix and accuracy twice: once for the vectorised text alone
    and once for the text features stacked with the metadata columns.

    Parameters
    ----------
    nlp : vectorizer
        Object exposing ``fit_transform`` / ``transform`` on a series of
        strings (e.g. a ``TfidfVectorizer``). It is fitted in place.
    model : classifier or None, default None
        Estimator exposing ``fit`` / ``predict`` / ``score``. It is fitted
        twice (text only, then combined). When None, an
        ``xgb.XGBClassifier(max_depth=5)`` is created; the previous code
        referenced an ``xgboost`` variable that is never defined at notebook
        level, so it only ran with leftover kernel state.
    random_state : int or None, default None
        Seeds the train/test split and the downsampling.

    Returns
    -------
    None
        All results are printed.
    """
    # Import the submodule explicitly: `import scipy as sp` on its own does not
    # guarantee that `sp.sparse` is available on every SciPy version.
    from scipy import sparse

    text_col = 'comment_text_char_space'
    target = 'is_toxic_no_profanity'
    meta_cols = ['question', 'exclamation', 'words', 'avg_word_len', 'caps_percentage']
    matrix_index = ['true_0', 'true_1']
    matrix_columns = ['predicted_0', 'predicted_1']

    if model is None:
        model = xgb.XGBClassifier(max_depth=5)

    data = pd.read_csv('cleaned.csv')

    X_train, X_test, y_train, y_test = train_test_split(
        data[[text_col] + meta_cols], data[target], random_state=random_state)

    # Balance the training split only; clamp so sample() never gets a negative size.
    train = pd.concat([X_train, y_train], axis=1)
    negatives = train[train[target] == 0]
    n_surplus = max(len(negatives) - (len(train) - len(negatives)), 0)
    if n_surplus > 0:
        train = train.drop(negatives.sample(n_surplus, random_state=random_state).index)
    X_train = train.drop(target, axis=1)
    y_train = train[target]

    # Vectorise each split once and reuse the matrices below.
    text_train = nlp.fit_transform(X_train[text_col])
    text_test = nlp.transform(X_test[text_col])

    model.fit(text_train, y_train)
    text_pred = model.predict(text_test)

    print('NLP ONLY')
    print(classification_report(y_test, text_pred))
    print(pd.DataFrame(confusion_matrix(y_test, text_pred), index=matrix_index, columns=matrix_columns), '\n')
    print('NLP only score: ', model.score(text_test, y_test), '\n')

    # Append the metadata columns to the text features; request CSR so the
    # stacked matrix supports row indexing.
    X_train_combined = sparse.hstack(
        [text_train, sparse.csr_matrix(X_train[meta_cols].values)], format='csr')
    X_test_combined = sparse.hstack(
        [text_test, sparse.csr_matrix(X_test[meta_cols].values)], format='csr')

    model.fit(X_train_combined, y_train)
    combined_pred = model.predict(X_test_combined)

    print('COMBINED')
    print(classification_report(y_test, combined_pred))
    print(pd.DataFrame(confusion_matrix(y_test, combined_pred), index=matrix_index, columns=matrix_columns), '\n')
    print('Combined score: ', model.score(X_test_combined, y_test))

In [208]:
# TF-IDF vectorizer: English stop words removed, 1- to 3-word n-grams,
# max_df threshold of 0.7.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.7,
)

In [209]:
# Score the TF-IDF features: first the vectorised text alone, then the text
# stacked with the metadata columns (both reports are printed by nlpscorer).
nlpscorer(tfidf)

NLP ONLY

             precision    recall  f1-score   support

          0       0.97      0.95      0.96     35876
          1       0.60      0.70      0.64      4011

avg / total       0.93      0.92      0.92     39887

        predicted_0  predicted_1
true_0        33987         1889
true_1         1214         2797 

NLP only score:  0.9222052297741119 

COMBINED

             precision    recall  f1-score   support

          0       0.97      0.91      0.94     35876
          1       0.50      0.76      0.60      4011

avg / total       0.92      0.90      0.91     39887

        predicted_0  predicted_1
true_0        32778         3098
true_1          951         3060 

Combined score:  0.8984882292476245
