In [4]:
import pandas as pd
import numpy as np
import sklearn.feature_extraction.text as sk_text
import string
from sklearn.model_selection import train_test_split

from sklearn import tree, metrics    # Decision Tree
from sklearn.linear_model import LogisticRegression # LogisticRegression
from sklearn.svm import SVC # Support Vector Machine 


In [5]:
%%time
toxic_comments_df = pd.read_csv('train.csv')
toxic_comments_df =  toxic_comments_df.drop(['article_id','parent_id'],axis=1)
toxic_comments_df = toxic_comments_df.head(10000)

CPU times: user 14.7 s, sys: 3.03 s, total: 17.8 s
Wall time: 16.9 s


In [6]:
toxic_comments_df['truth'] = toxic_comments_df.target.apply(lambda x: 1 if x>=.5 else 0 )
toxic_comments_df.head()


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,truth
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,rejected,0,0,0,0,0,0.0,0,4,0
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,rejected,0,0,0,0,0,0.0,0,4,0
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,rejected,0,0,0,0,0,0.0,0,4,0
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,rejected,0,0,0,0,0,0.0,0,4,0
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,rejected,0,0,0,1,0,0.0,4,47,1


In [7]:
toxic_comments_df['InputData'] = toxic_comments_df['comment_text'].str.replace('[{}]'.format(string.punctuation), '')
toxic_comments_df['InputData'] = toxic_comments_df['InputData'].apply(lambda x: " ".join([ word if word[0].isalpha() else ""  for word in str(x).split()]))

toxic_comments_df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,truth,InputData
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,This is so cool Its like would you want your m...
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,Thank you This would make my life a lot less a...
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,This is such an urgent design problem kudos to...
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,Is this something Ill be able to install on my...
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,0,0,0,1,0,0.0,4,47,1,haha you guys are a bunch of losers


In [8]:
prepareSKText = pd.DataFrame({'User_id': toxic_comments_df.id, 'All_words': toxic_comments_df.InputData})

vectorizerInput = sk_text.CountVectorizer(#max_features = 10000,
                             min_df=.001, 
                             #max_df=.75,
                            stop_words=['id','truth']
                            )
matrix = vectorizerInput.fit_transform(prepareSKText.All_words.values)
print(len(vectorizerInput.get_feature_names()))


4604


In [9]:
%%time
tdidf = matrix.toarray()
df_text = pd.DataFrame(matrix.todense(), index=toxic_comments_df.id, columns=vectorizerInput.get_feature_names())
new = toxic_comments_df[['id','truth']].copy()
dataFrameWithHashTagHandlesAndTruths = pd.merge(df_text, new, on='id')


CPU times: user 1.12 s, sys: 1.13 s, total: 2.25 s
Wall time: 4.01 s


In [10]:
%%time
X = dataFrameWithHashTagHandlesAndTruths.drop(['id','truth'], axis=1)
y = dataFrameWithHashTagHandlesAndTruths.truth.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8000, 4604)
(8000,)
(2000, 4604)
(2000,)
CPU times: user 1.19 s, sys: 952 ms, total: 2.14 s
Wall time: 1.72 s


## Logistic Regression ##

In [11]:
%%time
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
best_scores = []
best_models = []
for current_solver in solvers:
        logreg = LogisticRegression(solver=current_solver)
        logreg.fit(X_train, y_train)
        y_pred = logreg.predict(X_test)
        best_scores.append(metrics.f1_score(y_test, y_pred, average='weighted'))
        best_models.append(current_solver)
        

best_solver = max(best_scores)

print('Best Scores:')    
print(best_scores)
print()
print('Best Models:')
print(best_models)
print()
print('Overall Best Score:')
print(best_solver)

print()

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))




Best Scores:
[0.9309148825065274, 0.9309148825065274, 0.9309148825065274, 0.9295205924909381, 0.9241084777291673]

Best Models:
['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

Overall Best Score:
0.9309148825065274

0.9252323717948718
0.9425
0.9241084777291673
[[1871   10]
 [ 105   14]]
CPU times: user 1min 59s, sys: 4.79 s, total: 2min 4s
Wall time: 1min 15s




In [None]:
best_models

In [12]:
clf = SVC(C=1.0, gamma='auto') 
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

NameError: name 'SVC' is not defined

## Decision Tree ##

In [None]:
%%time
## Decision tree boilerplate
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))

In [None]:
print(vectorizerInput.get_feature_names())