In [10]:
import string

import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as sk_text
from sklearn import tree, metrics    # Decision Tree
from sklearn.linear_model import LogisticRegression # LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier # Neural Network
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC # Support Vector Machine


In [2]:
%%time
# Read only the first 100,000 rows of the training file, and parse only the
# three columns this notebook uses so the remaining columns are never loaded.
comment_columns = ['id', 'target', 'comment_text']
toxic_comments_df = pd.read_csv('train.csv', nrows=100000, usecols=comment_columns)
# usecols does not control column order, so re-select to pin it; .copy() makes
# the result an independent frame for the column assignments in later cells.
toxic_comments_df = toxic_comments_df[comment_columns].copy()


Wall time: 1.49 s


In [3]:
# Binary ground-truth label: 1.0 when the toxicity score is at least 0.5,
# otherwise 0.0 (stored as float32).
is_toxic = toxic_comments_df['target'] >= .5
toxic_comments_df['truth'] = is_toxic.astype('float32')
toxic_comments_df.head()


Unnamed: 0,id,target,comment_text,truth
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0
3,59855,0.0,Is this something I'll be able to install on m...,0.0
4,59856,0.893617,haha you guys are a bunch of losers.,1.0


In [4]:
# Build the cleaned text column used for vectorization.
#
# Punctuation is removed with a translate table rather than a regex character
# class built from string.punctuation: unescaped, the `\]` inside that class
# escapes the bracket so the backslash itself is never stripped, and on
# pandas >= 2.0 str.replace defaults to regex=False, so nothing is stripped.
punctuation_table = str.maketrans('', '', string.punctuation)
# Missing comments become empty strings instead of the literal token "nan".
without_punctuation = (
    toxic_comments_df['comment_text']
    .fillna('')
    .str.translate(punctuation_table)
)


def _normalise_comment(text):
    """Lower-case ``text`` and keep only whitespace-separated words that start with a letter."""
    # Words from str.split() are never empty, so word[0] is always valid.
    return " ".join(word.lower() for word in str(text).split() if word[0].isalpha())


toxic_comments_df['InputData'] = without_punctuation.apply(_normalise_comment)

toxic_comments_df.head()

Unnamed: 0,id,target,comment_text,truth,InputData
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,this is so cool its like would you want your m...
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,thank you this would make my life a lot less a...
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,this is such an urgent design problem kudos to...
3,59855,0.0,Is this something I'll be able to install on m...,0.0,is this something ill be able to install on my...
4,59856,0.893617,haha you guys are a bunch of losers.,1.0,haha you guys are a bunch of losers


In [5]:
# Bag-of-words term counts over the cleaned comments.
# min_df=.001 drops terms that appear in fewer than 0.1% of the documents.
# NOTE(review): 'id' and 'truth' are excluded from the vocabulary, presumably
# so no term column collides with the 'id'/'truth' columns merged in later;
# confirm against the merge cell.
vectorizerInput = sk_text.CountVectorizer(
    # max_features=10000,   # optional vocabulary cap, currently disabled
    min_df=.001,
    # max_df=.75,           # optional upper document-frequency cut, currently disabled
    stop_words=['id', 'truth'],
)
matrix = vectorizerInput.fit_transform(toxic_comments_df.InputData.values)
# The matrix has one column per vocabulary term, so its width is the feature
# count; this avoids get_feature_names(), which scikit-learn 1.2 removed.
print("# of features", matrix.shape[1])


# of features 3929


In [6]:
%%time
# Column labels in matrix-column order, taken from the fitted vocabulary
# (term -> column index). This works on every scikit-learn version, unlike
# get_feature_names(), which was removed in 1.2.
feature_names = sorted(vectorizerInput.vocabulary_, key=vectorizerInput.vocabulary_.get)

# Cast to float32 while still sparse so the dense array is allocated once in
# its final dtype, not as integer counts followed by a full copy.
df_text = pd.DataFrame(
    matrix.astype('float32').toarray(),
    index=toxic_comments_df.id,
    columns=feature_names,
)

# Attach the label to each comment's term counts by joining on the comment id;
# the final cast also turns the id/truth columns into float32.
dataFrameWithHashTagHandlesAndTruths = pd.merge(
    df_text, toxic_comments_df[['id', 'truth']], on='id'
).astype('float32')


Wall time: 11 s


In [7]:
%%time
# Separate the term-count features from the join key and the label.
non_feature_columns = ['id', 'truth']
X = dataFrameWithHashTagHandlesAndTruths.drop(columns=non_feature_columns)
y = dataFrameWithHashTagHandlesAndTruths['truth'].values

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4, test_size=0.2
)

# Sanity-check the shapes of the four pieces.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(80000, 3929)
(80000,)
(20000, 3929)
(20000,)
Wall time: 1.91 s


## Logistic Regression ##

In [19]:
%%time
# Compare LogisticRegression solvers by weighted F1 on the held-out split and
# report precision / recall / F1 / confusion matrix for the winner.
#
# NOTE(review): the solver is chosen on the same held-out split the final
# metrics are reported on, so those metrics are optimistically biased. A
# separate validation split (or cross-validation on the training data) would
# remove that bias.
solvers = ['newton-cg', 'liblinear', 'sag', 'saga']
best_scores = []   # weighted F1 per solver, aligned with `solvers`
best_models = []   # solver names in evaluation order
best_solver = None
top_f1 = None
for current_solver in solvers:
    # Seeded so the solvers that shuffle the data give repeatable results.
    candidate = LogisticRegression(solver=current_solver, random_state=4)
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    candidate_f1 = metrics.f1_score(y_test, candidate_pred, average='weighted')
    best_scores.append(candidate_f1)
    best_models.append(current_solver)
    # Keep the fitted winner instead of refitting it after the loop.
    # Strict '>' keeps the earliest solver on ties, like list.index(max(...)).
    if best_solver is None or candidate_f1 > top_f1:
        best_solver, top_f1 = current_solver, candidate_f1
        logreg, y_pred = candidate, candidate_pred


print('Best Scores:')
print(best_scores)
print()
print('Best Models:')
print(best_models)
print()
# This line prints the winning solver's name, so it is labelled as such.
print('Overall Best Solver:')
print(best_solver)

print(metrics.precision_score(y_test, y_pred, average='weighted'))
print(metrics.recall_score(y_test, y_pred, average='weighted'))
print(metrics.f1_score(y_test, y_pred, average='weighted'))
print(metrics.confusion_matrix(y_test, y_pred))




Best Scores:
[0.9328402646445451, 0.9328402646445451, 0.9324825891633807, 0.9324519729932954]

Best Models:
['newton-cg', 'liblinear', 'sag', 'saga']

Overall Best Score:
newton-cg
0.9310666074332825
0.94175
0.9328402646445451
[[18407   256]
 [  909   428]]
Wall time: 13min 50s


## SVM ##

In [None]:
%%time
# Sweep the SVC regularisation strength C, pick the value with the best
# weighted F1 on the held-out split, and report metrics for that model.
#
# NOTE(review): C is chosen on the same held-out split the final metrics are
# reported on, so those metrics are optimistically biased; consider a separate
# validation split.
potenial_c = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0]
best_scores = []   # weighted F1 per C value, aligned with `potenial_c`
best_C = []        # C values in evaluation order
best_c_val = None
top_f1 = None
for c in potenial_c:
    candidate = SVC(C=c, gamma='auto')
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    candidate_f1 = metrics.f1_score(y_test, candidate_pred, average='weighted')
    best_scores.append(candidate_f1)
    best_C.append(c)
    # Keep only the best fitted model so far; this avoids refitting the winner
    # after the loop. Strict '>' keeps the earliest C on ties.
    if best_c_val is None or candidate_f1 > top_f1:
        best_c_val, top_f1 = c, candidate_f1
        clf, y_pred = candidate, candidate_pred


print('Best Scores:')
print(best_scores)
print('Best C:')
print(best_C)
# This line prints the winning C value, so it is labelled as such.
print('Overall Best C:')
print(best_c_val)

print(metrics.precision_score(y_test, y_pred, average='weighted'))
print(metrics.recall_score(y_test, y_pred, average='weighted'))
print(metrics.f1_score(y_test, y_pred, average='weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

  'precision', 'predicted', average, warn_for)


## Decision Tree ##

In [21]:
%%time
# Decision tree with default hyper-parameters, evaluated on the held-out split.
# random_state is fixed so repeated runs of this cell build the same tree.
clf = tree.DecisionTreeClassifier(random_state=4)
clf.fit(X_train, y_train)   # fit() returns the estimator itself; no rebinding needed
y_pred = clf.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average='weighted'))
print(metrics.recall_score(y_test, y_pred, average='weighted'))
print(metrics.f1_score(y_test, y_pred, average='weighted'))
# labels=[0, 1] pins the row/column order of the confusion matrix.
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))

0.9185051448310593
0.92075
0.919597999088646
[[17911   752]
 [  833   504]]
Wall time: 5min 12s


## Neural Network ##

In [22]:
# Standardise the features for the neural network. The scaler is fitted on the
# training split only and then applied to both splits, so the test split does
# not influence the scaling statistics.
# (StandardScaler and MLPClassifier are imported in the notebook's import cell.)
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train)
X_test_nn = scaler.transform(X_test)

  return self.partial_fit(X, y)
  """
  


In [23]:
%%time
# Multi-layer perceptron with one hidden layer of 1000 units, trained on the
# standardised features.
# hidden_layer_sizes is written as a one-element tuple: `(1000)` is just the
# int 1000, not a tuple. random_state is fixed so reruns are repeatable.
mlp = MLPClassifier(hidden_layer_sizes=(1000,), solver='adam', random_state=4)
mlp.fit(X_train_nn, y_train)
y_pred = mlp.predict(X_test_nn)

print(metrics.precision_score(y_test, y_pred, average='weighted'))
print(metrics.recall_score(y_test, y_pred, average='weighted'))
print(metrics.f1_score(y_test, y_pred, average='weighted'))
# labels=[0, 1] pins the row/column order of the confusion matrix.
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))

0.9332687807003315
0.94285
0.9352216392593429
[[18386   277]
 [  866   471]]
Wall time: 35min 11s
