In [None]:
import string

import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as sk_text
from sklearn import tree, metrics    # Decision Tree
from sklearn.linear_model import LogisticRegression # LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier # Neural Network
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC # Support Vector Machine 


In [None]:
%%time
#reading in only 500k records
toxic_comments_df = pd.read_csv('train.csv',nrows=500000)
toxic_comments_df =  toxic_comments_df[['id','target','comment_text']]


In [None]:
# Ground-truth label: 1.0 when the toxicity score in `target` is at least 0.5, else 0.0.
is_toxic = toxic_comments_df['target'] >= .5
toxic_comments_df['truth'] = is_toxic.astype('float32')
toxic_comments_df.head()


In [None]:
# Translation table that deletes every ASCII punctuation character.
# A plain character table avoids the regex pitfalls of building '[...]' from
# string.punctuation (unescaped '\' / ']' / '-' inside the class) and does not depend
# on the pandas version's default for `regex=` in Series.str.replace.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_comment(text):
    """Return `text` with punctuation removed, lower-cased, keeping only words that start with a letter."""
    words = str(text).translate(_PUNCTUATION_TABLE).split()
    # split() never yields empty strings, so word[0] is always safe here.
    return " ".join(word.lower() for word in words if word[0].isalpha())


# Missing comments become empty strings so they do not turn into the literal token "nan".
toxic_comments_df['InputData'] = toxic_comments_df['comment_text'].fillna('').apply(_clean_comment)

toxic_comments_df.head()

In [None]:
# Bag-of-words term counts over the cleaned comments.
# min_df=.002 is a float, i.e. a term must occur in at least 0.2% of the documents to be kept.
# 'id' and 'truth' are excluded as terms, presumably so that no term column can clash with
# the 'id' / 'truth' columns when the counts are later joined with the labels.
# Options previously tried and left disabled: max_features=10000, max_df=.75
vectorizerInput = sk_text.CountVectorizer(
    min_df=.002,
    stop_words=['id', 'truth'],
)
matrix = vectorizerInput.fit_transform(toxic_comments_df.InputData.values)
# vocabulary_ maps each kept term to its column index, so its size is the feature count.
# (get_feature_names() no longer exists in recent scikit-learn releases.)
print("# of features", len(vectorizerInput.vocabulary_))


In [None]:
%%time
df_text = pd.DataFrame(matrix.todense(), index=toxic_comments_df.id, columns=vectorizerInput.get_feature_names()).astype('float32')

dataFrameWithHashTagHandlesAndTruths = pd.merge(df_text,  toxic_comments_df[['id','truth']].copy(), on='id').astype('float32')


In [None]:
%%time
X = dataFrameWithHashTagHandlesAndTruths.drop(['id','truth'], axis=1)
y = dataFrameWithHashTagHandlesAndTruths.truth.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

## Logistic Regression ##

In [None]:
%%time
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
best_scores = []
best_models = []
for current_solver in solvers:
        logreg = LogisticRegression(solver=current_solver)
        logreg.fit(X_train, y_train)
        y_pred = logreg.predict(X_test)
        best_scores.append(metrics.f1_score(y_test, y_pred, average='weighted'))
        best_models.append(current_solver)
        

best_solver = max(best_scores)

print('Best Scores:')    
print(best_scores)
print()
print('Best Models:')
print(best_models)
print()
print('Overall Best Score:')
print(best_solver)

logreg = LogisticRegression(solver=best_solver)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))


## SVM ##

In [None]:
%%time
clf = SVC(C=1.0, gamma='auto') 
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

## Decision Tree ##

In [None]:
%%time
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))

## Neural Network ##

In [None]:
# StandardScaler and MLPClassifier are imported in the notebook's top import cell.
# Standardise the features for the neural network. The scaler's statistics are learned
# from the training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train)
X_test_nn = scaler.transform(X_test)

In [None]:
%%time
mlp = MLPClassifier(hidden_layer_sizes=(1000,1000), solver = 'adam', max_iter=1000)
mlp.fit(X_train_nn,y_train)
y_pred = mlp.predict(X_test_nn)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))