In [None]:
import pandas as pd
import numpy as np
import sklearn.feature_extraction.text as sk_text
import string
from sklearn.model_selection import train_test_split

from sklearn import tree, metrics    # Decision Tree
from sklearn.linear_model import LogisticRegression # LogisticRegression


In [None]:
%%time
# Load only the first 10,000 rows of the training file. `nrows` makes pandas
# stop parsing there instead of reading the whole CSV and throwing the rest
# away with `.head(10000)` afterwards.
toxic_comments_df = pd.read_csv('train.csv', nrows=10000)
# Remove the article_id and parent_id columns from the working frame.
toxic_comments_df = toxic_comments_df.drop(columns=['article_id', 'parent_id'])

In [None]:
# Binary label: 1 when `target` is at least 0.5, otherwise 0.
# A vectorised comparison replaces the per-row `apply(lambda ...)`; missing
# targets compare as False and therefore still map to 0.
toxic_comments_df['truth'] = toxic_comments_df['target'].ge(0.5).astype(int)
toxic_comments_df.head()


In [None]:
# Build `InputData`, the cleaned text column.
#
# Punctuation is stripped with a translation table instead of
# `str.replace('[...]', '')`. Since pandas 2.0 `Series.str.replace` treats its
# pattern as a literal string unless `regex=True` is passed, so the original
# call removed nothing. Even in regex mode the `\]` sequence inside the
# character class only escaped `]`, which left backslashes in the text.
_punctuation_table = str.maketrans('', '', string.punctuation)
toxic_comments_df['InputData'] = (
    toxic_comments_df['comment_text']
    .fillna('')  # a missing comment would otherwise turn into the word "nan"
    .str.translate(_punctuation_table)
)
# Keep only whitespace-separated words whose first character is a letter.
# `str(...)` guards against any non-string value left in the column.
toxic_comments_df['InputData'] = toxic_comments_df['InputData'].apply(
    lambda text: " ".join(word for word in str(text).split() if word[0].isalpha())
)

toxic_comments_df.head()

In [None]:
# Pair every comment id with its cleaned text.
prepareSKText = pd.DataFrame({'User_id': toxic_comments_df.id, 'All_words': toxic_comments_df.InputData})

# Bag-of-words term counts. A float `min_df` is a document-frequency
# proportion, so .001 drops terms found in fewer than 0.1% of the comments.
# NOTE(review): 'id' and 'truth' are presumably excluded so that no vocabulary
# column clashes with the 'id'/'truth' columns merged in later - confirm.
vectorizerInput = sk_text.CountVectorizer(#max_features = 10000,
                             min_df=.001,
                             #max_df=.75,
                            stop_words=['id','truth']
                            )
matrix = vectorizerInput.fit_transform(prepareSKText.All_words.values)
# Vocabulary size. The document-term matrix has one column per feature, so
# this avoids `get_feature_names()`, which was removed in scikit-learn 1.2.
print(matrix.shape[1])


In [None]:
%%time
# Dense copy of the document-term matrix. Despite the variable name, these
# are raw term counts produced by CountVectorizer.
tdidf = matrix.toarray()
# `get_feature_names()` was removed in scikit-learn 1.2; fall back to it only
# on releases that predate `get_feature_names_out()`.
feature_names = (
    vectorizerInput.get_feature_names_out()
    if hasattr(vectorizerInput, 'get_feature_names_out')
    else vectorizerInput.get_feature_names()
)
# Reuse the dense array built above rather than densifying the sparse matrix
# a second time with `matrix.todense()`.
df_text = pd.DataFrame(tdidf, index=toxic_comments_df.id, columns=feature_names)
new = toxic_comments_df[['id','truth']].copy()
# df_text's index is named 'id' (it was built from the id column), which lets
# merge match it against the 'id' column of `new`.
dataFrameWithHashTagHandlesAndTruths = pd.merge(df_text, new, on='id')


In [None]:
%%time
# Features are every column except the merge key and the label.
X = dataFrameWithHashTagHandlesAndTruths.drop(columns=['id', 'truth'])
y = dataFrameWithHashTagHandlesAndTruths['truth'].values

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

# Shapes of the four pieces, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

## Logistic Regression ##

In [None]:
%%time
# Fit one logistic-regression model per solver and score each on the test set.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
best_scores = []        # weighted F1 of every solver, in evaluation order
best_models = []        # solver names, parallel to best_scores
solver_predictions = {} # solver name -> predictions on X_test
for current_solver in solvers:
    # random_state makes the solvers that shuffle the data (liblinear, sag,
    # saga) give the same result on every run.
    logreg = LogisticRegression(solver=current_solver, random_state=4)
    logreg.fit(X_train, y_train)
    solver_predictions[current_solver] = logreg.predict(X_test)
    best_scores.append(
        metrics.f1_score(y_test, solver_predictions[current_solver], average='weighted')
    )
    best_models.append(current_solver)


# Highest weighted F1 across all solvers. The name is kept for compatibility
# with existing cells even though the value is a score, not a solver.
best_solver = max(best_scores)
best_solver_name = best_models[best_scores.index(best_solver)]
# Report the winning solver's predictions. Previously `y_pred` still held the
# output of whichever solver happened to run last in the loop.
y_pred = solver_predictions[best_solver_name]

print('Best Scores:')
print(best_scores)
print()
print('Best Models:')
print(best_models)
print()
print('Overall Best Score:')
print(best_solver)

print()

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
# Fixed label order so the matrix is always 2x2, as in the decision-tree cell.
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))


In [None]:
# Solver names in the order they were evaluated; entry i pairs with best_scores[i].
best_models

## Decision Tree ##

In [None]:
%%time
## Decision tree baseline with default hyper-parameters
# random_state fixes the random feature permutation the tree uses at each
# split, so rerunning the cell reproduces the same model and metrics.
clf = tree.DecisionTreeClassifier(random_state=4)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Weighted precision, recall and F1 on the held-out split, then the confusion
# matrix with rows/columns ordered as [0, 1].
print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))

In [None]:
# Print the learned vocabulary. `get_feature_names()` was removed in
# scikit-learn 1.2, so use `get_feature_names_out()` when it exists and
# convert its array to a plain list to keep the printed format unchanged.
if hasattr(vectorizerInput, 'get_feature_names_out'):
    vocabulary_terms = vectorizerInput.get_feature_names_out().tolist()
else:
    vocabulary_terms = vectorizerInput.get_feature_names()
print(vocabulary_terms)