In [1]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
import re
import string

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
set(stopwords.words('english'))

In [3]:
# Competition CSVs, listed in the order they are unpacked below.
files = [
    '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv',
    '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv',
    '../input/jigsaw-unintended-bias-in-toxicity-classification/all_data.csv',
]


def load_data(file):
    """Read the CSV at `file` and return it as a pandas DataFrame."""
    return pd.read_csv(file)


# Each file is read in a worker process; pool.map returns the frames in the
# same order as `files`, which is what the positional unpacking relies on.
with multiprocessing.Pool() as pool:
    test, train, all_data = pool.map(load_data, files)

In [4]:
train.info()

In [5]:
train.head(5)

In [6]:
# Ten most frequent target scores (missing values are excluded by default).
train['target'].value_counts().head(10)

In [7]:
train.shape

In [8]:
# Number of rows with a missing target score.
train['target'].isna().sum()

In [9]:
train['target'].head()

In [10]:
# Keep only the comment text and its toxicity score. The explicit .copy()
# makes X an independent frame, so the later in-place cleaning of
# `comment_text` is not an assignment into something derived from `train`
# (the pattern pandas reports as SettingWithCopyWarning).
X = train[['comment_text', 'target']].copy()
train.columns.values

In [12]:
del train

In [13]:
X.head(5)

In [14]:
# Count toxic (target > 0.7) and neutral comments with a single vectorised
# comparison instead of a Python loop doing a label lookup per row.
# A missing target compares as False, so it is counted as neutral, exactly as
# the `else` branch of a row-by-row check would count it.
no_of_rows = X.shape[0]
tox = int((X['target'] > 0.7).sum())
neut = no_of_rows - tox

In [15]:
# Share of each class as a percentage of all rows, rounded to 3 decimals.
toxic_share = round((tox * 100) / no_of_rows, 3)
neutral_share = round((neut * 100 / no_of_rows), 3)
print(f'{toxic_share}% data contains toxic comments')
print(f'{neutral_share}% data contains neutral comments')

# **Preprocessing comment_text for training**

In [16]:
# The patterns are compiled once and written as raw strings, so '\w' and '\d'
# reach the regex engine untouched instead of first being parsed as (invalid)
# Python string escapes.
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with a space."""
    return _DIGIT_TOKEN_RE.sub(' ', x)


def punc_lower(x):
    """Lowercase `x` and replace each `string.punctuation` character with a space."""
    return _PUNCTUATION_RE.sub(' ', x.lower())


def remove_n(x):
    """Replace every newline character in `x` with a space."""
    return x.replace('\n', ' ')


def remove_non_ascii(x):
    """Replace every character outside the range \\x00-\\x7f with a space."""
    return _NON_ASCII_RE.sub(' ', x)


def clean_comment(x):
    """Apply the four cleaning steps to one comment string.

    The order is: digit-bearing tokens, punctuation + lowercase, newlines,
    non-ASCII characters.
    """
    return remove_non_ascii(remove_n(punc_lower(alphanumeric(x))))


# Clean the comments column in one pass instead of four chained .map() calls.
X['comment_text'] = X['comment_text'].map(clean_comment)

# Handling Class Imbalance

In [19]:
# First 45,451 rows (in frame order) whose target score is above 0.7.
toxic_train = X.loc[X['target'] > 0.7].head(45451)
toxic_train.shape

In [34]:
# First 150,000 rows (in frame order) whose target score is at most 0.7.
neutral_train = X.loc[X['target'] <= 0.7].head(150000)
neutral_train.shape

In [21]:
neutral_train.head(5)

In [22]:
# Stack the toxic subset on top of the neutral subset (row-wise concat).
balanced_train = pd.concat([toxic_train, neutral_train])
balanced_train.shape

In [23]:
balanced_train['target']

In [24]:
del toxic_train, neutral_train

In [25]:
# Import packages for pre-processing
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel

# Import tools to split data and evaluate model performance
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve, fbeta_score, confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve

# Import ML algos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Model F1 Score Comparison 

In [26]:
def cv_tf_train_test(df_done, label, vectorizer, ngram):
    """Vectorise the comments, fit four classifiers and compare their F1 scores.

    Parameters
    ----------
    df_done : pandas.DataFrame
        Frame with a ``comment_text`` column and the column named by ``label``.
    label : str
        Name of the target column. F1 is computed with ``f1_score``'s default
        (binary) averaging, so the column is expected to hold 0/1 labels.
    vectorizer : type
        Vectorizer class to instantiate: ``CountVectorizer`` or
        ``TfidfVectorizer``.
    ngram : tuple
        Passed unchanged as the vectorizer's ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        One row per model ('Log Regression', 'KNN', 'SVM', 'Random Forest')
        with its hold-out F1 score in an ``'F1 Score'`` column.
    """
    texts = df_done.comment_text
    y = df_done[label]

    # 70/30 hold-out split; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.3, random_state=42
    )

    # Fit the vocabulary on the training texts only, then reuse that same
    # vocabulary to encode the test texts (transform does not learn anything).
    cv1 = vectorizer(ngram_range=ngram, stop_words='english')
    X_train_cv1 = cv1.fit_transform(X_train)
    X_test_cv1 = cv1.transform(X_test)

    models = {
        'Log Regression': LogisticRegression(),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'SVM': LinearSVC(),
        'Random Forest': RandomForestClassifier(n_estimators=30, random_state=42),
    }

    f1_scores = {}
    for model_name, model in models.items():
        model.fit(X_train_cv1, y_train)
        y_pred = model.predict(X_test_cv1)

        # scikit-learn metrics take (y_true, y_pred) in that order; swapping
        # them silently exchanges precision and recall in the report.
        print(model_name)
        print(classification_report(y_test, y_pred))
        f1_scores[model_name] = f1_score(y_test, y_pred)

    df_f1 = pd.DataFrame(
        {'F1 Score': list(f1_scores.values())},
        index=list(f1_scores),
    )
    return df_f1


# Assigning Binary Value to Labels

In [27]:
# Binarise the score: 1.0 where target is above 0.7, otherwise 0.0.
balanced_train['target'] = (balanced_train['target'] > 0.7).astype(float)
balanced_train.head()

In [30]:
import time

# Time the full vectorise + fit + evaluate run, including the column rename.
t0 = time.time()
# balanced_train = balanced_train.iloc[0:10000]
df_tox_cv = cv_tf_train_test(
    balanced_train, 'target', TfidfVectorizer, (1, 1)
).rename(columns={'F1 Score': 'F1 Score(target)'})
t1 = time.time()

total = 'Time taken: {} seconds'.format(t1 - t0)
print(total)

df_tox_cv