# Abusive email classifier project

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

Load the data in

In [3]:
data = pd.read_csv('/Users/barak/Downloads/train (1).csv')
X = data.comment_text
y = data.iloc[:,2:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

Vectorize the data

In [5]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
vectorised_train_data = vectorizer.fit_transform(X_train)
vectorised_test_data = vectorizer.transform(X_test)

## Strategy 1: One-Vs-Rest SVM Classifier

In [7]:
classifier = OneVsRestClassifier(SVC(kernel='linear'))
classifier.fit(vectorised_train_data, y_train)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

Make predictions

In [8]:
predictions = classifier.predict(vectorised_test_data)

In [9]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')

In [10]:
print("Accuracy = {}".format(accuracy))
print("Precision = {}".format(precision))
print("Recall = {}".format(recall))
print("F1 = {}".format(f1))

Accuracy = 0.9201412098930482
Precision = 0.8610288176676542
Recall = 0.6076791484508649
F1 = 0.7125027858257188


## Strategy 2: DNN on the Bag of Words Emails

In [25]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import RMSprop

In [18]:
vectorised_test_data.shape

(47872, 152259)

In [21]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=vectorised_train_data.shape[1]))
model.add(Dropout(.5))
model.add(Dense(64))
model.add(Dropout(.5))
model.add(Dense(y_train.shape[1], activation='sigmoid'))


In [27]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [28]:
model.fit(vectorised_train_data, y_train,
          epochs=20,
          batch_size=128)b
score = model.evaluate(vectorised_test_data, y_test, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
score

[0.06685063293711706, 0.9803329798627027]

In [30]:
nn_preds = model.predict(vectorised_test_data)

In [37]:
nn_preds_rounded = (nn_preds > .5).astype(int) 

In [39]:
accuracy = accuracy_score(y_test, nn_preds_rounded)
precision = precision_score(y_test, nn_preds_rounded, average='micro')
recall = recall_score(y_test, nn_preds_rounded, average='micro')
f1 = f1_score(y_test, nn_preds_rounded, average='micro')
print("Accuracy = {}".format(accuracy))
print("Precision = {}".format(precision))
print("Recall = {}".format(recall))
print("F1 = {}".format(f1))

Accuracy = 0.9134984959893048
Precision = 0.8019581112901227
Recall = 0.6149971488310207
F1 = 0.696143295142811


## Strategy 3: ConvNet

In [44]:
# Word-level model
max_len = max(len(X[i]) for i in range(X.shape[0]))
print ("Max length = {}".format(max_len))

Max length = 5000


In [59]:
wordslist = " ".join(X).split()
wordslist = list(set(wordslist))
print (wordslist[:100], len(wordslist))

['Accent', 'Flake..', 'shadow,', '"Rebecca,', '[I’m', '""know""', 'namespace)', 'BALONEY', 'A6nn', 'CPC-ML', 'extent.', 'nicer.)', 'Wikipedia:Cat', 'Hidari-Mitsudomoe', 'Hamlets', 'Scype.', 'subproject,', 'advocate..', 'Controversies.""', 'Franca;', 'cosmologies', "Brooke's", '05-30-2009', 'COMMUNISM""', 'thing,SPAIN', 'Ronaldihno', '210.50.232.183', '(duh', 'Muniz', '""enormous', 'routing.', 'Wulfings', 'MUFFIN', 'produces.', 'bisexual', 'TOTAL........£19,336.62', 'thirty-odd', '1098', "eachother's", 'orthodoxy."', 'elsewhen', 'swamps', 'Babylonians', 'asserts', "(hG/c^3)^1/2'd'(Vacuum", 'Bachcell', 'non-chronological', 'Unblocking/Blocking', 'place?...', 'airliner.', 'жарки', 'Revenoor', 'nationalists?', '""trolls"".', 'baited,', '""...To', "SAAMI's", '28385S)-SR-3.', '4872', '202.63.42.221,', 'socialist)?', 'dip-stick', 'shooting/murdering', 'Kanaan', '""Mushrik"",', 'userfy', 'Special:Contributions/Avraham', 'topped', 'posterity.', '""Daniel""', 'SaltyPig.', '11:00,', 'Sterling""',

In [60]:
word_indices = dict((c, i) for i, c in enumerate(wordslist))

In [65]:
word_indices['UNK'] = -1
word_indices['cat']

175778

In [66]:
maxlen = 500
X_seq = np.zeros((len(X), maxlen))
for i, msg in enumerate(X):
    for t, word in enumerate(msg):
        if t < maxlen:
            try:
                X_seq[i, t] = word_indices[word]
            except KeyError:
                X_seq[i, t] = -1
        else:
            continue

In [68]:
X_seq[5]

array([ 4.39995e+05, -1.00000e+00, -1.00000e+00,  1.42011e+05,
        3.07468e+05,  1.32184e+05,  2.32736e+05,  2.76581e+05,
        9.43900e+03,  2.90868e+05,  1.56370e+04,  5.09856e+05,
        9.43900e+03,  2.90868e+05,  1.05493e+05,  3.07468e+05,
        1.32184e+05,  6.75400e+04, -1.00000e+00,  3.83766e+05,
        2.76581e+05,  3.07468e+05,  5.07010e+04, -1.00000e+00,
        5.07010e+04,  1.90264e+05, -1.00000e+00,  9.43900e+03,
        6.75400e+04, -1.00000e+00,  2.46804e+05,  1.90264e+05,
        5.09856e+05,  5.09856e+05,  4.92200e+04, -1.00000e+00,
        1.56370e+04,  6.75400e+04,  1.90264e+05, -1.00000e+00,
        2.90868e+05,  4.39577e+05,  1.90264e+05, -1.00000e+00,
        2.90868e+05,  3.07468e+05,  3.07468e+05,  5.09856e+05,
        6.75400e+04, -1.00000e+00,  2.46804e+05,  1.90264e+05,
        5.09856e+05,  5.09856e+05,  2.55008e+05, -1.00000e+00,
       -1.00000e+00,  2.21036e+05, -1.00000e+00,  2.90868e+05,
        9.43900e+03,  5.09856e+05,  6.48980e+04, -1.000

## Strategy 4: Sequence Model