In [65]:
import random
import re
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

mails_df = pd.read_csv("../data/training_data.csv", sep='~', index_col=0)
imputer = SimpleImputer(strategy='constant', fill_value='')
mails_df = pd.DataFrame(imputer.fit_transform(mails_df), columns=mails_df.columns)
# mails_df.iloc[:5]

In [66]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [78]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]
    processed_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    return processed_text


def preprocess_sender(address):
    address_lst = address.lower().split('@')
    address_lst[1] = re.sub('[.]ac|[.]in|[.]com', '', address_lst[1])
    address_lst[1] = re.sub('[.]', ' ', address_lst[1])
    address_lst[0] = re.sub('[._]', '', address_lst[0])
    return ' '.join(address_lst)

preprocessor = ColumnTransformer(transformers=[
    ('subject', TfidfVectorizer(preprocessor=preprocess_text, min_df=0.01), 'subject'),
    ('body', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.8, min_df=0.01), 'body'),
    ('sender', TfidfVectorizer(preprocessor=preprocess_sender), 'sender')
], remainder='drop')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('svd', TruncatedSVD(n_components=500))
])

In [79]:
from sklearn.preprocessing import MultiLabelBinarizer
import json

with open('../data/label_dict.json', 'r') as file:
    all_labels = json.load(file)

In [80]:
label_list = [key for key in all_labels.keys() if re.match('Label_[0-9]', key)]
mlb = MultiLabelBinarizer(classes=label_list)
labels_array = [list(st.split(',')) for st in mails_df['labels']]
mlb.fit(label_list)
labels = mlb.transform(labels_array)



In [81]:
feature_matrix = pipeline.fit_transform(mails_df)

In [82]:
feature_matrix

<1198x1861 sparse matrix of type '<class 'numpy.float64'>'
	with 75856 stored elements in Compressed Sparse Row format>

In [83]:
X = feature_matrix
y = labels
print(X.shape, y.shape)

(1198, 1861) (1198, 21)


In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lst = [X_train, X_test, y_train, y_test]
for i in lst:
    print(i.shape, '\n')


(838, 1861) 

(360, 1861) 

(838, 21) 

(360, 21) 



In [102]:
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import RadiusNeighborsClassifier

def try_different_models(X_train, y_train, X_test, y_test):

    models = [
        ('DTree', DecisionTreeClassifier()),
        ('ETree', ExtraTreeClassifier()),
        ('ETrees', ExtraTreesClassifier()),
        ('RF', RandomForestClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('SVM', MultiOutputClassifier(SVC())),
        ('XGB', XGBClassifier())
    ]

    for name, model in models:
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_validation = clf.predict(X_train)
        print(name)
        print(accuracy_score(y_test, y_pred), '\t', accuracy_score(y_train, y_validation))


try_different_models(X_train, y_train, X_test, y_test)

DTree
0.7666666666666667 	 0.9988066825775657
ETree
0.7055555555555556 	 0.9988066825775657
ETrees
0.8277777777777777 	 0.9988066825775657
RF
0.8138888888888889 	 0.9988066825775657
KNN
0.8194444444444444 	 0.8365155131264916
SVM
0.8388888888888889 	 0.9164677804295943
XGB
0.825 	 0.9988066825775657


KNN and SVM are reliable classifiers

In [103]:
models = [
    ('KNN', KNeighborsClassifier()),
    ('SVM', MultiOutputClassifier(SVC()))
]

for name, model in models:
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_validation = clf.predict(X_train)
    print(name)
    print(classification_report(y_test, y_pred))

KNN
              precision    recall  f1-score   support

           0       0.69      0.79      0.73        14
           1       0.50      0.33      0.40        12
           2       0.00      0.00      0.00         3
           3       0.96      0.81      0.88        57
           4       0.97      0.88      0.92        32
           5       1.00      0.88      0.93         8
           6       0.92      1.00      0.96        11
           7       0.75      1.00      0.86         3
           8       0.67      0.29      0.40         7
           9       0.74      0.91      0.82        22
          10       1.00      0.33      0.50        12
          11       1.00      1.00      1.00         5
          12       0.88      1.00      0.93         7
          13       0.83      0.36      0.50        14
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00         4
          16       1.00      1.00      1.00        13
          17       0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM
              precision    recall  f1-score   support

           0       1.00      0.71      0.83        14
           1       0.83      0.42      0.56        12
           2       0.00      0.00      0.00         3
           3       1.00      0.81      0.89        57
           4       1.00      0.88      0.93        32
           5       1.00      0.88      0.93         8
           6       1.00      1.00      1.00        11
           7       1.00      1.00      1.00         3
           8       1.00      0.43      0.60         7
           9       0.68      0.86      0.76        22
          10       1.00      0.33      0.50        12
          11       1.00      1.00      1.00         5
          12       0.88      1.00      0.93         7
          13       1.00      0.36      0.53        14
          14       0.00      0.00      0.00         1
          15       1.00      1.00      1.00         4
          16       1.00      1.00      1.00        13
          17       0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
