This is the main notebook.
It contains the entire supervised machine-learning part of the project.

In [128]:
import random
import re
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the '~'-separated training mails; the first CSV column is used as the index.
raw_mails_df = pd.read_csv("../data/training_data.csv", sep='~', index_col=0)

# Replace every missing cell with an empty string. The imputer returns a plain
# array, so the frame is rebuilt with the original column names (and a fresh
# default index).
imputer = SimpleImputer(strategy='constant', fill_value='')
imputed_values = imputer.fit_transform(raw_mails_df)
mails_df = pd.DataFrame(imputed_values, columns=raw_mails_df.columns)
# mails_df.iloc[:5]

In [130]:
# English stop words, stored as a set for fast membership tests in preprocess_text.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' corpora
# to be downloaded already — confirm for fresh environments.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

In [131]:
def preprocess_text(text):
    """Normalise free text before TF-IDF vectorisation.

    The text is lower-cased, every character that is not a letter or digit is
    replaced by a space, stop words are removed and the remaining tokens are
    lemmatized. Uses the notebook-level ``stop_words`` and ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw subject or body text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', text.lower())

    lemmas = []
    for word in cleaned.split():
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmas)


def preprocess_sender(address):
    """Turn a sender address into space-separated tokens for TF-IDF.

    The address is lower-cased and split on '@'. Dots and underscores are
    removed from the local part so it becomes a single token. In the domain,
    the labels ``ac``, ``in`` and ``com`` are dropped and the remaining dots
    become spaces, so every other domain label is its own token.

    Parameters
    ----------
    address : str
        Sender address, e.g. ``'John.Doe@iitk.ac.in'``. May be empty or lack
        an '@' (missing senders are imputed as ``''`` earlier in the notebook).

    Returns
    -------
    str
        For example ``'johndoe iitk'``. An address without '@' yields only the
        cleaned local part; an empty address yields ``''``.
    """
    address_lst = address.lower().split('@')
    address_lst[0] = re.sub('[._]', '', address_lst[0])

    # An empty or '@'-less address has no domain part; indexing [1]
    # unconditionally used to raise IndexError.
    if len(address_lst) > 1:
        # Only drop 'ac' / 'in' / 'com' when they are complete domain labels.
        # The lookahead keeps '.in' inside e.g. '.infosys' intact.
        domain = re.sub(r'[.](?:ac|in|com)(?=[.]|$)', '', address_lst[1])
        address_lst[1] = re.sub('[.]', ' ', domain)

    return ' '.join(address_lst)

# One TF-IDF vectorizer per text column, each with its own text cleaner.
text_transformers = [
    ('subject', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.9, min_df=0.005), 'subject'),
    ('body', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.8, min_df=0.01), 'body'),
    ('sender', TfidfVectorizer(preprocessor=preprocess_sender), 'sender'),
]

# Columns not listed above are dropped.
# Disabled experiment: transformer_weights={'subject': 2, 'body': 1, 'sender': 4}
preprocessor = ColumnTransformer(transformers=text_transformers, remainder='drop')

# Disabled experiment: a second step ('svd', TruncatedSVD(n_components=1000))
pipeline = Pipeline([('preprocessor', preprocessor)])

Loading labels

In [132]:
from sklearn.preprocessing import MultiLabelBinarizer
import json

# Load the label dictionary. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding.
with open('../data/label_dict.json', 'r', encoding='utf-8') as file:
    all_labels = json.load(file)

Encoding labels

In [None]:
# Only keys of the form "Label_<digit>..." become target columns; their order
# in `label_list` fixes the column order of the encoded matrix.
label_list = [key for key in all_labels if re.match('Label_[0-9]', key)]
mlb = MultiLabelBinarizer(classes=label_list)

# Each row of the 'labels' column is a comma-separated string of label ids.
# Strip whitespace around every id so ' Label_3' still matches, and drop empty
# tokens (rows whose labels were imputed as '') instead of passing '' on as an
# unknown class.
labels_array = [
    [token.strip() for token in st.split(',') if token.strip()]
    for st in mails_df['labels']
]

# fit() expects an iterable of label collections, so the label list is wrapped
# in an outer list rather than passed as a flat list of strings.
mlb.fit([label_list])
labels = mlb.transform(labels_array)

In [134]:
# Fit the TF-IDF column pipeline on every mail and vectorise them in one pass.
# NOTE(review): this runs before the train/test split further down, so the
# vocabulary and IDF weights are learned from rows that later land in the test
# set — consider splitting first and fitting on the training rows only.
feature_matrix = pipeline.fit_transform(mails_df)

In [135]:
# Short aliases used by the split and model-comparison cells:
# X is the output of pipeline.fit_transform, y is the output of mlb.transform.
X = feature_matrix
y = labels

In [136]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sanity check: print the shape of each of the four splits.
lst = [X_train, X_test, y_train, y_test]
for split in lst:
    print(split.shape, '\n')

(838, 2106) 

(360, 2106) 

(838, 21) 

(360, 21) 



My main evaluation metric is `precision_score`, which is well suited to this multi-output classification task. Its documentation describes it as follows:


> Compute the precision.
> The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.
> The best value is 1 and the worst value is 0.

As the labels have different numbers of samples and are highly imbalanced, I have used `average='weighted'`:

> Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.



In [137]:

def try_different_models(models):
    """Fit every candidate model and rank them by weighted precision.

    Each estimator is fitted on the notebook-level ``X_train`` / ``y_train``
    and scored on both the training and the test split. The function relies on
    the notebook-level ``X``, ``y``, ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``, and on ``accuracy_score`` / ``precision_score`` having been
    imported before it is called.

    Parameters
    ----------
    models : iterable of (str, estimator)
        ``(name, estimator)`` pairs. Names should be unique. Estimators are
        fitted in place.

    Returns
    -------
    estimator or None
        The fitted estimator in the first row of the printed ranking (highest
        test precision, ties broken by test accuracy), or ``None`` when
        ``models`` is empty.
    """
    # Materialise once so one-shot iterables survive both the loop below and
    # the final name lookup.
    models = list(models)
    if not models:
        return None

    scores = []
    for name, model in models:
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_pred_on_train = clf.predict(X_train)

        score = pd.Series(
            [
                accuracy_score(y_test, y_pred), accuracy_score(y_train, y_pred_on_train),
                precision_score(y_test, y_pred, average='weighted', zero_division=0),
                precision_score(y_train, y_pred_on_train, average='weighted', zero_division=0)
            ],
            index=['test_accuracy', 'train_accuracy', 'precision_score', 'precision_score_train'],
            name=name
        )
        scores.append(score)

    print('shape of dataset: ', X.shape, y.shape)
    # Sort into a new frame (no inplace mutation) and take the winner from the
    # same ordering that is printed, so the table and the return value agree.
    score_df = pd.DataFrame(scores).sort_values(
        by=['precision_score', 'test_accuracy'], ascending=False
    )
    print(score_df)
    best_model_name = score_df.index[0]
    return dict(models).get(best_model_name)


In [139]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Candidate classifiers compared by try_different_models. Estimators that
# accept it use class_weight='balanced'.
models = [
    ('DTree', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
    ('ETree', ExtraTreeClassifier(random_state=42, class_weight='balanced')),
    ('ETrees', ExtraTreesClassifier(random_state=42, class_weight='balanced')),
    ('RF', RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')),
    ('KNN', KNeighborsClassifier()),
    # SVC is wrapped in MultiOutputClassifier, which fits one SVC per label column.
    ('SVM', MultiOutputClassifier(SVC(random_state=42, class_weight='balanced'), n_jobs=-1)),
    # XGBoost does not use a `class_weight` keyword: it was ignored with the
    # warning 'Parameters: { "class_weight" } are not used.', so it is omitted.
    ('XGB', XGBClassifier()),
    # ('GaussianNB', GaussianNB()),
    # ('MultinomialNB', MultinomialNB())
]

try_different_models(models)

Parameters: { "class_weight" } are not used.

shape of dataset:  (1198, 2106) (1198, 21)
        test_accuracy  train_accuracy  precision_score  precision_score_train
RF           0.755556        0.998807         0.961111               1.000000
ETrees       0.777778        0.997613         0.936383               0.997311
XGB          0.841667        0.997613         0.928790               1.000000
SVM          0.838889        0.966587         0.915008               0.964353
KNN          0.800000        0.834129         0.849182               0.887626
DTree        0.744444        0.997613         0.737069               0.997311
ETree        0.633333        0.997613         0.585801               0.997311


SVM seems to be the best classifier among these, as other models are overfitting the training data.

Tuning hyperparameters using grid search

In [140]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import make_scorer, precision_score
from time import time

# Earlier, wider search space kept for reference only; it is not used below.
# NOTE(review): these keys assume a Pipeline with steps named 'preprocessor'
# and 'model'; no 'model' step is visible in this notebook — confirm before
# re-enabling. The second and third entries originally used the prefix
# 'model_estimator__' (single underscore), inconsistent with the first entry;
# they are written as 'model__estimator__' here.
# param_grid = [
#     {
#         'preprocessor__subject__max_df': [0.8, 0.7],
#         'preprocessor__subject__min_df': [0.005, 0.001],
#         'preprocessor__body__max_df': [0.8, 0.7],
#         'preprocessor__body__min_df': [0.05, 0.01],
#         'model__estimator__C': [0.01, 1, 100],
#         'model__estimator__kernel': ['poly'],
#     },
#     {
#         'model__estimator__C': [0.01, 0.1, 1, 10, 100, 1000],
#         'model__estimator__kernel': ['rbf'],
#         'model__estimator__gamma': [0.1, 0.01, 0.001, 0.0001, 'scale'],
#         'model__estimator__class_weight': ['balanced', None]
#     },
#     {
#         'model__estimator__C': [0.01, 1, 100, 1000],
#         'model__estimator__kernel': ['linear', 'poly', 'sigmoid'],
#         'model__estimator__class_weight': ['balanced', None]
#     }
# ]

# Grid actually searched. The 'estimator__' prefix targets the SVC wrapped by
# the MultiOutputClassifier that the grid-search cell builds.
param_grid = {
    'estimator__C': [0.01, 1, 0.001],
    'estimator__kernel': ['poly', 'linear'],
}

# Every combination of the grid above (3 C values x 2 kernels).
# NOTE: the name is misspelled ('gird'); the grid-search cell further down
# refers to this exact spelling, so it is kept as is.
param_gird_expanded = ParameterGrid(param_grid)

In [141]:
# Baseline before tuning: one class-balanced SVC per label with default
# hyperparameters, scored with weighted precision on the held-out split.
model = MultiOutputClassifier(estimator=SVC(class_weight='balanced'))
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_score(
    y_true=y_test,
    y_pred=y_pred,
    average='weighted',
    zero_division=0,
)

0.9150076631394022

In [142]:
# Build one freshly configured SVM wrapper per parameter combination. Each is
# named '1', '2', ... in the iteration order of the expanded parameter grid.
models2 = []
for run_id, parameters in enumerate(param_gird_expanded, start=1):
    base_model = MultiOutputClassifier(SVC(class_weight='balanced'), n_jobs=-1)
    base_model.set_params(**parameters)
    models2.append((str(run_id), base_model))

best_model = try_different_models(models2)

shape of dataset:  (1198, 2106) (1198, 21)
   test_accuracy  train_accuracy  precision_score  precision_score_train
3       0.805556        0.982100         0.911034               0.978831
4       0.852778        0.977327         0.896022               0.973799
2       0.683333        0.687351         0.815450               0.824325
1       0.000000        0.000000         0.201181               0.213948
6       0.000000        0.000000         0.040113               0.060096
5       0.000000        0.000000         0.021222               0.027826
