This is the main notebook.
the supervised ML part of the project is all here.

In [54]:
import random
import re
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [55]:
mails_df = pd.read_csv("../data/training_data.csv", sep='~', index_col=0)
imputer = SimpleImputer(strategy='constant', fill_value='')
mails_df = pd.DataFrame(imputer.fit_transform(mails_df), columns=mails_df.columns)
mails_df.iloc[:5]

Unnamed: 0,id,sender,receiver,subject,body,labels
0,1871339da28c50be,cognizance@iitr.ac.in,"students-notices@iitr.ac.in,staff-notices@iitr...",Cognizance 2023 is Live Now - Join the Excitem...,"Greetings everyone,We are excited to announce ...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
1,187129b4701da92f,dosw@iitr.ac.in,students-notices@iitr.ac.in,Opportunity to the Researchers to write techni...,"Dear Students, Following is the link to the ab...","IMPORTANT,CATEGORY_PERSONAL,INBOX"
2,1871298cfa0ea9df,dosw@iitr.ac.in,students-notices@iitr.ac.in,Advisory to reach your Bhawan by 1 am on March...,"Dear Students, As you are aware, there are man...","UNREAD,IMPORTANT,Label_9,CATEGORY_PERSONAL,INBOX"
3,1871203b08c898ff,gensec.cult@iitr.ac.in,students-notices@iitr.ac.in,"Annual Art Exhibition - ""दर्पण"" | Fine Arts | ...","Greetings!Fine Arts Section, IIT Roorkee is re...","IMPORTANT,Label_7,CATEGORY_PERSONAL,INBOX"
4,18711da44ef7c918,ccf@iitr.ac.in,"students-notices@iitr.ac.in,staff-notices@iitr...",CCF stall with fun games and merchandise in AB...,Dear Campus CommunityCommittee for Campus Faun...,"UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"


In [56]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [57]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]
    processed_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    return processed_text


def preprocess_sender(address):
    address_lst = address.lower().split('@')
    address_lst[1] = re.sub('[.]ac|[.]in|[.]com', '', address_lst[1])
    address_lst[1] = re.sub('[.]', ' ', address_lst[1])
    address_lst[0] = re.sub('[._]', '', address_lst[0])
    return ' '.join(address_lst)

preprocessor = ColumnTransformer(transformers=[
    ('subject', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.9, min_df=0.005), 'subject'),
    ('body', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.8, min_df=0.01), 'body'),
    ('sender', TfidfVectorizer(preprocessor=preprocess_sender), 'sender')
],
    # transformer_weights={
    #     'subject': 2,
    #     'body': 1,
    #     'sender': 4
    # },
    remainder='drop')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('svd', TruncatedSVD(n_components=1000))
])

Loading labels

In [58]:
from sklearn.preprocessing import MultiLabelBinarizer
import json

with open('../data/label_dict.json', 'r') as file:
    all_labels = json.load(file)

Encoding labels

In [59]:
label_list = [key for key in all_labels.keys() if re.match('Label_[0-9]', key)]
mlb = MultiLabelBinarizer(classes=label_list)
labels_array = [list(st.split(',')) for st in mails_df['labels']]
mlb.fit(label_list)
labels = mlb.transform(labels_array)



In [60]:
feature_matrix = pipeline.fit_transform(mails_df)

In [61]:
X = feature_matrix
y = labels

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lst = [X_train, X_test, y_train, y_test]
for i in lst:
    print(i.shape, '\n')

(838, 2106) 

(360, 2106) 

(838, 21) 

(360, 21) 



My main metric for evaluation is precision_score. It is best suited for the multi-output classification. Its details as mentioned in its documentation


> Compute the precision.
> The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.
>The best value is 1 and the worst value is 0.

as sizes of all labels are different, and they are very imbalanced, so I've used average='weighted'

> Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.



In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from collections import defaultdict

models = [
    ('DTree', DecisionTreeClassifier(random_state=42)),
    ('ETree', ExtraTreeClassifier(random_state=42)),
    ('ETrees', ExtraTreesClassifier(random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', MultiOutputClassifier(SVC(random_state=42), n_jobs=-1)),
    ('XGB', XGBClassifier()),
    # ('GaussianNB', GaussianNB()),
    # ('MultinomialNB', MultinomialNB())
]

def try_different_models(models):
    dfs = []
    for name, model in models:
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_pred_on_train = clf.predict(X_train)

        score = pd.Series(
            [
                accuracy_score(y_test, y_pred), accuracy_score(y_train, y_pred_on_train),
                precision_score(y_test, y_pred, average='weighted', zero_division=0),
                precision_score(y_train, y_pred_on_train, average='weighted', zero_division=0)
            ],
            index=['test_accuracy', 'train_accuracy', 'precision_score', 'precision_score_train'],
            name=name
        )
        dfs.append(score)

    print('shape of dataset: ', X.shape, y.shape)
    print(pd.DataFrame(dfs))


try_different_models(models)

shape of dataset:  (1198, 2106) (1198, 21)
        test_accuracy  train_accuracy  precision_score  precision_score_train
DTree        0.811111        0.998807         0.831408               1.000000
ETree        0.725000        0.998807         0.709497               1.000000
ETrees       0.830556        0.998807         0.938317               1.000000
RF           0.802778        0.998807         0.941427               1.000000
KNN          0.800000        0.834129         0.849182               0.887626
SVM          0.822222        0.921241         0.937338               0.991534
XGB          0.841667        0.997613         0.928790               1.000000


SVM seems to be the best classifier among these

tuning hyperparameters using grid search

In [64]:
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
m = 12000
C = 0.01

try_different_models(
    [
        # ('SVC', MultiOutputClassifier(SGDClassifier(loss="hinge", alpha=1/(m*C)))),
        ('SVC0', MultiOutputClassifier(SVC(), n_jobs=-1))
    ]
)

shape of dataset:  (1198, 2106) (1198, 21)
      test_accuracy  train_accuracy  precision_score  precision_score_train
SVC0       0.822222        0.921241         0.937338               0.991534
