In [85]:
import re

import joblib
import pandas as pd
from sklearn.impute import SimpleImputer
import dill

# Read the '~'-separated training export, using the first CSV column as the index.
mails_df = pd.read_csv("../data/training_data.csv", index_col=0, sep='~')

# Replace every missing cell with an empty string so the text steps below
# never receive NaN. Rebuilding the frame from the imputed array keeps the
# column names but replaces the index with a fresh 0..n-1 range.
imputer = SimpleImputer(fill_value='', strategy='constant')
imputed_values = imputer.fit_transform(mails_df)
mails_df = pd.DataFrame(imputed_values, columns=mails_df.columns)
mails_df.head(10)

Unnamed: 0,id,sender,receiver,subject,body,labels
0,186eb43f4ea9acd5,irdc@iitr.ac.in,faculty-notices@iitr.ac.in,Video of Prof. Gaurav Raheja in conversation w...,"Dear all, Forwarded below is the google drive ...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
1,186ea63122cc80be,gensec-sports@iitr.ac.in,students-notices@iitr.ac.in,Fwd: Institute Open Championship,"Dear Students,聽Since there were many requests ...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
2,186ea54adf777368,office_cdc@iitr.ac.in,students-notices@iitr.ac.in,Study and Scholarship opportunities in USA,"Dear all, As you already know CDC is organizin...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
3,186ea3792fd6d577,aao-ug@iitr.ac.in,aakash_ks@mfs.iitr.ac.in,Re: Request for semester 1 UG academic gradesheet,Dear Student Please come in Academic office fo...,"IMPORTANT,CATEGORY_PERSONAL,INBOX"
4,186ea315fc860fd9,spicmacay@iitr.ac.in,"students-notices@iitr.ac.in,fa.culsoc@iitr.ac.in",SPIC MACAY Intro Talk & Quiz,Namaste!! Do you know that the recent event of...,"UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
5,186ea2ce3178f943,rajendrabhawan@iitr.ac.in,rajendrabhawan.students2023@iitr.ac.in,Washing room facilities,"Dear students, There are a lot of wash clothes...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
6,186e9ffeb0537c35,awards@iitr.ac.in,students-notices@iitr.ac.in,Notice for Final call of Convocation Awards 20...,"Dear students, We are pleased to inform you th...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
7,186e9f90f0f2a1a1,hec@iitr.ac.in,students-notices@iitr.ac.in,Tehri lake trip.,"┬Ā Dear Students & Scholars,┬Ā ┬Ā HEC IITR is ...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
8,186e9d89c99d2ef5,gensec.technical@iitr.ac.in,ug-1st-year@iitr.ac.in,Recruitment assignments at MDG Space,Attention development and design enthusiasts 📣...,"UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
9,186e9a0b1cfdde79,gensec.technical@iitr.ac.in,"ug-1st-year@iitr.ac.in,ug-2nd-year@iitr.ac.in",Recruitment Talk and Ask-Me-Anything session b...,Hello Freshers & Sophomores !Have you ever won...,"UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"


In [86]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import re

# Shared helpers for preprocess_text: one lemmatizer instance and the English
# stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [87]:
def preprocess_text(text):
    """Normalise free text before TF-IDF tokenisation.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, stop words are dropped, and each remaining token is
    lemmatized.

    Args:
        text: Raw string (a subject or a body).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text.lower())
    lemmas = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)


def preprocess_sender(address):
    """Turn a sender address into space-separated tokens.

    The address is lower-cased, the generic '.ac.in' / '.com' suffixes are
    removed, and '@' and '.' become spaces, e.g.
    'hello@cs.iitr.ac.in' -> 'hello cs iitr'.

    Args:
        address: Sender address string.

    Returns:
        The tokenised address as a single string.
    """
    address = address.lower()
    # The negative lookahead makes the suffix match only when it ends a
    # dot-separated part. Without it, '.com' was also removed from the middle
    # of names such as 'john.company' (which became 'johnpany').
    address = re.sub(r'(?:[.]ac[.]in|[.]com)(?![a-z0-9])', '', address)
    address = re.sub('@|[.]', ' ', address)
    return address

# One TF-IDF vocabulary per mail field, each fitted on the full training frame.

# Subjects: keep terms that occur in at least 1% of the mails.
subject_tfidf = TfidfVectorizer(min_df=0.01, preprocessor=preprocess_text)
subject_vectors = subject_tfidf.fit_transform(mails_df['subject'])

# Bodies: keep terms that occur in at least 5% and at most 90% of the mails.
body_tfidf = TfidfVectorizer(min_df=0.05, max_df=0.9, preprocessor=preprocess_text)
body_vectors = body_tfidf.fit_transform(mails_df['body'])

# Senders: tokenised address parts, no document-frequency pruning.
sender_tfidf = TfidfVectorizer(preprocessor=preprocess_sender)
sender_vectors = sender_tfidf.fit_transform(mails_df['sender'])

In [88]:
from scipy.sparse import hstack

# Concatenate the per-field TF-IDF matrices side by side. The column order is
# subject, then body, then sender; anything that rebuilds features for the
# fitted model must stack them in this same order.
feature_matrix = hstack((subject_vectors, body_vectors, sender_vectors))

In [89]:
# Check which sparse container hstack produced.
type(feature_matrix)

scipy.sparse._csr.csr_matrix

In [90]:
# Spot-check the sender normaliser on a sample address.
preprocess_sender('hello@cs.iitr.ac.in')

'hello cs iitr'

In [91]:
from sklearn.preprocessing import MultiLabelBinarizer
import json

# Read the label dictionary and keep only the keys that look like
# 'Label_<digit>...'; those keys become the binarizer's fixed class list, so
# the column order of the output follows label_list.
with open('../data/label_dict.json', 'r') as file:
    all_labels = json.load(file)
label_list = [key for key in all_labels.keys() if re.match('Label_[0-9]', key)]
mlb = MultiLabelBinarizer(classes=label_list)

# Each mail's 'labels' cell is a comma-separated string; split it into a list.
labels_array = [st.split(',') for st in mails_df['labels']]

# Fit on the per-mail label lists, which is the shape the binarizer expects
# (an iterable of label collections), rather than on the flat list of label
# names. With `classes` fixed the resulting matrix is unchanged. Entries in
# labels_array that are not in label_list (e.g. 'UNREAD', 'INBOX') are not
# encoded; scikit-learn is expected to warn about them -- TODO confirm.
labels = mlb.fit_transform(labels_array)



In [92]:
# Short aliases for the feature matrix and the label matrix, plus a shape check.
X, y = feature_matrix, labels
print(X.shape, y.shape)

(1126, 588) (1126, 21)


In [93]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

# Number of clusters to fit.
k = 25

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the cluster ids depend on the installed
# version even with a fixed random_state.
model = KMeans(n_clusters=k, n_init=10, random_state=42)

# NOTE(review): this rebinds `labels`, which previously held the multi-label
# indicator matrix that `y` was taken from. From here on `labels` is the
# per-mail cluster id, so re-running the `y = labels` cell after this one
# would silently change `y`.
labels = model.fit_predict(X.toarray())
labels



array([20, 24, 20, ..., 24,  3, 10])

In [94]:
# Attach the cluster id of each mail. Assigning the array directly is
# positional and raises if the lengths differ. Wrapping it in pd.Series would
# align on index labels and silently produce NaN wherever mails_df's index is
# not exactly 0..n-1.
mails_df['label'] = labels

In [95]:
# Working view for inspecting clusters: mail id, subject, cluster id, raw labels.
new_df = mails_df.loc[:, ['id', 'subject', 'label', 'labels']]

# Persist the fitted vectorizers.
# NOTE(review): the list order here is sender, body, subject, whereas
# feature_matrix was stacked as subject, body, sender. Consumers must index
# this list by position accordingly rather than stacking it in list order.
vectorizers = [sender_tfidf, body_tfidf, subject_tfidf]
with open('../data/TfidfVectorizers.pkl', 'wb') as pkl_file:
    dill.dump(vectorizers, pkl_file)

# Round-trip check that the dump can be read back. The context managers close
# the file handles, which the bare open(...) calls never did.
# NOTE: dill/pickle files can execute arbitrary code on load; only load files
# this notebook (or another trusted source) produced.
with open('../data/TfidfVectorizers.pkl', 'rb') as pkl_file:
    vectorizers = dill.load(pkl_file)

In [96]:
# Expose each mail's row position as an explicit 'index' column. The guard
# makes the cell safe to re-run: without it a second run adds a 'level_0'
# column and a third run raises.
if 'index' not in new_df.columns:
    new_df.reset_index(inplace=True)

In [97]:
# Group the mails by cluster id, keeping the original row order inside each cluster.
sort_keys = ['label', 'index']
new_df.sort_values(by=sort_keys, inplace=True)

In [98]:
# Display the mails with their cluster id ('label') next to the raw label string.
new_df

Unnamed: 0,index,id,subject,label,labels
54,54,186cf6816cc9ee8b,Re-ETE Grades for MAN 001,0,"Label_218404524901403708,IMPORTANT,CATEGORY_PE..."
115,115,186a7db4b059dfb8,Assignments of MAN-006,0,"IMPORTANT,CATEGORY_PERSONAL,INBOX"
123,123,186a2bacfff2ba09,Re-Exam/Second Exam ETE MAN 001,0,"UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
217,217,1866cca1cc62a095,Re: MAN 001 Marks,0,"Label_218404524901403708,IMPORTANT,CATEGORY_PE..."
218,218,1866cc82f2a67e7e,MAN 001 Marks,0,"Label_218404524901403708,IMPORTANT,CATEGORY_PE..."
...,...,...,...,...,...
1031,1031,1847258cc1a6281c,Get weekly insights from the entrepreneurial w...,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX"
1037,1037,1846beda7f62cb45,Launch of Appetizer,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX"
1095,1095,184527434d8b1d80,Get weekly insights from the entrepreneurial w...,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX"
1097,1097,184523b9e7477dff,E-sports IITR | Walk-in Game Event For UG 1st ...,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX"


In [83]:
# Inspect the mails assigned to cluster 1.
# NOTE(review): the recorded output of this cell lacks the 'id' and 'labels'
# columns and has a lower execution count than the surrounding cells, so it
# looks stale -- re-run after a kernel restart.
new_df.loc[new_df['label'].eq(1)]

Unnamed: 0,index,subject,label
30,30,Get weekly insights from the entrepreneurial w...,1
102,102,Get weekly insights from the entrepreneurial w...,1
112,112,Extension of last date : International Women D...,1
114,114,"Congratulations to Prof Kirtiraj , Prof Vimal ...",1
154,154,International Women Day 2023 _ poster Competit...,1
173,173,Class 09 Program: Grab your first round of VC ...,1
242,242,Invitation to E-Summit'23,1
243,243,Exciting networking opportunity from NPCI,1
248,248,Last reminder to participate in MUN 2023,1
249,249,Register for Startup Expo (Hurry up!!),1


In [108]:
# Display the clustered mails again.
# NOTE(review): the recorded output has a 'label_list' column that no visible
# cell creates; the cell that built it may have been deleted or lies outside
# this excerpt -- confirm the notebook still runs top to bottom.
new_df

Unnamed: 0,index,id,subject,label,labels,label_list
54,54,186cf6816cc9ee8b,Re-ETE Grades for MAN 001,0,"Label_218404524901403708,IMPORTANT,CATEGORY_PE...",[MTE]
115,115,186a7db4b059dfb8,Assignments of MAN-006,0,"IMPORTANT,CATEGORY_PERSONAL,INBOX",[]
123,123,186a2bacfff2ba09,Re-Exam/Second Exam ETE MAN 001,0,"UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX",[]
217,217,1866cca1cc62a095,Re: MAN 001 Marks,0,"Label_218404524901403708,IMPORTANT,CATEGORY_PE...",[MTE]
218,218,1866cc82f2a67e7e,MAN 001 Marks,0,"Label_218404524901403708,IMPORTANT,CATEGORY_PE...",[MTE]
...,...,...,...,...,...,...
1031,1031,1847258cc1a6281c,Get weekly insights from the entrepreneurial w...,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX",[]
1037,1037,1846beda7f62cb45,Launch of Appetizer,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX",[]
1095,1095,184527434d8b1d80,Get weekly insights from the entrepreneurial w...,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX",[]
1097,1097,184523b9e7477dff,E-sports IITR | Walk-in Game Event For UG 1st ...,24,"IMPORTANT,CATEGORY_PERSONAL,INBOX",[]


In [113]:
# Count how often each 'label_list' value occurs among the mails in cluster 0.
new_df.loc[new_df['label'].eq(0), 'label_list'].value_counts()

[]             20
[MTE]          13
[Quiz]          9
[MTE, Quiz]     1
Name: label_list, dtype: int64

In [114]:
# Map every mail into cluster-distance space: one row per mail, one column per
# fitted cluster centre.
model.transform(X)

array([[2.08809653, 1.88604784, 1.73690043, ..., 2.1109709 , 2.0765076 ,
        1.78580069],
       [2.06183201, 1.9004083 , 1.70122235, ..., 2.12321288, 1.96241714,
        1.40985237],
       [1.78325749, 1.60971438, 1.41514806, ..., 1.93445369, 1.83495538,
        1.49368978],
       ...,
       [2.09922945, 1.93739829, 1.79519219, ..., 2.16487967, 2.13293046,
        1.73230863],
       [2.00442461, 1.92648414, 1.73258335, ..., 2.17104832, 2.07960258,
        1.82278854],
       [2.03859383, 1.89065532, 1.69112506, ..., 2.13628436, 2.06090977,
        1.75805116]])