In [6]:
import re

import joblib
import pandas as pd
import scipy.sparse
from sklearn.impute import SimpleImputer

# Load the '~'-separated training mails; the first CSV column is read as the row index.
mails_df = pd.read_csv("../data/training_data.csv", sep='~', index_col=0)

# Replace every missing cell with the empty string so downstream text
# vectorisers never receive NaN. Rebuilding the frame from the imputed array
# keeps the column names but gives it a fresh default index (the index read
# from the CSV is not carried over).
imputer = SimpleImputer(strategy='constant', fill_value='')
imputed_values = imputer.fit_transform(mails_df)
mails_df = pd.DataFrame(imputed_values, columns=mails_df.columns)
# Debug aid: uncomment to iterate on a 10-row sample.
# mails_df = mails_df.iloc[:10]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import re

# English stop words from NLTK, held in a set for fast membership tests
# (the NLTK "stopwords" corpus must already be available locally).
stop_words = set(stopwords.words('english'))

# Single WordNet lemmatizer instance shared by the text preprocessing below.
lemmatizer = WordNetLemmatizer()

In [50]:
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, tokens found in the module-level ``stop_words`` set
    are dropped, and each remaining token is passed through the module-level
    ``lemmatizer``.

    Args:
        text: Raw string (e.g. a mail subject or body).

    Returns:
        The surviving lemmas joined by single spaces; ``''`` if none survive.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text.lower())

    lemmas = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmas)


def preprocess_sender(address):
    """Turn a sender e-mail address into space-separated tokens.

    The address is lower-cased and split on ``'@'``:

    * local part (before the first ``'@'``): ``'.'`` and ``'_'`` are removed
      so that e.g. ``hello_me`` becomes the single token ``hellome``;
    * domain part (after the first ``'@'``): the labels ``ac``, ``in`` and
      ``com`` are dropped and the remaining dots become spaces.

    Any further ``'@'``-separated pieces are appended unchanged.

    Args:
        address: Sender address string. May be empty or lack an ``'@'``
            (missing senders are imputed as ``''`` upstream); in that case
            only the local-part rule is applied instead of raising.

    Returns:
        The processed pieces joined by single spaces.
    """
    address_lst = address.lower().split('@')
    address_lst[0] = re.sub('[._]', '', address_lst[0])

    # Only touch the domain when there actually is one; '' or an address
    # without '@' would otherwise raise IndexError.
    if len(address_lst) > 1:
        # Strip whole labels only: the lookahead requires the match to end at
        # a dot or at the end of the domain, so ".india" or ".accenture" are
        # left intact instead of losing their first characters.
        address_lst[1] = re.sub('[.](?:ac|in|com)(?=[.]|$)', '', address_lst[1])
        address_lst[1] = re.sub('[.]', ' ', address_lst[1])

    return ' '.join(address_lst)

# One TF-IDF vectoriser per text column, each entry being
# (transformer name, vectoriser, source column). The transformer name becomes
# the "<name>__" prefix of the generated feature names.
text_transformers = [
    ('subject', TfidfVectorizer(preprocessor=preprocess_text, min_df=0.01), 'subject'),
    ('body', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.9, min_df=0.1), 'body'),
    ('sender', TfidfVectorizer(preprocessor=preprocess_sender), 'sender'),
]

# Columns not listed above are discarded (remainder='drop').
preprocessor = ColumnTransformer(transformers=text_transformers, remainder='drop')

# Single-step pipeline wrapping the column transformer.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [53]:
# Manual sanity check of the sender normalisation on a sample address.
preprocess_sender('hello_me@cs.iitr.ac.in')

'hellome cs iitr'

In [54]:
# Fit the vectorisers on the full mail frame and build the feature matrix.
# NOTE(review): the variable name is misspelled and the value is a SciPy sparse
# matrix rather than a DataFrame; left unchanged in case other cells use it.
prepeprocessed_df = pipeline.fit_transform(mails_df)
prepeprocessed_df

<1126x381 sparse matrix of type '<class 'numpy.float64'>'
	with 26770 stored elements in Compressed Sparse Row format>

In [55]:
# Inspect the learned vocabulary; names are prefixed with the transformer
# name followed by "__" (e.g. "subject__").
pipeline.get_feature_names_out()

array(['subject__academic', 'subject__appointment', 'subject__assignment',
       'subject__autumn', 'subject__award', 'subject__bhawan',
       'subject__call', 'subject__campus', 'subject__celebration',
       'subject__cen', 'subject__ceremony', 'subject__club',
       'subject__committee', 'subject__competition', 'subject__council',
       'subject__course', 'subject__cultural', 'subject__day',
       'subject__design', 'subject__development', 'subject__dr',
       'subject__email', 'subject__ete', 'subject__event',
       'subject__examination', 'subject__excluding', 'subject__feb',
       'subject__first', 'subject__form', 'subject__fwd',
       'subject__gentle', 'subject__grade', 'subject__guest',
       'subject__hsn', 'subject__iit', 'subject__iitr', 'subject__indian',
       'subject__institute', 'subject__inter', 'subject__intro',
       'subject__invitation', 'subject__january', 'subject__last',
       'subject__lecture', 'subject__list', 'subject__man',
       'subject__m

In [22]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded, so this cell would otherwise depend on an
# earlier cell having imported it.
import scipy.sparse

# Transform a slice of the data (rows from position 8 onward) with the
# already-fitted pipeline and confirm the result is a CSR sparse matrix.
pp1 = pipeline.transform(mails_df.iloc[8:])
isinstance(pp1, scipy.sparse.csr_matrix)

True

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
import json

# Load the label dictionary; only its keys are used below.
with open('../data/label_dict.json', 'r') as file:
    all_labels = json.load(file)

# Keep the keys that start with "Label_" followed by a digit; these define the
# columns (and column order) of the binarised target matrix.
label_list = [key for key in all_labels if re.match('Label_[0-9]', key)]

# Each row's 'labels' cell is a comma-separated string of label keys.
labels_array = [st.split(',') for st in mails_df['labels']]

# With `classes` given, the binarizer takes its classes from that list and not
# from the data, so fitting and transforming on the per-row label lists is
# equivalent to the previous fit/transform pair while passing an argument of
# the shape the API expects (an iterable of label collections, not a flat
# list of strings). Labels outside `label_list` are ignored by the transform.
mlb = MultiLabelBinarizer(classes=label_list)
labels = mlb.fit_transform(labels_array)



In [14]:
# Re-fit the preprocessing pipeline on the full mail frame and keep the
# resulting sparse TF-IDF matrix as the model input.
feature_matrix = pipeline.fit_transform(mails_df)
feature_matrix

<1126x397 sparse matrix of type '<class 'numpy.float64'>'
	with 27059 stored elements in Compressed Sparse Row format>

In [15]:
# Conventional aliases: X is the feature matrix, y the binarised label matrix.
X, y = feature_matrix, labels
print(X.shape, y.shape)

(1126, 397) (1126, 21)


In [16]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

# Cluster the TF-IDF features into k groups; random_state is fixed so the
# centroid initialisation is repeatable.
k = 25
model = KMeans(n_clusters=k, random_state=42)
# NOTE(review): this rebinds `labels`, which an earlier cell uses for the
# binarised multi-label matrix. `y` still references that matrix, but re-running
# the `y = labels` cell after this one would pick up the cluster ids instead.
labels = model.fit_predict(X)
labels



array([16, 22,  1, ...,  1, 21, 18])

In [17]:
# Quick check of the container type returned by fit_predict.
type(labels)

numpy.ndarray

In [18]:
# Attach each mail's cluster id as a new 'label' column. The array is assigned
# directly so values are matched to rows by position; wrapping it in a
# pd.Series would align on index labels instead and silently produce NaN
# whenever the frame's index is not exactly 0..n-1. A length mismatch now
# raises instead of being padded.
mails_df['label'] = labels

In [19]:
# Show each subject line next to its assigned cluster id.
mails_df[['subject', 'label']]

Unnamed: 0,subject,label
0,Video of Prof. Gaurav Raheja in conversation w...,16
1,Fwd: Institute Open Championship,22
2,Study and Scholarship opportunities in USA,1
3,Re: Request for semester 1 UG academic gradesheet,17
4,SPIC MACAY Intro Talk & Quiz,19
...,...,...
1121,Prof R J Garde Endowment Lecture,23
1122,Updates: Your List of Quarantined Emails since...,6
1123,You have been added to a team in Microsoft Teams,1
1124,Re: Regarding the updated mobile no. and room no.,21
