In [1]:
import re

import joblib
import pandas as pd
from sklearn.impute import SimpleImputer

mails_df = pd.read_csv("../data/training_data.csv", sep='~', index_col=0)
imputer = SimpleImputer(strategy='constant', fill_value='')
mails_df = pd.DataFrame(imputer.fit_transform(mails_df), columns=mails_df.columns)

mails_df['body'] = mails_df['body'].map(lambda x: re.sub(r'[\s~]+', ' ', x).strip())
mails_df.iloc[:5]

Unnamed: 0,id,sender,receiver,subject,body,labels
0,186d6c11c13386b5,daa@iitr.ac.in,"students-notices@iitr.ac.in,staff-notices@iitr...",Invitation for the 2nd Institute Research Day ...,"Dear Students, Colleagues and All, Our Institu...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
1,186d6ab4f4407316,ccf@iitr.ac.in,"students-notices@iitr.ac.in,staff-notices@iitr...",Request to drive carefully,"Dear campus residents, On Sunday night we foun...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
2,186d5e299ff960aa,gensec.technical@iitr.ac.in,ug-1st-year@iitr.ac.in,VLG IITR Core Team Recruitments,Greetings!Missed our recruitment test? No worr...,"IMPORTANT,CATEGORY_PERSONAL,INBOX"
3,186d5295b533468f,gensec.technical@iitr.ac.in,"ug-1st-year@iitr.ac.in,ug-2nd-year@iitr.ac.in",QCG Recruitments 2023 | 18th March,"Greetings,QCG recruitments are around the corn...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"
4,186d4dec8b166664,nss@iitr.ac.in,"students-notices@iitr.ac.in,staff-notices@iitr...",Invitation for Valedictory Ceremony || Nationa...,"Dear All, Warm Greetings from Team National So...","UNREAD,IMPORTANT,CATEGORY_PERSONAL,INBOX"


In [2]:

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize the text
    tokens = text.split()
    # Remove stop words
    # tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize the words
    lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet.VERB) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet.NOUN) for token in lemmatized_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet.ADJ) for token in lemmatized_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet.ADV) for token in lemmatized_tokens]
    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.8, min_df=0.2)),
    ('svd', TruncatedSVD(n_components=50))
])

feature_matrix = pipeline.fit_transform(mails_df['body'])



In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import re
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

# Define a function to preprocess the text
def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]
    processed_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    return processed_text

# Define a sample DataFrame with email data
data = pd.DataFrame({'subject': ['Regarding our meeting'],
                     'body': ['Dear John, I hope this email finds you well. I wanted to follow up on our conversation from last week. As we discussed, I am interested in learning more about the services your company offers. Please let me know if there is any additional information you can provide. Thank you for your time. Best regards, Jane'],
                     'sender': ['jane@example.com']})

# Define the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('subject', TfidfVectorizer(preprocessor=preprocess_text, min_df=0.1), 'subject'),
    ('body', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.8, min_df=0.1), 'body'),
    ('sender', TfidfVectorizer(ngram_range=(1, 2), lowercase=False), 'sender')
], remainder='drop')

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svd', TruncatedSVD(n_components=100))
])


In [5]:
from sklearn.preprocessing import MultiLabelBinarizer
import json

with open('../data/label_dict.json', 'r') as file:
    all_labels = json.load(file)

In [6]:
label_list = [key for key in all_labels.keys() if re.match('Label_[0-9]', key)]
mlb = MultiLabelBinarizer(classes=label_list)
labels_array = [list(st.split(',')) for st in mails_df['labels']]
mlb.fit(label_list)
labels = mlb.transform(labels_array)



In [7]:
X = pd.DataFrame(mails_df)
y = pd.DataFrame(labels)
print(X.shape, y.shape)

(10, 6) (10, 21)


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lst = [X_train, X_test, y_train, y_test]
for i in lst:
    print(i.shape, '\n')


(8, 6) 

(2, 6) 

(8, 21) 

(2, 21) 



In [9]:
joblib.dump(preprocess_text, '../data/preprocess_text.pkl')

['../data/preprocess_text.pkl']

In [10]:
feature_matrix = pipeline.fit_transform(X_train)

In [None]:
pipeline.get_feature_names_out

In [13]:
import joblib
joblib.dump(pipeline, '../data/pipeline.pkl')

['../data/pipeline.pkl']

In [14]:
pipeline = joblib.load('../data/pipeline.pkl')

In [15]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

binary_rel_clf = BinaryRelevance(classifier=SVC())
binary_rel_clf.fit(X_train, y_train)
br_predictions = binary_rel_clf.predict(X_test)
from sklearn.metrics import accuracy_score

accuracy_score(y_test, br_predictions)

TypeError: no supported conversion for types: (dtype('O'),)

In [16]:

# to check how much the model is overfitting
test_prediction = binary_rel_clf.predict(X_train)
accuracy_score(y_train, test_prediction)

AttributeError: 'BinaryRelevance' object has no attribute 'model_count_'

In [None]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_predictions = knn_clf.predict(X_test)
accuracy_score(y_test, knn_predictions)

In [None]:
knn_train_pr = knn_clf.predict(X_train)
accuracy_score(y_train, knn_train_pr)

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
knn_predictions = model.predict(X_test)
accuracy_score(y_test, knn_predictions)