In [83]:
import re

import joblib
import pandas as pd
from sklearn.impute import SimpleImputer

mails_df = pd.read_csv("../data/training_data.csv", sep='~', index_col=0)
imputer = SimpleImputer(strategy='constant', fill_value='')
mails_df = pd.DataFrame(imputer.fit_transform(mails_df), columns=mails_df.columns)
# mails_df.iloc[:5]

In [65]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import re
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

In [72]:

# Define a function to preprocess the text
def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]
    processed_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    return processed_text


# Define the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('subject', TfidfVectorizer(preprocessor=preprocess_text, min_df=0.01), 'subject'),
    ('body', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.8, min_df=0.05), 'body'),
    ('sender', TfidfVectorizer(ngram_range=(1, 2), lowercase=False), 'sender')
], remainder='drop')

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor)
    # ('svd', TruncatedSVD(n_components=100))
])


In [67]:
from sklearn.preprocessing import MultiLabelBinarizer
import json

with open('../data/label_dict.json', 'r') as file:
    all_labels = json.load(file)

In [68]:
label_list = [key for key in all_labels.keys() if re.match('Label_[0-9]', key)]
mlb = MultiLabelBinarizer(classes=label_list)
labels_array = [list(st.split(',')) for st in mails_df['labels']]
mlb.fit(label_list)
labels = mlb.transform(labels_array)



In [73]:
feature_matrix = pipeline.fit_transform(mails_df)

In [74]:
feature_matrix

<1126x837 sparse matrix of type '<class 'numpy.float64'>'
	with 48002 stored elements in Compressed Sparse Row format>

In [75]:
X = feature_matrix
y = labels
print(X.shape, y.shape)

(1126, 837) (1126, 21)


In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lst = [X_train, X_test, y_train, y_test]
for i in lst:
    print(i.shape, '\n')


(900, 837) 

(226, 837) 

(900, 21) 

(226, 21) 



In [78]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

binary_rel_clf = MultiOutputClassifier(RandomForestClassifier())
binary_rel_clf.fit(X_train, y_train)
br_predictions = binary_rel_clf.predict(X_test)
from sklearn.metrics import accuracy_score

accuracy_score(y_test, br_predictions)

0.9380530973451328

In [79]:

# to check how much the model is overfitting
test_prediction = binary_rel_clf.predict(X_train)
accuracy_score(y_train, test_prediction)

1.0

In [80]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_predictions = knn_clf.predict(X_test)
accuracy_score(y_test, knn_predictions)

0.915929203539823

In [81]:
knn_train_pr = knn_clf.predict(X_train)
accuracy_score(y_train, knn_train_pr)

0.9522222222222222

In [82]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
knn_predictions = model.predict(X_test)
accuracy_score(y_test, knn_predictions)

0.9469026548672567