In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, re, nltk, csv, random, string, operator, gensim, warnings, logging, pickle

from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus.reader.wordnet import WordNetError

#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')
#stopwords = stopwords.words('english')

from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel

from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm, ensemble
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import io

from google.colab import files
uploaded = files.upload()

#pyLDAvis.enable_notebook()
#warnings.filterwarnings('ignore')
#%matplotlib inline

#logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
#logging.root.level = logging.INFO 

Saving email_dump.csv to email_dump.csv


In [0]:
# df = pd.read_csv("C:\Users\GA00612119\Desktop\ML\email_dump.csv", encoding = 'unicode_escape')
df = pd.read_csv(io.BytesIO(uploaded['email_dump.csv']),encoding = 'unicode_escape')

In [18]:
df

Unnamed: 0,Subject,Category
0,Daily Buzz : Last Date for Data Science Hackat...,Learning Bytes
1,International Women's Day 2019 | March 8th Cel...,Social
2,Happy Women's Day!!,Social
3,Second Level Escalation email: MFG NDA-Submission,Important
4,Happy Women's Day,Social
5,#BalanceForBetter Happy Women's Day,Social
6,Reminder: Assigned activity NAD 101 to be comp...,Important
7,Learning Assignment Reminder: Rise with Dice D...,Important
8,Reminder: Assigned activity Automation Enginee...,Important
9,Reminder: Assigned activity Code of Ethical Bu...,Important


In [0]:
train_x, valid_x, train_y, valid_y = train_test_split(df['Subject'], df['Category'],train_size = 0.8,random_state = 1)

In [20]:
tf_vect = TfidfVectorizer(max_df = 1.0, min_df = 1)
tf_vect.fit(df['Subject'])

train_x_tf =  tf_vect.transform(train_x)
valid_x_tf =  tf_vect.transform(valid_x)

encoder = preprocessing.LabelEncoder()
train_y_count = encoder.fit_transform(train_y)
valid_y_count = encoder.fit_transform(valid_y)

train_y_map = dict(zip(encoder.transform(encoder.classes_),encoder.classes_))
valid_y_map = dict(zip(encoder.transform(encoder.classes_),encoder.classes_))

print('train_y_map :',train_y_map,)
print('valid_y_map :',valid_y_map)


train_y_map : {0: 'Important', 1: 'Learning Bytes', 2: 'Social'}
valid_y_map : {0: 'Important', 1: 'Learning Bytes', 2: 'Social'}


In [0]:
grid_params = {'n_neighbors': [3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41],\
              'metric': ['euclidean', 'manhattan'] }

In [0]:
gs = GridSearchCV(KNeighborsClassifier(),\
                  grid_params,\
                  verbose = 1,\
                  cv = 3,\
                  n_jobs = -1)

In [23]:
gs.fit(train_x_tf, train_y_count)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23,
                                         25, 27, 29, 31, 33, 35, 37, 39, 41]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [0]:
pred = gs.predict(valid_x_tf)

In [0]:
acc = accuracy_score(pred, valid_y_count)*100 

In [26]:
print ("********GRID SEARCH RESULTS********")

print("GS Best Score: " + str(gs.best_score_))
print("GS Best Estimator: " + str(gs.best_estimator_))
print("GS Best Params: " + str(gs.best_params_))


********GRID SEARCH RESULTS********
GS Best Score: 0.9096509240246407
GS Best Estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
GS Best Params: {'metric': 'euclidean', 'n_neighbors': 3}


In [27]:
print("********MODEL RESULTS*********")
print("Accuracy: " + str(acc) + " %")
print("------------------------------------------------------------------")
print("Confusion Matrix: ")
print(confusion_matrix(valid_y_count, pred))
print("------------------------------------------------------------------")
print("Classification Report: ") 
print(classification_report(valid_y_count, pred))
print("------------------------------------------------------------------")

********MODEL RESULTS*********
Accuracy: 87.70491803278688 %
------------------------------------------------------------------
Confusion Matrix: 
[[54  1  1]
 [ 4 43  1]
 [ 2  6 10]]
------------------------------------------------------------------
Classification Report: 
              precision    recall  f1-score   support

           0       0.90      0.96      0.93        56
           1       0.86      0.90      0.88        48
           2       0.83      0.56      0.67        18

    accuracy                           0.88       122
   macro avg       0.86      0.81      0.83       122
weighted avg       0.87      0.88      0.87       122

------------------------------------------------------------------
