#Sample Script to Sort Emails
Sources: https://towardsdatascience.com/how-i-used-machine-learning-to-classify-emails-and-turn-them-into-insights-efed37c1e66, https://towardsdatascience.com/how-i-used-machine-learning-to-classify-emails-and-turn-them-into-insights-part-2-6a8f26477c86, https://www.bogotobogo.com/python/NLTK/tf_idf_with_scikit-learn_NLTK.php


# Import packages and set pandas environment settings

In [None]:
import win32com.client, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text
import re
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns
import nltk
from nltk.stem import WordNetLemmatizer


# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set_theme()


# Display every row and column of a DataFrame (None removes the limit) and
# allow up to 500 characters per cell before the text is truncated.
pd.set_option("display.max_rows", None, "display.max_columns", None,'max_colwidth',500)

# Initialize Outlook and access the inbox

In [None]:
# Value of olFolderInbox in Outlook's OlDefaultFolders enumeration. Pass the
# value of another member to GetDefaultFolder to read a different folder.
OL_FOLDER_INBOX = 6

# Connect to the running (or newly started) Outlook instance via COM and
# obtain the MAPI namespace that exposes the mail folders.
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

inbox = outlook.GetDefaultFolder(OL_FOLDER_INBOX)
messages = inbox.Items
message = messages.GetFirst()

# Sanity check: show the categories of the first item. GetFirst() yields None
# when the folder is empty, so guard the attribute access instead of failing
# with AttributeError.
if message is None:
    print("Inbox is empty")
else:
    print(message.Categories)

# Iterate over the inbox and collect a tuple of metadata and body text for each email

In [None]:
# One tuple per readable inbox item:
# (EntryID, sender address, received time, subject, body, parent folder, class label).
emaillist = []
# Number of inbox items that could not be read and were left out of emaillist.
skipped = 0

for message in messages:
    try:
        email = (
            message.EntryID,
            message.SenderEmailAddress,
            message.ReceivedTime,
            message.subject,
            message.body,
            message.Parent,
            "unknown",
        )
    except Exception:
        # Best effort: an item that does not expose every property above is
        # skipped, but counted so the loss is visible. Catching Exception
        # (rather than a bare except) lets KeyboardInterrupt stop the loop.
        skipped += 1
    else:
        emaillist.append(email)

if skipped:
    print(f"Skipped {skipped} inbox item(s) that could not be read")


# Print raw email at index 1 to confirm script worked

In [None]:
# Index 1 is the second collected tuple; this raises IndexError when fewer
# than two emails were read.
print(emaillist[1])

# Create Pandas dataframe and load emaillist, drop empty data, and set date field to datetime type

In [None]:
# Load the collected tuples into a DataFrame with one named column per field.
pdraw = pd.DataFrame(emaillist,columns=['EmailID','From','Date','Subject','Body','Parent','Class'])
# Store the received time as a pandas datetime so it can be sorted and filtered.
pdraw['Date']=pdraw['Date'].astype('datetime64[ns]')
# dropna() returns a new frame and leaves its input untouched, so the result
# must be assigned for the all-empty columns to actually be removed.
pdraw = pdraw.dropna(how='all', axis=1)
                        

# Process body of email
1: Split email and take top, ignoring replies
2: Remove ministry info, eg ENV:XX
3: Remove urls
4: Remove email addresses
5: Remove special characters
6: Remove standalone numbers
7: Remove extra spaces

In [None]:
# Marker that separates the newest message from the quoted reply chain.
_REPLY_SEPARATOR = '\r\n\r\n \r\n\r\nFrom'

# Cleaning patterns, applied in this order; every match is replaced by a
# single space. Compiled once instead of on every row.
_BODY_PATTERNS = (
    re.compile(r'\S*:\S*\s?'),     # any token containing ':' (e.g. ENV:XX)
    re.compile(r'\S*-\S-\S*\s?'),  # any token containing a '-<char>-' sequence
    # URLs. The previous pattern was anchored to the start of the string; note
    # that the ':' rule above already consumes every URL token, so this step
    # only matters if the order of the rules is changed.
    re.compile(r'https?://\S+'),
    re.compile(r'\S*@\S*\s?'),     # email addresses
    re.compile(r'[^a-zA-Z0-9]+'),  # everything except letters and digits
    # Standalone numbers. \b matches without consuming the neighbouring space,
    # so consecutive numbers ("12 34") are all removed; the previous
    # whitespace-delimited pattern skipped every second one.
    re.compile(r'\b\d+\b'),
    re.compile(r' +'),             # collapse runs of spaces
)


def _clean_body(body):
    """Return the newest message of *body* with noise tokens stripped.

    Args:
        body: Raw email body. Anything that is not a string (for example a
            missing value) is treated as an empty body.

    Returns:
        str: The text before the first reply separator, with each pattern in
        ``_BODY_PATTERNS`` replaced by a single space.
    """
    if not isinstance(body, str):
        return ''
    cleaned = body.split(_REPLY_SEPARATOR)[0]
    for pattern in _BODY_PATTERNS:
        cleaned = pattern.sub(' ', cleaned)
    return cleaned


pdraw['Body'] = pdraw['Body'].apply(_clean_body)


In [None]:
# Preview the first 15 rows to check the cleaned bodies.
pdraw.head(15)

# Create stop word list, vectorize words to determine frequency

In [None]:
def tokenize(text):
    """Split *text* into word tokens and reduce each one to its WordNet lemma.

    Args:
        text: String to tokenize.

    Returns:
        list: One lemma per token returned by ``nltk.word_tokenize``, in the
        original token order.
    """
    # WordNetLemmatizer provides lemmatize(), not stem(); the old call raised
    # AttributeError. Build the lemmatizer once rather than once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

In [None]:
# Extra stop words for this mailbox, added on top of scikit-learn's English list.
custom_stopwords = ['deepa','brett','corey','jackie','gillian','auger','external','filatow','erwin']

# Recent scikit-learn releases validate stop_words and accept only 'english',
# a list, or None, so the frozenset union is converted to a (sorted, hence
# reproducible) list.
stopwords = sorted(text.ENGLISH_STOP_WORDS.union(custom_stopwords))

#dictionary = ['gis','soil','course','map','job']

# Word-level TF-IDF with accents folded to their ASCII equivalents.
vectorizer = TfidfVectorizer(stop_words=stopwords,analyzer='word',strip_accents='unicode')

# Sparse document-term matrix: one row per email body, one column per term.
x= vectorizer.fit_transform(pdraw['Body'])

# Graph results of vectorization

In [None]:
# toarray() yields a plain ndarray. todense() returns np.matrix, which recent
# scikit-learn estimators refuse as input.
x_dense = x.toarray()
# Project the TF-IDF vectors onto their first two principal components.
coords = PCA(n_components=2).fit_transform(x_dense)

# One magenta point per email.
plt.scatter(coords[:,0], coords[:,1],c='m')

# K-means clustering of the scatter plot above

In [None]:
# Number of clusters to fit.
k=3

# Fit k-means on the dense TF-IDF matrix with a fixed seed for repeatability.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(x_dense)

# Colour the 2-D PCA projection by the cluster assigned to each email.
y_means = kmeans.predict(x_dense)
scatter = plt.scatter(coords[:,0], coords[:,1],c=y_means)
plt.colorbar(scatter,spacing='uniform')
# NOTE(review): set after the scatter is drawn, so it presumably only affects
# figures created later - confirm.
plt.rcParams['figure.dpi']=100

# Map each pdraw row label to its cluster label.
cluster_map = pd.DataFrame({
    'data_index': pdraw.index.values,
    'cluster': kmeans.labels_,
})



# Join Clustering to pdraw

In [None]:
# Index-aligned join of the emails with their cluster labels. The two frames
# share no column names, so no suffixes are needed; lsuffix/rsuffix expect
# strings and were previously (harmlessly) given arrays.
pdcluster = pdraw.join(cluster_map)
# Displayed by the notebook only; pdcluster itself keeps its original order.
pdcluster.sort_values(['cluster'])

# Define function to get most frequent words from each email

In [None]:
def top_tfidf_feats(row, features, top_n=20):
    """Return the ``top_n`` highest-scoring features of one score vector.

    Args:
        row: 1-D array of scores, one per feature.
        features: Sequence of feature names, index-aligned with ``row``.
        top_n: Maximum number of features to return.

    Returns:
        pandas.DataFrame: Columns ``features`` and ``score``, ordered from the
        highest score to the lowest.
    """
    # argsort is ascending, so reverse it before taking the leading entries.
    ranked_ids = np.argsort(row)[::-1][:top_n]
    names = [features[idx] for idx in ranked_ids]
    scores = [row[idx] for idx in ranked_ids]
    return pd.DataFrame(list(zip(names, scores)), columns=['features', 'score'])

def top_feats_in_doc(X, features, row_id, top_n=25):
    """Return the highest-scoring features of a single document.

    Args:
        X: Sparse document-term matrix (rows must support ``.toarray()``).
        features: Sequence of feature names, index-aligned with X's columns.
        row_id: Row of ``X`` to inspect.
        top_n: Maximum number of features to return.

    Returns:
        pandas.DataFrame: The result of ``top_tfidf_feats`` for that row.
    """
    # Densify just the requested row, then drop the length-1 axis.
    doc_scores = X[row_id].toarray()
    return top_tfidf_feats(np.squeeze(doc_scores), features, top_n)

# Returns top X most frequent words

In [None]:
# get_feature_names() was deprecated and later removed from scikit-learn;
# prefer its replacement and fall back only on installs that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# Ten highest-scoring terms of the email in row 1.
print (top_feats_in_doc(x, features, 1, 10))

# Define function to aggregate top words in all emails

In [None]:
def top_mean_feats(X, features,grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the features with the highest mean score across documents.

    Args:
        X: Sparse document-term matrix.
        features: Sequence of feature names, index-aligned with X's columns.
        grp_ids: Row index or indices to restrict the average to. ``None`` or
            an empty selection averages over every row.
        min_tfidf: Scores below this threshold count as 0 in the average.
        top_n: Maximum number of features to return.

    Returns:
        pandas.DataFrame: The result of ``top_tfidf_feats`` for the mean
        score vector.
    """
    # Test for "no selection" explicitly: plain truthiness raises for NumPy
    # arrays with several elements and wrongly ignores a scalar row id of 0.
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = X[grp_ids].toarray()
    else:
        D = X.toarray()

    # toarray() produced a fresh dense copy, so zeroing in place leaves X untouched.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

# Print top 50 results in all emails

In [None]:
# Top 50 terms by mean score across all emails, using the default threshold.
print (top_mean_feats(x, features, top_n=50))

# Vectorize a query term with the fitted vectorizer and compute its cosine similarity to every email. Emails that are closer to the query get a higher score

In [None]:
# Search term whose related emails will be tagged.
query = 'bctw'

# Project the query into the vocabulary learned by the fitted vectorizer.
vec_query = vectorizer.transform([query])

# Dot product of the query vector with every email vector, flattened to 1-D.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised, which is presumably the vectorizer default - confirm.
cosine_sim = linear_kernel(vec_query,x_dense).flatten()

# Number of emails with a non-zero similarity to the query.
# NOTE(review): this rebinds `x`, which until now held the TF-IDF matrix;
# earlier cells that read `x` fail if re-run before the vectorizer cell.
x=np.count_nonzero(cosine_sim)


In [None]:
# One similarity score per email, in pdraw row order.
print(cosine_sim)

# Print index of emails that are related to search term

In [None]:
# Count the emails that actually match, then take that many indices from the
# descending ranking. The previous slice `[:-x:-1]` returned one index too
# few, nothing at all for a single match, and almost every email when there
# were no matches.
n_related = np.count_nonzero(cosine_sim)
related_email_indices = cosine_sim.argsort()[::-1][:n_related]
print(related_email_indices)

#  Create Dictionary with emailid and queryword

In [None]:
# Map the Outlook EntryID of every related email to the query term that matched it.
emaildict = {
    pdraw.loc[row_label, 'EmailID']: query
    for row_label in related_email_indices
}

In [None]:
# Tag every matching inbox item with its query term.
for message in messages:
    # Read EntryID once and use a dictionary lookup instead of comparing it
    # against every key in turn.
    category = emaildict.get(message.EntryID)
    if category is not None:
        # Replaces whatever categories the item already had.
        message.Categories = category
        message.Save()



# For related emails, print details

In [None]:
# Convert the frame to an array once instead of on every iteration.
rows = pdraw.values
# Print the full row of each related email, most similar first.
for emails in related_email_indices:
    print(rows[emails])