# Directory Watcher

This code watches a given directory. Whenever a new email file is created, it loads the trained model to predict the email's output class. The email is then moved to the folder for that class, and an entry is appended to the CSV file of processed emails.

(TBD: Use DB instead of csv)

Files required in the same directory:
- glove.6B.50d.txt
- model.json
- best_model.h5
- processedemails.csv

 ## Initial Setup

In [1]:
import numpy as np
import pandas as pd
import re
import csv
# NLP
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer

from keras.layers import *
from keras.models import Sequential
from keras.models import model_from_json

import sys
import os
import shutil
import time
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Cleaning and vectorization of Email

In [2]:
def clean(text):
    """Normalise raw email text into a space-separated string of lemmas.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, stop words are dropped, and each
    remaining token is lemmatised with WordNet.

    Args:
        text: Raw email content.

    Returns:
        The cleaned tokens joined by single spaces (empty string if nothing
        survives the filtering).
    """
    stop = set(stopwords.words('english'))
    # Tokens are lower-case single words by the time they are compared, so
    # the "ID" and "thank you" entries can never match; the list is kept
    # unchanged so the preprocessing stays exactly as it was.
    stop.update(("to","cc","subject","http","from", "gbp", "usd", "eur", "cad", "sent","thanks", "acc", "ID", "account", "regards", "hi", "hello", "thank you"))
    lemma = WordNetLemmatizer()

    # After this substitution only ASCII letters and spaces remain, so no
    # separate digit or punctuation filtering is needed afterwards.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.rstrip())
    kept_tokens = [tok for tok in letters_only.lower().split() if tok not in stop]

    return " ".join(lemma.lemmatize(tok) for tok in kept_tokens)

# Word -> embedding vector lookup, filled from the GloVe text file.
# Each line holds a word followed by its vector components.
embeddings = {}
with open('./glove.6B.50d.txt',encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coeffs = np.asarray(values[1:],dtype='float32')

        embeddings[word] = coeffs
# The with-block closes the file; no explicit close() is needed.
print(len(embeddings))

def getOutputEmbeddings(X):
    """Turn a cleaned email string into a (1, 100, 50) embedding array.

    The text is split on whitespace and the first 100 tokens are looked up
    (lower-cased) in the module-level ``embeddings`` table. Row ``i`` of the
    result holds the vector of token ``i``. Rows for tokens that are not in
    the table, and rows beyond the end of the text, stay all-zero.

    Args:
        X: Whitespace-separated text.

    Returns:
        A ``numpy`` array of shape ``(1, 100, 50)``.
    """
    tokens = X.split()
    embedding_matrix_output = np.zeros((1,100,50))
    for jx, token in enumerate(tokens[:100]):
        vector = embeddings.get(token.lower())
        # An out-of-vocabulary word must not abort processing of the email;
        # it simply keeps its zero row.
        if vector is not None:
            embedding_matrix_output[0][jx] = vector

    return embedding_matrix_output


400000


## Class Labels used for Model Training

In [3]:
# Class labels, looked up by the class index the model predicts.
# The list (and its order) depends on which model is loaded.
classes = [
    'BankFailed',
    'BankProgress',
    'BankComplete',
    'BankRequest',
    'ClientProgress',
    'ClientStatus',
    'ClientComplete',
    'ClientFailed',
]

## Email Processing

In [4]:
def HandleNewEmail(mail_path):
    """Classify one email file, log the result, and move it to its class folder.

    Reads the whole file at ``mail_path``, cleans and embeds the text, loads
    the model from ``model.json`` / ``best_model.h5``, predicts a class,
    appends ``[raw text, class label]`` as one row to ``processedemails.csv``
    and finally moves the file into a directory named after the label.

    Args:
        mail_path: Path of the email file to process.
    """
    # Read the email as a single string; the context manager closes the
    # file even if reading fails.
    with open(mail_path, "r") as email_file:
        test_str = email_file.read()
    print(f"Original input --> {test_str}")

    #clean email
    clean_test_str = clean(test_str)
    print(f"Cleaned input --> {clean_test_str}")
    emb_X = getOutputEmbeddings(clean_test_str)

    #load model (architecture and weights are re-read from disk on every call)
    with open("model.json", "r") as file:
        model=model_from_json(file.read())
    model.load_weights("best_model.h5")
    #model.summary()
    # NOTE(review): predict_classes is not available in recent Keras/TF
    # releases; confirm the installed version before upgrading.
    p = model.predict_classes(emb_X)
    #print (p.shape)
    predicted_class = classes[p[0]]
    print(f'Output --> class {predicted_class} ')

    #add email content and predicted class to processedemails.csv
    # newline='' lets the csv module control line endings, which avoids
    # blank rows on Windows and keeps embedded newlines intact.
    with open(r'processedemails.csv', 'a', newline='') as f:
        csv.writer(f).writerow([test_str, predicted_class])

    moveEmail(mail_path, predicted_class)
    print("\n\n")
    
def moveEmail(mail_path, outputdir):
    """Move the file at ``mail_path`` into ``outputdir``.

    The directory is created first (and reported on stdout) when it does
    not exist yet.

    Args:
        mail_path: Path of the email file to move.
        outputdir: Destination directory, named after the predicted class.
    """
    # Create the class directory on first use
    if not os.path.isdir(outputdir):
        os.makedirs(outputdir)
        print("created folder : ", outputdir)

    # File the email under its class directory
    shutil.move(mail_path, outputdir)
    


## Directory Watcher

In [6]:
def on_created(event):
    """Callback for creation events: announce the new path, then process it."""
    new_mail_path = event.src_path
    print(f"New email {new_mail_path} received!")
    HandleNewEmail(new_mail_path)

if __name__ == "__main__":
    # Match every file name. Patterns are given as a list, and the handler
    # is configured with keyword arguments so each setting is explicit.
    patterns = ["*"]
    ignore_patterns = None
    # Skip directory events: HandleNewEmail opens the event path as a file,
    # so a directory created inside the watched folder would make it fail.
    ignore_directories = True
    case_sensitive = True
    my_event_handler = PatternMatchingEventHandler(
        patterns=patterns,
        ignore_patterns=ignore_patterns,
        ignore_directories=ignore_directories,
        case_sensitive=case_sensitive,
    )
    my_event_handler.on_created = on_created
    #new emails will be created in inputEmails directory
    path = "inputEmails"
    #path = sys.argv[1] if len(sys.argv) > 1 else 'inputEmails'
    go_recursively = False
    my_observer = Observer()
    my_observer.schedule(my_event_handler, path, recursive=go_recursively)
    my_observer.start()
    print('====> Observer Started')
    try:
        # Keep the main thread alive while the observer runs; Ctrl+C
        # (or interrupting the notebook kernel) ends the loop.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        my_observer.stop()
        print('====> Observer Stopped')
        # Wait for the observer thread to finish before leaving the cell
        my_observer.join()

====> Observer Started
New email inputEmails\testemail.txt received!
Original input --> Payment of 471862128 CAD to account id 101165 has been made on 19/02/2020 and is in progress.
please acknowledge.
Thanks!

Cleaned input --> payment id made progress please acknowledge
Instructions for updating:
Colocations handled automatically by placer.
Output --> class BankFailed 



====> Observer Stopped


In [None]:
#df = pd.DataFrame(data, columns = ['Email' ,'Class'])
#df.head()

In [None]:
#df.to_csv(r'C:\Users\divya\Downloads\Email-Tracker-master\processedemails.csv', index = False)


In [None]:
#HandleNewEmail("t.txt")

In [9]:
# Load the log of processed emails and display its last row.
df=pd.read_csv("processedemails.csv")
#df.drop(["index"],axis=1,inplace=True)
df.tail(1)

Unnamed: 0,Email,Class
1,Payment of 471862128 CAD to account id 101165 ...,BankFailed
