# Few-Shot Learning Email Classification with Pre-Trained Word2Vec Embeddings

In [1]:
import pandas as pd
import numpy as np
from random import seed
from random import sample
from wordfile import func
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import joblib
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from scipy import spatial
import os
import spacy
import en_core_web_sm

In [2]:
embeddings_index = {}
with open('/home/aheli/glove.6B.300d.txt',encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:],dtype='float32')
        embeddings_index[word] = coeffs
    f.close()

In [3]:
# from google.colab import files
# uploaded = files.upload()

In [152]:
seed(40) 
np.random.seed(40)

In [207]:
df = pd.read_csv("/home/aheli/Downloads/emaildataset.csv", usecols = ['Subject','Body', 'Class'])
df = df.replace(np.nan, '', regex=True)

In [208]:
df = df.sample(frac=1, random_state=10).reset_index(drop=True) # 10/12 rs=1, 11/13 rs=10

In [209]:
df.tail(50)

Unnamed: 0,Subject,Body,Class
954,Transaction 347606 is complete,I deeply appreciate your quick service as I ha...,Complete
955,Sorted out the transaction with ID : 686668,"To whom it may concern, I have successfully re...",Complete
956,Partially paid the required amount for transac...,"Hey, sincere apologies for transferring a frac...",Pending
957,Fulfilled transaction having ID : 860755,Hello! Sincere greetings for the day. I would ...,Complete
958,Finalized transaction of ID : 571919,I deeply appreciate your quick service as I ha...,Complete
959,Transaction 976478 stalled and payment not rec...,"Hey, I see my transaction with ID 976478 has f...",Failed
960,Payment received for transaction : 809031 and ...,Thank you for transferring the payment to my a...,Processing
961,Dealing with the transaction 045840,This is in response to your email notifying ab...,Processing
962,Payment done and Transaction 787228 settled.,"To whom it may concern, I have successfully re...",Complete
963,Partial payment for transaction 547345,I regret to inform you the I could only pay th...,Pending


In [159]:
nlp = spacy.load('en')

In [160]:
my_stop = ['\'d', '\'ll', '\'m', '\'re', '\'s', 'a','cc','subject','http', 'gbp', 'usd', 'eur', 'inr', 'cad', 'thanks', 'acc', 'id', 'account', 'regards', 'hi', 'hello', 'thank you', 'greetings', 'about','above', 'across','after','afterwards','alone','along','among', 'amongst','amount','an','and','another','any','anyhow','anyone','anything','anyway','anywhere','around','as', 'at','because','before','beforehand','behind','below', 'beside','besides','between','both','bottom','but','by','ca','call','can','could','did', 'do', 'does', 'doing', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'everyone', 'everything', 'everywhere', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'further', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'if', 'indeed', 'into', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'many', 'may', 'me', 'meanwhile', 'might', 'mine', 'more', 'moreover', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'one', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'own', 'per', 'perhaps', 'please', 'quite', 'rather', 're', 'really', 'regarding', 'same','she', 'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'somewhere', 'such', 'ten', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'third', 'this', 'those', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'up', 'upon', 'us', 'using', 'various', 'via', 'we', 'well', 'whatever', 'whence', 'whenever', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'whoever', 'whole', 'whom', 'whose', 'with', 'within', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', '\'m', '\'re', '’s']

In [161]:
def get_only_chars(text):    
    text = text.replace("-", " ") #replace hyphens with spaces
    text = text.replace("\t", " ")
    text = text.replace("\n", " ")
    text = text.replace("n't", " not")
    text = text.replace("l've", "l have")
    text = text.replace("d've", "d have")

    
    text = nlp(text)
    text = " ".join(token.orth_ for token in text if not token.is_punct | token.is_space)
    t = ""

    for i in text.lower().split():
        if func(i) is not None:
            t += func(i) + " "
        else :
            t += i + " "

    t = t.rstrip()
    text = " ".join([i for i in t.lower().split() if i not in my_stop])
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = " ".join([i for i in text.split() if len(i) != 1])
    
    return text

In [162]:
print(get_only_chars("won't can't apple"))

wo not not apple


In [210]:
for i in range(df.shape[0]):
    # merge subject and body strings
    df['Text'] = (df['Subject'] + " " + df['Body'])

In [211]:
def converter(x):
    try:
        return ' '.join([x.lower() for x in str(x).split()])
    except AttributeError:
        return None  # or some other value

df['Text'] = df['Text'].apply(converter)

In [212]:
df['Text'] = df['Text'].apply(lambda x: get_only_chars(x))

In [213]:
df = df.drop_duplicates('Text')

In [214]:
df.shape

(420, 4)

In [100]:
df.Class.value_counts()

Request       81
General       75
Pending       67
Complete      57
Failed        53
Processing    45
CreditCard    42
Name: Class, dtype: int64

In [101]:
# set the by default to:
num_classes = df.Class.unique() # the number of classes we consider (since the dataset has many classes)
sample_size = 2 # the number of labeled sampled we’ll require from the user
print(num_classes)

['Pending' 'General' 'Processing' 'Request' 'Complete' 'Failed'
 'CreditCard']


In [168]:
# Generate samples that contains K samples of each class

def gen_sample(sample_size, num_classes):
    
    # df.sample(frac=1).reset_index(drop=True)

    df_1 = df[(df["Class"] < num_classes)].reset_index().drop(["index"], axis=1).reset_index().drop(["index"], axis=1)
    
    train = df_1[df_1["Class"] == np.unique(df_1['Class'])[0]].sample(sample_size)
#     return train
    train_index = train.index.tolist()

    for i in range(1,num_classes):
        train_2 = df_1[df_1["Class"] == np.unique(df_1['Class'])[i]].sample(sample_size)
        train = pd.concat([train, train_2], axis=0)
        train_index.extend(train_2.index.tolist())

    test = df_1[~df_1.index.isin(train_index)]
    return train, test

In [229]:
from sklearn.preprocessing import LabelEncoder 

le = LabelEncoder()
df['Class'] = le.fit_transform(df['Class'])

In [230]:
df.head(20)

Unnamed: 0,Subject,Body,Class,Text
0,Handling the transaction 128874 after payment.,This is in response to your email notifying ab...,5,handling transaction payment is in response em...
1,Received full payment for transaction no. 338874,"Hey, I am writing in reference to the transact...",0,received full payment transaction hey am writi...
2,How to change PIN no of ATM card?,"Hey, Can you please send me the detailed steps...",3,change pin atm card hey send detailed steps ou...
3,Sorted out the transaction with ID : 619734,Greetings! I wanted to let you know that I hav...,0,sorted transaction wanted let know have acknow...
4,Transaction 118350 stalled and payment not rec...,This is in response to your email stating that...,2,transaction stalled payment not received is in...
5,Request to send details of transaction 980173,I want to have a detailed view of my account. ...,6,request send details transaction want have det...
6,Handling the transaction 205235 after payment.,Acknowledging the received payment for transac...,5,handling transaction payment acknowledging rec...
7,Partially paid the required amount for transac...,There has been only a partial payment of amoun...,4,partially paid required transaction has been o...
8,Finalized transaction of ID : 239797,"Hey, I am writing in reference to the transact...",0,finalized transaction hey am writing in refere...
9,,This is to inform you that I am undergoing a f...,1,is inform am undergoing financial issue am una...


In [231]:
df.Class.value_counts()

6    81
3    75
4    67
0    57
2    53
5    45
1    42
Name: Class, dtype: int64

In [232]:
for i in range(0, len(df.Class.unique())):
    print(i)
    print(le.inverse_transform([i]))

0
['Complete']
1
['CreditCard']
2
['Failed']
3
['General']
4
['Pending']
5
['Processing']
6
['Request']


In [233]:
# train, test = gen_sample(30, 7)

In [234]:
# train.Class.value_counts()

In [235]:
# test.Class.value_counts()

In [236]:
def transform_sentence(text, embeddings_index):

    def preprocess_text(raw_text, model=embeddings_index):

        raw_text = raw_text.split()
        return list(filter(lambda x: x in embeddings_index.keys(), raw_text))

    tokens = preprocess_text(text)

    if not tokens:
        return np.zeros(300)

    c = [embeddings_index[i] for i in tokens]
    text_vector = np.mean(c, axis=0)
    return np.array(text_vector)

In [237]:
if not os.path.exists('/home/aheli/SmartEmailTracker/Merged UI_Listener/pkl_objects'):
        os.mkdir('/home/aheli/SmartEmailTracker/Merged UI_Listener/pkl_objects')
    
joblib.dump(le, '/home/aheli/SmartEmailTracker/Merged UI_Listener/pkl_objects/labelencoder3July.pkl')

['/home/aheli/SmartEmailTracker/Merged UI_Listener/pkl_objects/labelencoder3July.pkl']

## Pre-trained Glove embeddings and ML algorithms

In [238]:
# Install with below cell if you're not able to install on terminal

# import sys
# !{sys.executable} -m pip install xgboost

In [239]:
import xgboost

In [240]:
# def return_score_xgb(sample_size, num_classes, df):

#     train, test = gen_sample(sample_size, num_classes, df=df)

#     X_train = train['Text'].values
#     y_train = train['Class'].values
#     X_test = test['Text'].values
#     y_test = test['Class'].values

#     X_train_mean = np.array([transform_sentence(x, embeddings_index) for x in X_train])
#     X_test_mean = np.array([transform_sentence(x, embeddings_index) for x in X_test])

# #     XG Boost
#     clf = xgboost.XGBClassifier()
    
#     eval_set = [(X_train_mean, y_train), (X_test_mean, y_test)]
#     eval_metric = ["auc","error", "logloss"]
#     %time clf.fit(X_train_mean, y_train, early_stopping_rounds=10, eval_metric="merror", eval_set=eval_set, verbose=True)
# #     clf.fit(X_train_mean, y_train)

#     joblib.dump(clf, './clf.pkl')

#     y_pred = clf.predict(X_test_mean)
    
#     # evaluate predictions
#     accuracy = accuracy_score(y_pred, y_test)
#     print("Accuracy: %.2f%%" % (accuracy * 100.0))

#     return accuracy_score(y_pred, y_test)

In [241]:
def return_score_xgb(sample_size, num_classes):

    train, test = gen_sample(sample_size, num_classes)

    X_train = train['Text'].values
    y_train = train['Class'].values
    X_test = test['Text'].values
    y_test = test['Class'].values

    X_train_mean = np.array([transform_sentence(x, embeddings_index) for x in X_train])
    X_test_mean = np.array([transform_sentence(x, embeddings_index) for x in X_test])

#     XG Boost
    clf = xgboost.XGBClassifier()
    clf.fit(X_train_mean, y_train)
    
    joblib.dump(clf, './clf.pkl')

    y_pred = clf.predict(X_test_mean)
    
#     evaluate predictions
    accuracy = accuracy_score(y_pred, y_test)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    return accuracy_score(y_pred, y_test)

In [242]:
# all_accuracy_xgb = {2:[],3:[],4:[],5:[],6:[]}

# for num_samples in range(1, 40):

#     for num_cl in range(2, 7):

#         all_accuracy_xgb[num_cl].append(return_score_xgb(num_samples,num_cl))

In [243]:
all_accuracy_xgb = {0:[]}

for num_samples in range(1, 40):

    all_accuracy_xgb[0].append(return_score_xgb(num_samples,len(df.Class.unique())))

Accuracy: 13.56%
Accuracy: 74.63%
Accuracy: 56.14%
Accuracy: 46.43%
Accuracy: 70.39%
Accuracy: 71.96%
Accuracy: 73.05%
Accuracy: 75.55%
Accuracy: 72.83%
Accuracy: 82.57%
Accuracy: 81.63%
Accuracy: 87.80%
Accuracy: 82.67%
Accuracy: 83.85%
Accuracy: 84.44%
Accuracy: 88.64%
Accuracy: 87.04%
Accuracy: 92.18%
Accuracy: 92.33%
Accuracy: 90.36%
Accuracy: 86.08%
Accuracy: 90.60%
Accuracy: 93.44%
Accuracy: 92.86%
Accuracy: 87.35%
Accuracy: 89.92%
Accuracy: 89.18%
Accuracy: 88.84%
Accuracy: 89.86%
Accuracy: 95.24%
Accuracy: 95.07%
Accuracy: 92.35%
Accuracy: 92.59%
Accuracy: 95.05%
Accuracy: 96.57%
Accuracy: 93.45%
Accuracy: 91.93%
Accuracy: 92.86%
Accuracy: 90.48%


## Summary

In [244]:
# df_results = pd.DataFrame({
    
#     'Nb Classes':[2, 3, 4, 5, 6], 

#     'mean XG Boost':[np.mean(all_accuracy_xgb[2]), 
#         np.mean(all_accuracy_xgb[3]), 
#         np.mean(all_accuracy_xgb[4]), 
#         np.mean(all_accuracy_xgb[5]),
#         np.mean(all_accuracy_xgb[6])],
#     'max XG Boost':[max(all_accuracy_xgb[2]), 
#         max(all_accuracy_xgb[3]), 
#         max(all_accuracy_xgb[4]), 
#         max(all_accuracy_xgb[5]),
#         max(all_accuracy_xgb[6])]
#     })

In [245]:
# df_results

## Testing on incoming email 

In [246]:
le = joblib.load('/home/aheli/SmartEmailTracker/Merged UI_Listener/pkl_objects/labelencoder3July.pkl')
clf = joblib.load('./clf.pkl')

In [247]:
def is_empty_sent(cd):
    all_zeros = not cd.any()
    return all_zeros

print(is_empty_sent(transform_sentence("efrg vftrg ojinc", embeddings_index)))

True


In [248]:
def find_id(sub):
    """
    extract transaction id from email (subject + body)
    """
    nums = []
    res = ''
    text = re.sub(r'[^0-9]', ' ', sub)
    sub = sub.lower()
    for t in text.split():
        try:
            nums.append(t)
        except ValueError:
            pass
    if not nums:
        res = None
        return res
        
    def func(sub, nums):
        end_idx = 0
        for i in nums:
            start_idx = sub.find(i)
            if "trans id" in sub[max(0, start_idx - 10) : start_idx]:
                return i,True
            elif "transaction id" in sub[max(0, start_idx - 16) : start_idx]:
                return i,True
            elif "number" in sub[max(0, start_idx - 8) : start_idx]:
                return i,True
            elif "no." in sub[max(0, start_idx - 5) : start_idx]:
                return i,True
            elif "num" in sub[max(0, start_idx - 5) : start_idx]:
                return i,True
            elif "id" in sub[max(0, start_idx - 4) : start_idx]:
                return i,True
        return "",False
    
    num_str, boolean = func(sub, nums)
    if boolean is True:
        return num_str
    return None


In [249]:
def find_amt(s):
    """
    extract transaction amount from email (subject + body)
    """
    nums = []
    res = ''
    text = re.sub(r'[^0-9]', ' ', s)
    s = s.lower()
    for t in text.split():
        try:
            t = " " + t + " "
            nums.append(t)
        except ValueError:
            pass
    if not nums:
        res = None
        return res
    def func(s, nums):
        end_idx = 0
        for i in nums:
            start_idx = s.find(i)
            end_idx = start_idx + len(i)
            if "usd" in s[max(0, start_idx - 5) : start_idx] or "usd" in s[end_idx : min(len(s) - 1, end_idx + 5)]:
                return i,True
            elif "cad" in s[max(0, start_idx - 5) : start_idx] or "cad" in s[end_idx : min(len(s) - 1, end_idx + 5)]:
                return i,True
            elif "inr" in s[max(0, start_idx - 5) : start_idx] or "inr" in s[end_idx : min(len(s) - 1, end_idx + 5)]:
                return i,True
            elif "gbp" in s[max(0, start_idx - 5) : start_idx] or "gbp" in s[end_idx : min(len(s) - 1, end_idx + 5)]:
                return i,True
            elif "usd" in s[max(0, start_idx - 5) : start_idx] or "usd" in s[end_idx : min(len(s) - 1, end_idx + 5)]:
                return i,True
            elif "rs" in s[max(0, start_idx - 4) : start_idx] or "rs" in s[end_idx : min(len(s) - 1, end_idx + 4)]:
                return i,True
            elif "rupees" in s[max(0, start_idx - 8) : start_idx] or "rupees" in s[end_idx : min(len(s) - 1, end_idx + 8)]:
                return i,True
            
        return "",False
    
    num_str, boolean = func(s, nums)
    if boolean is True:
        return num_str
    return None

In [250]:
def inp(emailto, emailfrom, subj, bod):
    text = subj + " " + bod
    t_id = find_id(text)
    t_amt = find_amt(text)

    text = get_only_chars(text)
    X_test_mean = np.array([transform_sentence(text, embeddings_index)])
    
    if is_empty_sent(X_test_mean) is True:
        l = ["Unable to read email.Please ensure that it is in English!"]
        return np.array(l)[0], t_id

    y_pred = clf.predict(X_test_mean)

    out = le.inverse_transform(y_pred)
    return out[0], t_id, t_amt

In [251]:
print(inp("fvf", "defrfg", "payment processed", "hi, the payment for id 1234 for usd 3456 was paid successfully."))

('Complete', '1234', ' 3456 ')


In [266]:
print(inp("cfdfv", "derftrg", "Partially paid the required amount for transaction", "There has been only a partial payment of amount 1234"))

('Complete', None, None)


In [253]:
print(inp("Jason@DEUTSCHEBANK.com", "Suhail@CitiBankPune.com", "Want to know more about being a premium customer", "Sincere greetings. I recently came across an offer saying I would have higher interest rates if I am willing to upgrade my account to a premium one. I want to base my decision on its pros as well as cons. So perhaps you could throw some light on it? That woud be very kind of you."))

('General', None, None)


In [254]:
print(inp("qwrqef", "cfdvfv", "", "Payment of 471862128 CAD to account id 101165 has been made on 19/02/2020 and is in progress, please acknowledge."))

('Processing', '101165', ' 471862128 ')


In [255]:
print(inp("RFg", "ewrdef", "what is the status of my transaction: 12345", "hello "))

('Request', None, None)


In [256]:
print(inp("abdf@", "fvfb","payment for ID 3456 done", "I'm glad to inform you that your payment was successfully transferred to the bank account."))

('Complete', '3456', None)


In [257]:
print(inp("jhnujfv", "hvgd", "ijigrtd", "kjhn sdrdsa kuyhn"))

('Unable to read email.Please ensure that it is in English!', None)


In [258]:
print(inp("RFg", "ewrdef", "payment for ID 34565 in progress", "hello you payment is being processed"))

('Processing', '34565', None)


In [259]:
print(inp("RFg", "ewrdef", "payment for ID 34565 processed", "hello your payment has been made."))

('Processing', '34565', None)


In [260]:
print(inp("", "", "your payment was processed.", "Your payment was done."))

('Complete', None, None)


In [261]:
print(inp("", "", "Your payment is being processed", "Payment is being done."))

('Processing', None, None)


In [262]:
print(inp("", "", "your payment was processed.", "Payment is done."))

('Complete', None, None)


In [204]:
print(find_id("payment for ID 34565 in progress hello payment for usd 50000 is being processed"))

34565


In [265]:
print(inp("", "", "your payment was partially paid.", "Only received 233435 usd, rest is incomplete. Last date is tomorrow."))

('Pending', None, ' 233435 ')


In [205]:
# def find_amt(s):
#     """
#     extract transaction amount from email (subject + body)
#     """
#     nums = []
#     res = ''
#     text = re.sub(r'[^0-9]', ' ', s)
#     s = s.lower()
#     for t in text.split():
#         try:
#             nums.append(t)
#         except ValueError:
#             pass
#     if not nums:
#         res = None
#         return res
        
#     def func(sub, nums):
#         for i in nums:
#             start_idx = sub.find(i)
#             end_idx = start_idx + len(i)
#             if "usd" in sub[start_idx - 5 : start_idx] or "usd" in sub[end_idx : end_idx + 5]:
#                 return i,True
#             elif "cad" in sub[start_idx - 5 : start_idx] or "cad" in sub[end_idx : end_idx + 5]:
#                 return i,True
#             elif "inr" in sub[start_idx - 5 : start_idx] or "inr" in sub[end_idx : end_idx + 5]:
#                 return i,True
#             elif "gbp" in sub[start_idx - 5 : start_idx] or "gbp" in sub[end_idx : end_idx + 5]:
#                 return i,True
#             elif "usd" in sub[start_idx - 5 : start_idx] or "usd" in sub[end_idx : end_idx + 5]:
#                 return i,True
#             elif "rs" in sub[start_idx - 4 : start_idx] or "rs" in sub[end_idx : end_idx + 4]:
#                 return i,True
#             elif "rupees" in sub[start_idx - 8 : start_idx] or "rupees" in sub[end_idx : end_idx + 8]:
#                 return i,True
#         return "",False
    
#     num_str, boolean = func(s, nums)
#     if boolean is True:
#         return num_str
#     return None

In [206]:
print(find_amt("payment for ID 34565 in progress hello payment for usd 50000 is being processed"))

 50000 


In [267]:
print(inp("", "", "Info on new credit card", "I have applied for a new credit card and havent received it yet. Please send more info."))

('CreditCard', None, None)
