In [1]:
import pandas as pd
import numpy as np
from random import seed
from random import sample

seed(42)
np.random.seed(42)

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import gensim.downloader as api
from gensim.models.keyedvectors import Word2VecKeyedVectors

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from scipy import spatial

from nltk.corpus import stopwords

In [2]:
model = api.load('word2vec-google-news-300')

In [3]:
df = pd.read_csv("./emaildataset.csv", usecols = ['Subject','Body', 'Class'])
df.head()

Unnamed: 0,Subject,Body,Class
0,Transaction no. 072558 is unresolved.,Sorry to inform that there has been only a par...,Pending
1,Order for new Cheque book,"Good morning, I want to place an order for an ...",General
2,Required money acquired. Transaction 847047 is...,Hello! This is to inform you that I have recei...,Processing
3,Asking for the details for transaction 746078,I request you to kindly send the status of my ...,Request
4,Partial payment for transaction 535918,Hello!! Greetings for the day. Status of trans...,Pending


In [4]:
def get_only_chars(line):

    clean_line = ""

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]
    return clean_line

In [5]:
# subj, body to text

for i in range(df.shape[0]):
    # merge subject and body strings
    df['Text'] = (df['Subject'] + " " + df['Body'])

In [6]:
df['Text'] = df['Text'].apply(lambda x: get_only_chars(x))

In [7]:
df = df.drop_duplicates('Text')

In [8]:
# df.head()
# df.shape

In [9]:
# set the by default to :

num_classes = 6 # the number of classes we consider (since the dataset has many classes)
sample_size = 5 # the number of labeled sampled we’ll require from the user

In [10]:
from sklearn.preprocessing import LabelEncoder 

le = LabelEncoder()
# classes = df['Class'].values

# classes = classes.reshape(-1, 1)
# Y = le.fit_transform(classes)
df['Class'] = le.fit_transform(df['Class'])

In [11]:
# Generate samples that contains K samples of each class

def gen_sample(sample_size, num_classes):

    df_1 = df[(df["Class"] < num_classes + 1)].reset_index().drop(["index"], axis=1).reset_index().drop(["index"], axis=1)
    train = df_1[df_1["Class"] == np.unique(df_1['Class'])[0]].sample(sample_size)

    train_index = train.index.tolist()

    for i in range(1,num_classes):
        train_2 = df_1[df_1["Class"] == np.unique(df_1['Class'])[i]].sample(sample_size)
        train = pd.concat([train, train_2], axis=0)
        train_index.extend(train_2.index.tolist())

    test = df_1[~df_1.index.isin(train_index)]

    return train, test

In [12]:
# Apply that to the dataframe :

train, test = gen_sample(sample_size, num_classes)

X_train = train['Text']
y_train = train['Class'].values
X_test = test['Text']
y_test = test['Class'].values

In [13]:
# train.head(20)
print(train.values[2])

['Received full payment for transaction no. 556678'
 'To whom it may concern, I have successfully received payment for the transaction 556678 . I am grateful for your cooperation. Thank you so much and regards.'
 0
 'received full payment for transaction no to whom it may concern i have successfully received payment for the transaction i am grateful for your cooperation thank you so much and regards ']


In [14]:
test.head(20)

Unnamed: 0,Subject,Body,Class,Text
0,Transaction no. 072558 is unresolved.,Sorry to inform that there has been only a par...,3,transaction no is unresolved sorry to inform t...
1,Order for new Cheque book,"Good morning, I want to place an order for an ...",2,order for new cheque book good morning i want ...
2,Required money acquired. Transaction 847047 is...,Hello! This is to inform you that I have recei...,4,required money acquired transaction is in proc...
3,Asking for the details for transaction 746078,I request you to kindly send the status of my ...,5,asking for the details for transaction i reque...
4,Partial payment for transaction 535918,Hello!! Greetings for the day. Status of trans...,3,partial payment for transaction hello greeting...
6,Failure of transaction 608189,This is to notify you that my transaction 6081...,1,failure of transaction this is to notify you t...
7,Send steps to activate online banking,"Hey, I have to transfer funds to a different b...",2,send steps to activate online banking hey i ha...
8,Incomplete transaction 947071,I regret to inform you that I could only pay t...,3,incomplete transaction i regret to inform you ...
9,Failure of transaction 443004,"Hey, I see my transaction with ID 443004 has f...",1,failure of transaction hey i see my transactio...
10,The pending amount for transaction 537615 will...,"Since my transaction 537615 is still pending, ...",3,the pending amount for transaction will reach ...


In [15]:
print(X_train)

5      payment done and transaction settled greetings...
44     concluded the transaction greetings i wanted t...
234    received full payment for transaction no to wh...
290    fulfilled transaction having id glad to let yo...
72     completed transaction no i deeply appreciate y...
304    transaction having id has stopped help this is...
71     failure of transaction i have been your regula...
23     failure of transaction greetings for the day i...
201    urgent transaction ceased this is in response ...
243    transaction stalled and payment not received h...
114    something wrong with my account this is to inf...
380    change address for account no tailwater street...
306    change address for account no add my new addre...
38     want to block account hey i think i lost my at...
292    change address for account no this is to infor...
278    transaction no is unresolved the transaction i...
68     partially paid the required amount for transac...
70     the pending amount for t

In [16]:
print(X_train.shape)

(30,)


In [17]:
# Text processing (split, find token id, get embedidng)
def transform_sentence(text, model):

    """
    Mean embedding vector
    """

    def preprocess_text(raw_text, model=model):

        """ 
        Excluding unknown words and get corresponding token
        """

        raw_text = raw_text.split()

        return list(filter(lambda x: x in model.vocab, raw_text))

    tokens = preprocess_text(text)

    if not tokens:
        return np.zeros(model.vector_size)

    text_vector = np.mean(model[tokens], axis=0)

    return np.array(text_vector)

In [18]:
# Apply this to both the train and the test :

X_train_mean = X_train.apply(lambda x : transform_sentence(x, model))
X_test_mean = X_test.apply(lambda x : transform_sentence(x, model))

X_train_mean = pd.DataFrame(X_train_mean)['Text'].apply(pd.Series)
X_test_mean = pd.DataFrame(X_test_mean)['Text'].apply(pd.Series)

In [19]:
# Use cosine similarity to find closest class

def classify_txt(txt, mean_embedding):

    best_dist = 1
    best_label = -1

    for cl in range(num_classes):

        dist = spatial.distance.cosine(transform_sentence(txt, model), mean_embedding[cl])

        if dist < best_dist :
            best_dist = dist
            best_label = cl+1

    return best_label

In [20]:
# Process text and predict on the test set

def return_score(sample_size, num_classes):

    train, test = gen_sample(sample_size, num_classes)

    X_train = train['Text']
    y_train = train['Class'].values
    X_test = test['Text']
    y_test = test['Class'].values

    X_train_mean = X_train.apply(lambda x : transform_sentence(x, model))
    X_test_mean = X_test.apply(lambda x : transform_sentence(x, model))

    X_train_mean = pd.DataFrame(X_train_mean)['Text'].apply(pd.Series)
    X_test_mean = pd.DataFrame(X_test_mean)['Text'].apply(pd.Series)

    mean_embedding = {}
    for cl in range(num_classes):
        mean_embedding[cl] = np.mean((X_train_mean[y_train == cl + 1]), axis=0)

    y_pred = [classify_txt(t, mean_embedding) for t in test['Text'].values]

    return accuracy_score(y_pred, y_test)

In [22]:
# Now, we will iterate on the number of classes (between 2 and 7) and the number of samples (between 1 andd 50). 
# We will consider that labeling more than 50 training examples per class is too long.

all_accuracy = {6:[],7:[],8:[],9:[],10:[]}

for num_samples in range(1,50):
    for num_cl in range(6, 10):
        all_accuracy[num_cl].append(return_score(num_samples,num_cl))

IndexError: index 6 is out of bounds for axis 0 with size 6

In [None]:
# plotting accuracy for each no. of class, depending on the number of train examples :

plt.figure(figsize=(12,8))
plt.plot(all_accuracy[2], label="2 classes")
plt.plot(all_accuracy[3], label="3 classes")
plt.plot(all_accuracy[4], label="4 classes")
plt.plot(all_accuracy[5], label="5 classes")
plt.plot(all_accuracy[6], label="6 classes")
plt.axvline(7, c='black', alpha=0.5)
plt.title("Accuracy depending on the number of samples and classes")
plt.legend()
plt.show()