In [None]:
# Assignment: Classification

Classification refers to categorizing the given data into classes. For example,
- Given an image of hand-written character, identifying the character (multi-class classification)
- Given an image, annotating it with all the objects present in the image (multi-label classification)
- Classifying an email as spam or non-spam (binary classification)
- Classifying a tumor as benign or malignant and so on

In this assignment, we will be building a classifier to classify emails as spam or non-spam. We will be using the Kaggle dataset [Spam or Not Spam Dataset](https://www.kaggle.com/datasets/ozlerhakan/spam-or-not-spam-dataset?resource=download) for this task. 

**Note**: You cannot load any libraries other than the mentioned ones.

In [None]:
### Data pre-processing
The first step in every machine learning algorithm is to process the raw data into a meaningful representation. We will be using the [Bag-of-Words](https://towardsdatascience.com/a-simple-explanation-of-the-bag-of-words-model-b88fc4f4971) representation to process the text. It consists of the following steps:

- Process emails line-by-line to extract all the words.
- Replace extracted words by their stem (root) word. This is known as stemming and lemmatization.
- Remove stop words like and, or, is, am, and so on.
- Assign a unique index to each word. This forms the vocabulary.
- Represent each email as a binary vector of length equal to the size of the vocabulary such that the $i^{th}$ element of the vector is 1 iff the $i^{th}$ word is present in the email.

Here we provide you with the function signature along with the expected functionality. You are expected to complete them accordingly. 

In [None]:
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# takes an email as an argument
# read email line-by-line and extract all the words
# return list of extracted words
def read_email(email):
    """Split an email's text into its individual words.

    Splitting happens on any run of whitespace (spaces, tabs, newlines), so
    multi-line emails are handled and consecutive separators never produce
    empty-string "words".

    Args:
        email: The raw email text as a single string.

    Returns:
        A list of the words in ``email`` in order of appearance. An empty or
        whitespace-only email yields an empty list.
    """
    # str.split() without an argument splits on arbitrary whitespace and drops
    # empty tokens, whereas split(" ") only recognises a single space character.
    return email.split()

In [None]:
# takes a list of words as an argument
# replace each word by their stem word
# return list of stem words
def stemming(list_of_words):
    """Map every word to its stem using a Porter stemmer.

    Args:
        list_of_words: A list of word strings.

    Returns:
        A new list, of the same length and order as the input, holding the
        stem produced by ``PorterStemmer.stem`` for each word.
    """
    stemmer = PorterStemmer()
    # Apply the stemmer's bound method to each word in turn.
    return list(map(stemmer.stem, list_of_words))

In [None]:
# takes a list of stem-words as an argument
# remove stop words
# return list of stem words after removing stop words
def remove_stop_words(list_of_stem_words):
    """Filter English stop words out of a list of words.

    The comparison is case-insensitive: each word is lower-cased before being
    looked up in NLTK's English stop-word list. Surviving words are returned
    unchanged (original casing) and in their original order.

    Args:
        list_of_stem_words: A list of word strings.

    Returns:
        A new list containing only the words that are not stop words.
    """
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for token in list_of_stem_words:
        if token.lower() in english_stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
# takes a list of stem-word lists (one list per email) as an argument
# add new words to the vocabulary and assign a unique index to them
# returns new vocabulary
def build_vocabulary(list_of_stem_words):
    """Collect the distinct words of all emails into an indexed vocabulary.

    Args:
        list_of_stem_words: An iterable of word lists, one list per email.

    Returns:
        A list of the unique, non-empty words in sorted order. A word's
        position in the list is its index in the vocabulary. Empty input
        yields an empty list.
    """
    vocabulary = set()
    for word_list in list_of_stem_words:
        vocabulary.update(word_list)

    # An empty string is not a word; remove it explicitly instead of popping
    # whatever element happens to come first in the set's iteration order.
    vocabulary.discard("")

    # Sorting makes each word's index deterministic across runs, which set
    # iteration order does not guarantee.
    return sorted(vocabulary)

In [None]:
# takes a list of stem-word lists (one list per email) and the vocabulary as arguments
# returns the bow representation of each email
def get_bow(list_of_stem_words, vocab):
    """Build the binary bag-of-words representation of every email.

    Args:
        list_of_stem_words: An iterable of word lists, one list per email.
        vocab: The vocabulary, an iterable of words.

    Returns:
        A list with one dict per email, in input order. Each dict maps every
        vocabulary word to 1 if the word occurs in that email and to 0
        otherwise; keys appear in vocabulary order.
    """
    email_bow = []

    for word_list in list_of_stem_words:
        # A set gives constant-time membership tests, so each email costs
        # O(len(vocab)) instead of O(len(vocab) * len(email)).
        words_present = set(word_list)
        email_bow.append({word: int(word in words_present) for word in vocab})

    return email_bow

In [None]:
# read the entire dataset
# convert emails to bow and maintain their labels
# call function get_bow()
def read_data(path='spam_or_not_spam.csv'):
    """Load the labelled emails and convert them to bag-of-words form.

    The file is read as a header line followed by one ``email,label`` record
    per line. Each email is tokenised, stemmed and stripped of stop words, the
    vocabulary is built from all emails, and every email is then encoded
    against that vocabulary.

    Args:
        path: Location of the CSV file. Defaults to 'spam_or_not_spam.csv'
            in the current working directory.

    Returns:
        A tuple ``(data, vocabulary)``. ``data`` is a list of ``[bow, label]``
        pairs in file order, where ``bow`` is the email's entry from
        ``get_bow`` and ``label`` is an int. ``vocabulary`` is the value
        returned by ``build_vocabulary``.

    Raises:
        ValueError: If a record's label field is not an integer.
    """
    emails = []
    labels = []
    with open(path, 'r', encoding='utf8') as f:
        next(f, None)  # skip the header row
        for line in f:
            # Split on the LAST comma so that commas inside the email text are
            # not mistaken for the column separator.
            email, separator, label = line.rstrip('\r\n').rpartition(',')
            if not separator:
                # Blank line or record without a label column: nothing to load.
                continue
            emails.append(email)
            labels.append(int(label))

    list_of_stem_words = [
        remove_stop_words(stemming(read_email(email))) for email in emails
    ]

    # The vocabulary is used exactly as returned by build_vocabulary; removing
    # an entry here would silently drop a word from every email's encoding.
    vocabulary = build_vocabulary(list_of_stem_words)
    bows = get_bow(list_of_stem_words, vocabulary)

    data = [[bow, label] for bow, label in zip(bows, labels)]

    return data, vocabulary