In [None]:
import os

# Fetch the compressed dataset once; later runs reuse the local copy.
download_name = "sms_spam_with_splits.csv.bz2"
if not os.path.exists(download_name):
    import requests

    url = f"https://raw.githubusercontent.com/bzitko/nlp_repo/main/assignments/a02/{download_name}"
    response = requests.get(url, timeout=60)
    try:
        # Fail loudly on 4xx/5xx. Otherwise the error page would be saved as the
        # archive, and the exists() guard above would keep it forever.
        response.raise_for_status()
        payload = response.content
    finally:
        # Release the connection even when the status check raises.
        response.close()
    with open(download_name, "wb") as fp:
        fp.write(payload)

# Decompress the archive once into the plain CSV.
name = "sms_spam_with_splits.csv"
if not os.path.exists(name):
    import bz2

    with open(download_name, "rb") as bzf:
        # Decompress BEFORE creating the output file: if the archive is corrupt,
        # no empty CSV is left behind to satisfy the exists() guard next time.
        decompressed = bz2.decompress(bzf.read())
    with open(name, "wb") as fp:
        fp.write(decompressed)

# SMS Classify

In [None]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

In [None]:
# Single place for every tunable setting used by the notebook.
args = Namespace(
    # --- data and paths ---
    frequency_cutoff=25,
    sms_spam_csv="sms_spam_with_splits.csv",

    # --- model hyperparameters: none ---

    # --- training hyperparameters ---
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=1e-3,
    num_epochs=100,
    seed=1337,

    # --- runtime options ---
    catch_keyboard_interrupt=True,
    cuda=True,
)

### Vocabulary
👍 Create a `Vocabulary` class with the following methods:
* `__init__()` initializes the vocabulary; the `add_unk` argument (default `True`) enables or disables the unknown token `<UNK>`
* `add_token()` adds a token to the vocabulary
* `add_many()` adds multiple tokens to the vocabulary
* `lookup_token()` returns the index of a given token
* `lookup_index()` returns the token at a given index
* `__len__()` returns the size of the vocabulary

In [None]:
class Vocabulary(object):
    """Two-way mapping between tokens and consecutive integer indices.

    Indices are handed out in insertion order, starting at 0. When the
    unknown token is enabled it is inserted first and therefore gets index 0;
    every token that was never added then maps to that index.
    """

    def __init__(self, add_unk=True, unk_token="<UNK>"):
        """Create an empty vocabulary.

        Args:
            add_unk (bool): whether to reserve an entry for unknown tokens.
            unk_token (str): the token that stands in for unknown tokens.
        """
        self._token_to_idx = {}
        self._idx_to_token = {}
        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 means "no unknown token"; lookup_token() then raises for unseen tokens.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Add `token` if it is new and return its index.

        Adding a token that is already present leaves the vocabulary
        unchanged and returns the existing index.
        """
        index = self._token_to_idx.get(token)
        if index is None:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """Add every token of the iterable `tokens`; return the list of their indices."""
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Return the index of `token`.

        Unseen tokens map to the unknown-token index when it is enabled.

        Raises:
            KeyError: if `token` is unseen and the unknown token is disabled.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at `index`.

        Raises:
            KeyError: if no token has that index.
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Return the number of tokens, including the unknown token if enabled."""
        return len(self._token_to_idx)
    
# Small usage example; add_unk is left at its default (True).
vocab = Vocabulary()

# "john" is added twice, "ann" once.
vocab.add_token("john")
vocab.add_token("john")
vocab.add_token("ann")

# Cell output: the index looked up for "john" and the token looked up at index 2.
(vocab.lookup_token("john"), vocab.lookup_index(2))

### Vectorizer

👍 Create an `SMSVectorizer` class with the following methods:
* `__init__(self, sms_vocab, cls_vocab)` receives the SMS vocabulary (tokens of the SMS texts) and the class vocabulary (ham or spam)
* `vectorize(self, sms)` returns the one-hot representation of a given SMS text.
* class method `from_dataframe(cls, sms_df, cutoff=25)` returns an `SMSVectorizer` instance built from the SMS dataframe, keeping only tokens whose frequency is larger than `cutoff`.

In [None]:
class SMSVectorizer(object):
    """Converts SMS text to a collapsed one-hot vector using two vocabularies.

    Attributes:
        sms_vocab: vocabulary of SMS tokens (needs `lookup_token` and `__len__`).
        cls_vocab: vocabulary of class labels.
    """

    def __init__(self, sms_vocab, cls_vocab):
        """Store the token vocabulary and the class-label vocabulary."""
        self.sms_vocab = sms_vocab
        self.cls_vocab = cls_vocab

    @staticmethod
    def _tokenize(sms):
        """Split on whitespace and drop tokens made up only of punctuation."""
        return [token for token in sms.split()
                if not all(char in string.punctuation for char in token)]

    def vectorize(self, sms):
        """Return the collapsed one-hot encoding of `sms`.

        Args:
            sms (str): whitespace-separated SMS text.

        Returns:
            np.ndarray: float32 vector of length `len(self.sms_vocab)` with 1
            at the index of every token that occurs in `sms` and 0 elsewhere.
        """
        one_hot = np.zeros(len(self.sms_vocab), dtype=np.float32)
        for token in self._tokenize(sms):
            one_hot[self.sms_vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, sms_df, cutoff=25, sms_column="sms", cls_column="cls"):
        """Build a vectorizer from a dataframe of labelled SMS messages.

        Args:
            sms_df (pandas.DataFrame): the messages to build vocabularies from.
            cutoff (int): a token is kept only if it occurs more than
                `cutoff` times.
            sms_column (str): name of the text column.
            cls_column (str): name of the class-label column.

        Returns:
            SMSVectorizer: vectorizer whose vocabularies come from `sms_df`.
        """
        # NOTE(review): the CSV header is not visible in this notebook, so the
        # "sms" / "cls" defaults are assumptions -- TODO confirm against the file.
        sms_vocab = Vocabulary(add_unk=True)
        cls_vocab = Vocabulary(add_unk=False)

        # Sorted so that label indices do not depend on row order.
        for label in sorted(set(sms_df[cls_column])):
            cls_vocab.add_token(label)

        token_counts = Counter()
        for sms in sms_df[sms_column]:
            token_counts.update(cls._tokenize(sms))

        for token, count in token_counts.items():
            if count > cutoff:
                sms_vocab.add_token(token)

        return cls(sms_vocab, cls_vocab)

In [None]:
# Toy token vocabulary filled from a whitespace-split sentence.
sms_vocab = Vocabulary()
sms_vocab.add_many("john has been there . but he is not john .".split())

# Class-label vocabulary, created without the unknown token.
cls_vocab = Vocabulary(add_unk=False)
cls_vocab.add_many(["spam", "ham"])

# Cell output: the vector produced for a short example sentence.
vectorizer = SMSVectorizer(sms_vocab=sms_vocab, cls_vocab=cls_vocab)
vectorizer.vectorize("but john has not been he .")

### Read Dataset

👍 Create an `SMSDataset` class which inherits from `torch.utils.data.Dataset` and has the following methods:
* `__init__(self, sms_df, vectorizer)` receives the SMS dataframe and a vectorizer and stores them as attributes. The SMS dataframe is split into 3 dataframes according to the split value (train, val or test), and those dataframes are also stored as attributes.
* `set_split(self, split)` selects the current split dataframe
* `__len__(self)` returns the length of the current split dataframe
* `__getitem__(self, index)` returns the vectorized SMS text and the class for the datapoint at the given index
* `get_num_batches(self, batch_size)` returns the number of batches given the length of the current split dataframe and the batch size.
* class method `load_dataset_and_make_vectorizer(cls, sms_csv)` loads the `sms_csv` file and makes a vectorizer from the **train** split, then returns an instance of `SMSDataset` built from those two.


In [None]:
class SMSDataset(Dataset):
    """Dataset over an SMS dataframe that is pre-partitioned into train/val/test.

    One split is active at a time (train after construction); `__len__` and
    `__getitem__` operate on the active split only.
    """

    # NOTE(review): the CSV header is not visible in this notebook, so these
    # column names are assumptions -- TODO confirm against the file.
    SPLIT_COLUMN = "split"
    SMS_COLUMN = "sms"
    CLS_COLUMN = "cls"

    def __init__(self, sms_df, vectorizer):
        """Store the dataframe and vectorizer and carve out the three splits.

        Args:
            sms_df (pandas.DataFrame): all messages, with a split column whose
                values are "train", "val" or "test".
            vectorizer: object with a `vectorize(sms)` method and a `cls_vocab`
                attribute that has a `lookup_token(label)` method.
        """
        self.sms_df = sms_df
        self._vectorizer = vectorizer

        split_values = sms_df[self.SPLIT_COLUMN]
        self.train_df = sms_df[split_values == "train"]
        self.val_df = sms_df[split_values == "val"]
        self.test_df = sms_df[split_values == "test"]

        self._split_frames = {
            "train": self.train_df,
            "val": self.val_df,
            "test": self.test_df,
        }
        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, sms_csv, cutoff=25):
        """Read `sms_csv` and build the dataset with a vectorizer fitted on train.

        Args:
            sms_csv (str): path of the CSV file.
            cutoff (int): frequency cutoff forwarded to
                `SMSVectorizer.from_dataframe`.

        Returns:
            SMSDataset: dataset over the whole file; the vectorizer is built
            from the train rows only.
        """
        sms_df = pd.read_csv(sms_csv)
        train_df = sms_df[sms_df[cls.SPLIT_COLUMN] == "train"]
        return cls(sms_df, SMSVectorizer.from_dataframe(train_df, cutoff))

    def get_vectorizer(self):
        """Return the vectorizer given at construction."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Make `split` ("train", "val" or "test") the active split.

        Raises:
            ValueError: if `split` is not one of the three known names.
        """
        if split not in self._split_frames:
            raise ValueError(
                "unknown split %r; expected one of %s"
                % (split, sorted(self._split_frames))
            )
        self._target_split = split
        self._target_df = self._split_frames[split]

    def __len__(self):
        """Return the number of rows in the active split."""
        return len(self._target_df)

    def __getitem__(self, index):
        """Return `(vectorized_sms, class_index)` for row `index` of the active split.

        `index` is positional within the active split, not a dataframe label.
        """
        row = self._target_df.iloc[index]
        sms_vector = self._vectorizer.vectorize(row[self.SMS_COLUMN])
        cls_index = self._vectorizer.cls_vocab.lookup_token(row[self.CLS_COLUMN])
        return sms_vector, cls_index

    def get_num_batches(self, batch_size):
        """Return how many full batches of `batch_size` fit in the active split."""
        return len(self) // batch_size


👍 Create a function `generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu")` which uses `torch.utils.data.DataLoader` to yield batches.

In [None]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Yield batches of `dataset` with every tensor moved to `device`.

    Wraps `torch.utils.data.DataLoader`, so a batch has whatever structure
    the loader's default collation produces for the dataset's items.

    Args:
        dataset: map-style dataset (supports `__getitem__` and `__len__`).
        batch_size (int): number of items per batch.
        shuffle (bool): whether to reshuffle the items on every pass.
        drop_last (bool): whether to drop a final, smaller batch.
        device: target device for the tensors, e.g. "cpu" or "cuda".

    Yields:
        The collated batch. Dict batches stay dicts; tuple/list batches are
        yielded as lists; tensors inside are moved to `device`; non-tensor
        leaves are passed through unchanged.
    """
    def _to_device(item):
        # Walk the collated structure and move every tensor leaf.
        if isinstance(item, torch.Tensor):
            return item.to(device)
        if isinstance(item, dict):
            return {key: _to_device(value) for key, value in item.items()}
        if isinstance(item, (list, tuple)):
            return [_to_device(value) for value in item]
        return item

    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)
    for batch in dataloader:
        yield _to_device(batch)

## Load dataset and dataloader

In [None]:
# Build the dataset from the CSV named in args, then wrap it in a DataLoader
# that uses the loader's default settings.
dataset = SMSDataset.load_dataset_and_make_vectorizer(sms_csv=args.sms_spam_csv)
dataloader = DataLoader(dataset)

### Plot data

👍 Reduce dimensionality of data to 2D and plot spam and ham datapoints into separate plots.

In [None]:
from sklearn import decomposition
import matplotlib.pyplot as plt

### Model Classifier

👍 Create an `SMSClassifier` class as a perceptron with a variable number of features. Its methods are:
* `__init__(self, num_features)` initializes the perceptron with `num_features` inputs
* `forward(self, x_in, apply_sigmoid=False)` performs the forward step for the given input `x_in` and optionally applies a sigmoid to the output.

In [None]:
class SMSClassifier(nn.Module):
    """Perceptron: a single linear layer that maps the features to one output."""

    def __init__(self, num_features):
        """Create the perceptron.

        Args:
            num_features (int): size of each input feature vector.
        """
        # nn.Module must be initialised before any sub-module is assigned.
        super().__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        """Run the forward pass.

        Args:
            x_in (torch.Tensor): input of shape (batch, num_features).
            apply_sigmoid (bool): whether to apply a sigmoid to the output.
                Leave it False when the loss function expects raw logits.

        Returns:
            torch.Tensor: tensor of shape (batch,).
        """
        # Squeeze only the last axis so a batch of one keeps its batch dimension.
        y_out = self.fc1(x_in).squeeze(-1)
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out


### Initialization

👍 Initialize by following steps:
* classifier
* args.device - setting to cuda if args.cuda is true and cuda is available, to cpu otherwise.
* switch classifier to args.device

# Training

👍 Set:
* the loss function to Binary Cross Entropy
* the optimizer to Adam with the learning rate given by `args.learning_rate`
* a learning rate scheduler that reduces the learning rate with
    * a factor of 0.1, i.e. each reduction makes the learning rate 10 times smaller than its current value
    * the number of epochs with no improvement (patience) set to 1
    * minimum mode
    
For each epoch
* generate train batches and train (set classifier to train)
* generate val batches and validate (set classifier to eval)

In each batch print accuracy and loss.

# Test

👍 Generate test batches and for each batch print accuracy

# Interpretability

👍 Use the classifier weights to determine the 20 most influential words for each class