# Third Year Project

### Download & Import Libraries

In [None]:
# Download libraries unsupported by colab 
# (Uncomment and run below lines if you use Google Colab)

# %pip install contextualSpellCheck

In [1]:
import re, csv, json, datetime
import tweepy
import nltk, spacy #contextualSpellCheck
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import textdistance
# from textblob import TextBlob, Word   # Spell Correction
# from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

# --------- Machine Learning ---------
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split
# from torch.utils.tensorboard import SummaryWriter
# --------- Machine Learning ---------

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/anqitang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Create development dataset (only run this if you don't already have a dev dataset)

In [41]:
# dataset_path = '/content/drive/MyDrive/Colab Notebooks/year_project/dataset/covid19_tweets.csv'
dataset_path = './dataset/covid19_tweets.csv'

# Read the full dataset from a csv file and draw 500 rows.
# A fixed random_state makes the development set identical on every run.
df = pd.read_csv(dataset_path).sample(n=500, random_state=1)

# Keep only the columns used during development
new_df = df[['date', 'text']].copy()

# Save dataframe for development.
# index=False stops the row index being written as an extra unnamed column,
# which would otherwise reappear as "Unnamed: 0" when the file is read back.
new_df.to_csv('./dataset/dev_dataset.csv', index=False)

### Read Data

In [67]:
# dataset_path = '/content/drive/MyDrive/Colab Notebooks/year_project/dataset/covid19_tweets.csv'
dataset_path = './dataset/dev_dataset.csv'

# Read dataset from a csv file
df = pd.read_csv(dataset_path)

# Get (at most) 500 samples.
#   - min(...) avoids a ValueError when the file holds fewer than 500 rows
#   - random_state keeps the row order, and therefore the later
#     train/validation split, reproducible between runs
df = df.sample(n=min(500, len(df)), random_state=1)

# Retrieve date and tweet content from DataFrame
# pattern_date = r'(\d{4})-(\d{2})-(\d{2})'
# raw_dates = df['date'].apply(lambda x: re.match(pattern_date, x).group(0)).to_numpy()
raw_dates = df['date'].to_numpy()
raw_tweets = df['text'].to_numpy()

### Split data

Split data before pre-processing, to avoid data leakage

In [68]:
# Split row positions (not the data itself) into training and validation sets
index = np.arange(len(df))
i_train, i_val = train_test_split(index, test_size=0.2, random_state=1)

# Select the training rows ...
train_tweets = raw_tweets[i_train]
train_dates = raw_dates[i_train]

# ... and the validation rows
val_tweets = raw_tweets[i_val]
val_dates = raw_dates[i_val]

## Pre-processing

### Spell Correction

#### TextBlob library
TextBlob("sentence ...").correct()

    Time:
        100tweets / 51s
    Performance:
        As tweets are often not grammatically correct,
        this library sometimes makes false corrections.
        For example, "Trump" was changed to "Plump".
    

    


In [69]:
class PreProcessor:
    '''
    Pre-processing tweets:
        1) Clean data
        2) Tokenisation
        3) Spell Correction (TODO - not implemented yet)

    Usage:
        pp = PreProcessor()
        docs = pp.process_tweets(tweets)
    '''

    # spaCy pipeline, loaded once when the class is defined and shared by all instances
    nlp = spacy.load('en_core_web_sm')
    # Regular Expressions for removals
    # re_url = r"(https://|http://|)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    # re_username = r"(^|[^@\w])@(\w{1,15})\b"    # Example: @username
    # patterns = f"{re_url}|{re_username}"

    def _clean_tweet(self, tweet: str) -> str:
        '''
        Normalise the whitespace of a single tweet.

        Args:
            tweet: Raw tweet text.

        Returns:
            The tweet with every run of whitespace collapsed to one space
            and leading/trailing whitespace removed.
        '''
        # Remove: URL, username
        # tweet = re.sub(self.patterns, '', tweet)

        # Remove extra whitespace
        #   1) Replace all kinds of whitespace to exactly one space.
        #   2) Remove leading and trailing whitespaces
        # (raw string: '\s' in a normal string literal is an invalid escape sequence)
        tweet = re.sub(r'\s+', ' ', tweet).strip()

        return tweet

    def _tokenize(self, tweet: str) -> 'spacy.tokens.Doc':
        '''
        Run the spaCy pipeline on a cleaned tweet.

        Args:
            tweet: Cleaned tweet text.

        Returns:
            The spaCy Doc produced by the pipeline (an iterable of tokens).
        '''
        # contextualSpellCheck.add_to_pipe(nlp)   # Add Spell Checker to Spacy pipeline

        # Tokenisation
        doc = self.nlp(tweet)

        # TODO Spell correction
        # doc = nlp(doc._.outcome_spellCheck)

        # TODO Remove stopwords

        return doc

    def _remove_stop_words(self, tokens: list) -> list:
        '''
        Drop stop words from a sequence of tokens.

        Not yet called by process_tweets (see the TODO in _tokenize).

        Args:
            tokens: Iterable of tokens; spaCy tokens expose `is_stop`.

        Returns:
            A list with the tokens whose `is_stop` attribute is not truthy.
            Objects without that attribute (e.g. plain strings) are kept.
        '''
        return [tk for tk in tokens if not getattr(tk, 'is_stop', False)]

    def process_tweets(self, tweets: list) -> np.ndarray:
        '''
        Clean and tokenise every tweet.

        Args:
            tweets: Iterable of raw tweet strings.

        Returns:
            1-D object array with one tokenised Doc per input tweet,
            in the same order as the input.
        '''
        docs = []

        for tweet in tweets:

            # Remove irrelevant and personal data
            cleaned_tweet = self._clean_tweet(tweet)

            docs.append(self._tokenize(cleaned_tweet))

        # Fill a pre-allocated object array element by element.
        # np.asarray(docs, dtype=object) may treat each Doc as a sequence and
        # build a 2-D array of tokens when all Docs have the same length;
        # item assignment always stores the Doc itself.
        result = np.empty(len(docs), dtype=object)
        for i, doc in enumerate(docs):
            result[i] = doc

        return result


In [71]:
pp = PreProcessor()

# Matches the leading "YYYY-MM-DD" part of a timestamp
pattern_date = r'(\d{4})-(\d{2})-(\d{2})'

# Clean and tokenise the training tweets
train_tokens = pp.process_tweets(train_tweets)

# Keep only the date part of every timestamp (i.e. drop the time)
date_only = []
for date in train_dates:
    date_only.append(re.match(pattern_date, date).group(0))
train_dates = np.array(date_only)

In [None]:
        # /--------------- pySpellChecker 1 ---------------\
        # spell = SpellChecker()
        # tokens = [spell.correction(tk) if spell.correction(tk) else tk for tk in tokens]
        # /--------------- pySpellChecker 2 ---------------\
        # misspelled = spell.unknown(tokens)
        # for typo in misspelled:
        #     correction = spell.correction(typo)
        #     if not correction:  
        #         # If correction is None, the word may be a proper noun rather than misspelled
        #         continue
        #     try:
        #         i = tokens.index(typo)
        #         tokens[i] = correction
        #     except:
        #         print(typo)
        #         print(tokens)


        # Remove punctuations
        # tokens = [re.sub(r'[^A-Za-z0-9]+', '', tk) for tk in tokens]
        # Remove empty tokens (Produced when removing punctuations)
        # tokens = [tk for tk in tokens if tk]

['Hey', 'and', '-', 'would', "n't", 'it', 'have', 'made', 'more', 'sense', 'to', 'have', 'the', 'players', 'pay', 'their', 'respects', 'to', 'the', 'A', '…']
<class 'spacy.tokens.doc.Doc'>


## Pytorch Model

### Data Preparation

In [None]:
# TODO Convert preprocessed data into the x, y below
x = None
y = None

torch.manual_seed(13)

# Features and targets as float tensors
x_tensor = torch.as_tensor(x).float()
y_tensor = torch.as_tensor(y).float()

# One dataset holding ALL data points
dataset = TensorDataset(x_tensor, y_tensor)

# Sizes of the train / validation partitions
ratio = .8
n_total = len(dataset)
n_train = int(n_total * ratio)
n_val = n_total - n_train

# Random split into the two partitions
train_data, val_data = random_split(dataset, [n_train, n_val])

# One loader per partition; only the training loader shuffles
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)


### Model Configuration

In [None]:
# Learning rate used by the optimizer
lr = 0.1

# Seed first, so the layer below gets the same initial weights on every run
torch.manual_seed(42)

# Model: a single linear layer (2 inputs -> 1 output), registered as 'linear'
# model = nn.Sequential(nn.Linear(1, 1))
model = nn.Sequential()
model.add_module('linear', nn.Linear(2, 1))

# SGD optimizer over the model's parameters
optimizer = optim.SGD(params=model.parameters(), lr=lr)

# Mean squared error loss
loss_fn = nn.MSELoss(reduction='mean')

### Model Training

In [None]:
class NLPModel(object):
    '''
    Training harness bundling a model, a loss function and an optimizer.

    Typical use:
        sbs = NLPModel(model, loss_fn, optimizer)
        sbs.set_loaders(train_loader, val_loader)
        sbs.train(n_epochs=10)
        predictions = sbs.predict(new_x)

    Attributes:
        model, loss_fn, optimizer: the objects passed to the constructor
            (stored by reference, not copied).
        device: 'cuda' when available, otherwise 'cpu' (changeable via `to`).
        train_loader, val_loader: data loaders set through `set_loaders`.
        writer: TensorBoard SummaryWriter set through `set_tensorboard`, or None.
        losses, val_losses: one entry per trained epoch.
        total_epochs: number of epochs trained so far.
    '''

    def __init__(self, model, loss_fn, optimizer):
        # Store the arguments as attributes for later use
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Let's send the model to the specified device right away
        self.model.to(self.device)

        # These attributes are defined here, but since they are
        # not available at the moment of creation, we keep them None
        self.train_loader = None
        self.val_loader = None
        self.writer = None

        # These attributes are going to be computed internally
        self.losses = []
        self.val_losses = []
        self.total_epochs = 0

        # Creates the train_step function for our model,
        # loss function and optimizer
        # Note: there are NO ARGS there! It makes use of the class
        # attributes directly
        self.train_step = self._make_train_step_fn()
        # Creates the val_step function for our model and loss
        self.val_step = self._make_val_step_fn()

    def to(self, device):
        '''
        Move the model to `device` and remember it for the mini-batches.

        If the move raises a RuntimeError, falls back to 'cuda' when
        available, otherwise 'cpu'.
        '''
        try:
            self.device = device
            self.model.to(self.device)
        except RuntimeError:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print(f"Couldn't send it to {device}, sending it to {self.device} instead.")
            self.model.to(self.device)

    def set_loaders(self, train_loader, val_loader=None):
        '''
        Define which train_loader (and, optionally, val_loader) to use.
        '''
        self.train_loader = train_loader
        self.val_loader = val_loader

    def set_tensorboard(self, name, folder='runs'):
        '''
        Create a SummaryWriter logging to '<folder>/<name>_<timestamp>'.
        '''
        # Imported locally: the notebook's top-level SummaryWriter import is
        # commented out, so the name would otherwise be undefined here.
        from torch.utils.tensorboard import SummaryWriter

        suffix = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        self.writer = SummaryWriter(f'{folder}/{name}_{suffix}')

    def _make_train_step_fn(self):
        '''
        Build the function that performs one training step on a mini-batch.

        The returned function takes (x, y) and returns the loss as a float.
        It refers to self.model, self.loss_fn and self.optimizer directly.
        '''
        def perform_train_step_fn(x, y):
            # Sets model to TRAIN mode
            self.model.train()

            # Step 1 - Computes our model's predicted output - forward pass
            yhat = self.model(x)
            # Step 2 - Computes the loss
            loss = self.loss_fn(yhat, y)
            # Step 3 - Computes gradients
            loss.backward()
            # Step 4 - Updates parameters using gradients and the
            # learning rate, then clears the gradients for the next step
            self.optimizer.step()
            self.optimizer.zero_grad()

            # Returns the loss
            return loss.item()

        # Returns the function that will be called inside the train loop
        return perform_train_step_fn

    def _make_val_step_fn(self):
        '''
        Build the function that performs one validation step on a mini-batch.

        The returned function takes (x, y) and returns the loss as a float.
        '''
        def perform_val_step_fn(x, y):
            # Sets model to EVAL mode
            self.model.eval()

            # Step 1 - Computes our model's predicted output - forward pass
            yhat = self.model(x)
            # Step 2 - Computes the loss
            loss = self.loss_fn(yhat, y)
            # There is no need to compute Steps 3 and 4,
            # since we don't update parameters during evaluation
            return loss.item()

        # Without this return, self.val_step would be None
        return perform_val_step_fn

    def _mini_batch(self, validation=False):
        '''
        Run one pass over a loader and return the mean mini-batch loss.

        Args:
            validation: use val_loader / val_step when True,
                train_loader / train_step otherwise.

        Returns:
            The mean loss as a float, or None when the chosen loader is None.
        '''
        if validation:
            data_loader = self.val_loader
            step_fn = self.val_step
        else:
            data_loader = self.train_loader
            step_fn = self.train_step

        if data_loader is None:
            return None

        mini_batch_losses = []
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(self.device)
            y_batch = y_batch.to(self.device)

            mini_batch_loss = step_fn(x_batch, y_batch)
            mini_batch_losses.append(mini_batch_loss)

        # Plain Python float, so the loss history holds only built-in numbers
        loss = float(np.mean(mini_batch_losses))

        return loss

    def set_seed(self, seed=42):
        '''
        Seed torch and numpy and switch cuDNN to deterministic settings.
        '''
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        np.random.seed(seed)

    def train(self, n_epochs, seed=42):
        '''
        Train for `n_epochs` epochs, recording training and validation loss.

        One value is appended to `losses` and to `val_losses` per epoch
        (the validation entry is None when no val_loader is set). When a
        SummaryWriter is set, both losses are logged under the tag "loss".
        '''
        # To ensure reproducibility of the training process
        self.set_seed(seed)

        for epoch in range(n_epochs):
            # Keeps track of the numbers of epochs
            # by updating the corresponding attribute
            self.total_epochs += 1

            # inner loop
            # Performs training using mini-batches
            loss = self._mini_batch(validation=False)
            self.losses.append(loss)

            # VALIDATION
            # no gradients in validation!
            with torch.no_grad():
                # Performs evaluation using mini-batches
                val_loss = self._mini_batch(validation=True)
                self.val_losses.append(val_loss)

            # If a SummaryWriter has been set...
            if self.writer:
                scalars = {'training': loss}
                if val_loss is not None:
                    scalars.update({'validation': val_loss})
                # Records both losses for each epoch under the main tag "loss"
                self.writer.add_scalars(main_tag='loss',
                                        tag_scalar_dict=scalars,
                                        global_step=epoch)

        if self.writer:
            # Flushes the writer
            self.writer.flush()

    def save_checkpoint(self, filename):
        '''
        Save epoch count, model/optimizer state and loss histories to `filename`.
        '''
        checkpoint = {
            'epoch': self.total_epochs,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': self.losses,
            'val_loss': self.val_losses
        }
        torch.save(checkpoint, filename)

    def load_checkpoint(self, filename):
        '''
        Restore the state written by `save_checkpoint` and set TRAIN mode.
        '''
        # map_location remaps stored tensors onto the current device, so a
        # checkpoint saved on another device (e.g. GPU) can be loaded here
        checkpoint = torch.load(filename, map_location=self.device)
        # Restore state for model and optimizer
        self.model.load_state_dict(
            checkpoint['model_state_dict']
        )
        self.optimizer.load_state_dict(
            checkpoint['optimizer_state_dict']
        )
        self.total_epochs = checkpoint['epoch']
        self.losses = checkpoint['loss']
        self.val_losses = checkpoint['val_loss']
        self.model.train() # always use TRAIN for resuming training

    def predict(self, x):
        '''
        Predict for `x` (array-like) and return the result as a Numpy array.

        The model is put in EVAL mode for the forward pass and switched
        back to TRAIN mode afterwards.
        '''
        # Set it to evaluation mode for predictions
        self.model.eval()
        # Take a Numpy input and make it a float tensor
        x_tensor = torch.as_tensor(x).float()
        # Send input to device and use model for prediction;
        # no gradients are needed for inference
        with torch.no_grad():
            y_hat_tensor = self.model(x_tensor.to(self.device))
        # Set it back to train mode
        self.model.train()
        # Detach it, bring it to CPU and back to Numpy
        return y_hat_tensor.detach().cpu().numpy()

    def plot_losses(self):
        '''
        Plot training (and, if a val_loader is set, validation) loss on a
        log-scaled y axis and return the matplotlib figure.
        '''
        fig = plt.figure(figsize=(10, 4))
        plt.plot(self.losses, label='Training Loss', c='b')
        if self.val_loader:
            plt.plot(self.val_losses, label='Validation Loss', c='r')
        plt.yscale('log')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.tight_layout()
        return fig

    def add_graph(self):
        '''
        Send the model graph to TensorBoard, using one training mini-batch
        as example input. Does nothing unless both a train_loader and a
        writer are set.
        '''
        if self.train_loader and self.writer:
            # Fetches a single mini-batch so we can use add_graph
            x_dummy, y_dummy = next(iter(self.train_loader))
            self.writer.add_graph(self.model, x_dummy.to(self.device))


In [None]:
# Number of passes over the training data
n_epochs = 200

# Wrap model, loss and optimizer in the training harness
sbs = NLPModel(model, loss_fn, optimizer)

# Attach the data loaders and a TensorBoard writer, then train
sbs.set_loaders(train_loader=train_loader, val_loader=val_loader)
sbs.set_tensorboard('classy')
sbs.train(n_epochs)


In [None]:
# One important thing to notice is that the model attribute of the sbs object
# is the same object as the model variable created in the model configuration.
# It is not a copy!
# (Using below code can easily verify this)
# print(sbs.model == model)


# Make prediction for new, never seen before data points
# new_data = None
# predictions = sbs.predict(new_data)

## Sentiment Analysis