# CSE 325/425 NLP
### Programming Project 1

You are asked to implement basic text pre-procssing and then define and train the Glove model. I removed parts of my implementation and you will need to complete them.

Your codes are evaluated based on

*   Correct and reasonable text preprocessing.
*   Convergence of the model training.
*   Speeding up of the word indexing and model training.



In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')
# the following working directory should contain small.csv and glove.6B.300d.txt
os.chdir('/content/drive/My Drive/Teaching/teaching at Lehigh/2021_sp_nlp/Project 1/data/')

ModuleNotFoundError: No module named 'google.colab'

## Text Preprocessing and Dataset Construction

### Define the WordIndexer class to
*   hold the mapping from words to their indices and the indices to words.
*   map from a list of sentences to a list of integers so that words are mapped to their indices, in the same order as the original words (except some words replaced).

### Inherit from the `torch.utils.data.Dataset` class and create the AmazonReviewGloveDataset class to


*   load the Amazon reviews in the csv format. Tokenize the review texts into sentences (a review can contain more than one sentence).
*   use the WordIndexer class to obtain the indices of the words in the sentences.
*   compute the X (word co-occurrence) matrix as the Glove paper indicates.

We provide the function to read the pretrained word vectors from text files.

In [None]:
import re
from collections import Counter
import pickle

from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

class WordIndexer:
    """Transform a dataset of text to a list of index of words."""

    def __init__(self, min_word_occurences=10, oov_word="OOV"):
        """ min_word_occurrences: integer, the minimum frequency of the word to keep.
            oov_word: string, a special string for out-of-vocabulary words.
        """
        self.oov_word = oov_word
        self.min_word_occurences = min_word_occurences
        # word to integer index mapping
        self.word_to_index = {oov_word: 0}
        # the inverse of the above mapping
        self.index_to_word = [oov_word]
        # this is for storing the word frequencies for removing infrequent words
        self.word_occurrences = {}
        # regular expression for retaining meaningful English words
        self.re_words = re.compile(r"\b[a-zA-Z]{2,}\b")

    def get_word_index(self, word, add_new_word = True):
        """ Find the index of a word.
                
            word: string, the query word.
            add_new_word: if true and the word has no entry, assign a new integer index to word.
                            if false, return the index of the oov_word
        """
        ### Your codes go here (10 points) ###

    @property
    def n_words(self):
        """ return: the vocabulary size
        """
        return len(self.word_to_index)

    def fit_transform(self, texts):
        """ texts: list of sentences, each of which is a string
            
            Split each sentence into a list of words.
            Then filter out the infrequent words.
            Other text preprocessing, such as
                lower-casing,
                stop-word removal, and
                advance word tokenization
                are possible here.
            Lastly setup the word-to-index and index-to-word dictionaries.
            
            return: a list of lists of indices of words in each sentence.
                    For example: [[1,2,3], [4,5,6]] where,
                        [1,2,3] are the indices of words in the first sentence
                        [4,5,6] are the indices of words in the second sentence
                    
        """
        
        # Step 1: Obtain list of lists of words. Lower-casing and tokenization happen here.
        ### Your codes go here (10 points) ###


        # Step 2: Build a dictionary using the Counter class
        # keep the unique words and their counts
        # filter out the infrequent ones using the threshold self.min_word_occurences.
        # the results is a vocabulary in self.word_to_index and self.index_to_word.
        ### Your codes go here (10 points) ###


        # save the word and their counts to a file.
        with open('./train_word_counts.txt', 'w') as out_f:
            a = sorted([(word, count) for word, count in word_occurrences.items()],
                   key = lambda x:x[1], reverse=True)
            for word, count in a:
                out_f.write('{}:{}\n'.format(word, count))

        # Step 3: build and return the corpus in index representation
        # using the vocabulary built in the last step.
        # Be careful about words that are not in the vocabulary.
        ### Your codes go here (10 points) ###
    
class AmazonReviewGloveDataset(Dataset):
    def __init__(self, path, right_window = 4, min_word_occurences = 10):
        """ Load the reviews from a csv file. One row is one review.
                
            path: path to the csv file containing the reviews and their ratings
            right_window: integer, how large the window is to get context words.
            min_word_occurrences: integer, the minimum frequency of the word to keep.

            No return value
        """
        self.right_window = right_window
        
        # Step 1: tokenize the first field of each row in the csv file into sentences
        #         (e.g. using nltk.tokenize.sent_tokenize).
        #           Use pandas.read_csv to load the given training csv file.
        df = pd.read_csv(path)
        texts = []  # each element of texts is a single sentence.
        ### Your codes go here (10 points) ###
        

        print ('{} reviews loaded. {} sentences.'.format(df.shape[0], len(texts)))
        
        
        # Step 2: pass the list of all sentences from step 1 (texts) to WordIndexer.
        # Use its fit_transform function to turn list of sentences into list of lists of word indices in the sentences.
        # Keep the word ordering.
        print ('Indexing the corpus...')
        self.indexer = WordIndexer(min_word_occurences=min_word_occurences)
        corpus = self.indexer.fit_transform(texts, use_existing_indexer = False)
        print ('Done indexing the corpus.')
        
        
        # Step 3: go through the results (corpus) from step 2 and gather (center, context) in comatrix,
        # which is a collections.Counter object.
        # In the Counter, keys are (center, context) pairs
        # values are the number of their co-occurrence as defined in the Glove paper.
        print ('Constructing the co-occurrence matrix...')
        comatrix = Counter()
        ### Your codes go here (10 points) ###


        # save the comatrix to file                    
        with open('./comatrix.pkl', 'wb') as out_f:
            pickle.dump(comatrix, out_f)

        # Step 4: flatten the co-occurrence matrix and store the center, context, and X_ij
        # in three lists: self.left (center word), self.right (context word), self.n_occurrences (X_ij)
        self.left, self.right, self.n_occurrences = None, None, None
        ### Your codes go here (10 points) ###
        
    def __getitem__(self, index):
        return self.left[index], self.right[index], self.n_occurrences[index]
    
    def __len__(self):
        return len(self.left)
    
def load_pretrained_wv(path):
    """
        Load the pretrained word vectors downloaded from Stanford NLP.
    """
    wv = {}
    with open(path, 'r') as f:
        for line in f.readlines():
            items = line.split(' ')
            wv[items[0]] = torch.DoubleTensor([float(a) for a in items[1:]])
    return wv

## Define the Glove model
The parameters include

*   Vectors of words when used as center and context words

The parameters are defined for you already and please don't change the variable names.

There is an option to pass in pre-trained word vectors to replace random initialization of the word vectors in this model.

You have to complete the forward function to compute the predictions of log X_ij.



In [None]:
from torch import nn
import torch.functional as F

class GloveModel(nn.Module):
    def __init__(self, word_indexer, wv = None, word_dims = 300, BASE_STD = 0.01, random_state = 0):
        """ Specify and initialize the parameters of the Glove network.
        """
        super(GloveModel, self).__init__()
        num_words = word_indexer.n_words
        
        torch.manual_seed(random_state)
        
        # initialize left and right word vectors
        self.L_vecs = (torch.randn((num_words, word_dims))  * BASE_STD)
        self.R_vecs = (torch.randn((num_words, word_dims))  * BASE_STD)
       
        if wv is not None:
            num_replaced = 0
            for i in range(num_words):
                word = word_indexer.index_to_word[i]
                if word in wv:
                    num_replaced += 1
                    self.L_vecs[i] = wv[word]
                    self.R_vecs[i] = wv[word]
            print (f'Replaced {float(num_replaced) / num_words}')
            
        self.L_vecs.requires_grad_()
        self.R_vecs.requires_grad_()
        
        # gather the trainable parameters
        self.parameters = [self.L_vecs, self.R_vecs]
        
    def forward(self, left_indices, right_indices):
        """ Implement w_i^t w_j (the left-hand-side of Eq. (16) in the Glove paper)
        
            left_indices: torch.Tensor, a batch of center words
            right_indices: torch.Tensor, a batch of context words, of the same shape of left_indices.
            
            left_indices[i] and right_indices[i] is the i-th pair in the training data.
            
            return: torch.Tensor of the same shape of left_indices
        """
        ### Your codes go here (10 points) ###

## Model training, validating, and saving

### First define some constants




In [None]:
import torch
from tqdm import tqdm

# this will automatically place all tensor on GPU with type Double.
# if you are not running on GPU, change this line to
# torch.set_default_tensor_type('torch.DoubleTensor')
torch.set_default_tensor_type('torch.cuda.DoubleTensor')

# set up a couple of parameters and hyper-parameters

# number of epoches to train the model
NUM_EPOCH = 25
# size of mini-batches
BATCH_SIZE = 512

# dimension of word vectors. The integer should be the same as the dimension of
# pretrained word vectors.
NUM_DIMS = 300

# how to many words to the right to pair with the center word
WINDOW_SIZE = 10

# two hyper-parameters in Eq. (9) of the paper
x_max = 100
alpha = 0.75

# input file containing Amazon review texts.
train_path = './small.csv'

# where your model is saved.
save_path = './glove_model_{}.pt'

# optional word vectors pretrained
pretrained_wv = './glove.6B.{}d.txt'.format(NUM_DIMS)
print (pretrained_wv)

In [None]:
# load pretrained word vectors
wv = load_pretrained_wv(pretrained_wv)

In [None]:
print (wv['good'])

### Then define the training, validation, and test data.
*   Use the AmazonReviewGloveDataset class to read train dataset.
*   Define DataLoader wrapping around the Dataset objects



In [None]:
# load text data and turn them into a DataLoader object.
train_dataset = AmazonReviewGloveDataset(train_path, right_window = WINDOW_SIZE)
train_iter = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)

### Third, start training.

*   You're required to use GPU to train the network, since GPU are ubiquitous (colab or SandBox).
*   Complete the function train_and_validate.



In [None]:
# decide whether to use cpu or gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# initialize the Glove model
model = GloveModel(train_dataset.indexer, wv, word_dims = NUM_DIMS)

# make sure you use weight_decay to activate the L2 regularization
optimizer = torch.optim.Adam(model.parameters, weight_decay=1e-8)

def train_and_validate(train_iter):
    best_loss = -1
    best_epoch = -1
    to_save = {}
    
    for epoch in range(NUM_EPOCH):
        model.train()
        epoch_loss = 0
        num_batches = len(train_iter)
        for l, r, n_lr in train_iter:
            optimizer.zero_grad()
            
            # Implement the loss function in Eq. (16) of the paper, in three steps.
            # Step 1. find the prediction of log(X_ij) using the model 
            ### Your codes go here (3 points) ###


            # Step 2. compute the weights f(X_ij). See Eq. (9) of the Glove paper.
            ### Your codes go here (3 points) ###


            # Step 3. compute the loss in Eq. (16) using the predictions and the weights
            ### Your codes go here (4 points) ###


            # tracking the averaged loss
            epoch_loss += loss.item()
            
            # gradient descent, don't change the following two lines
            loss.backward()
            optimizer.step()
        print(f'Training epoch = {epoch}, epoch loss = {epoch_loss / num_batches}')

        # record the model state_dict() for saving later
        to_save = {
            'epoch': epoch,
            'model_state_dict': model.state_dict()
        }
        torch.save(to_save, save_path.format(epoch))
        print (save_path.format(epoch))
    
train_and_validate(train_iter, valid_iter = None)

## Retrieve similar words

In [None]:
from sklearn.preprocessing import normalize

test_aspect_words = ['phone', 'case', 'battery', 'headset', 'charger', 'quality', 'screen', 'bluetooth', 'price', 'device']
test_sentimental_words = ['great', 'good', 'well', 'works', 'better', 'little', 'easy', 'nice', 'new', 'long']

glove = load_model(save_path.format(0), train_dataset.indexer)
avg_word_vectors = (glove.L_vecs.to('cpu') + glove.R_vecs.to('cpu')) / 2
avg_word_vectors = avg_word_vectors.detach().numpy()

n_words = train_dataset.indexer.n_words

row_normalized = normalize(avg_word_vectors)
sim = row_normalized.dot(row_normalized.T)

for w in test_aspect_words:
    w_idx = train_dataset.indexer.word_to_index[w]
    l = []
    for i in range(n_words):
        l.append((i, sim[w_idx, i]))
    l = sorted(l, key = lambda x:x[1], reverse = True)
    for i in range(10):
        print (f'{train_dataset.indexer.index_to_word[l[i][0]]}: {l[i][1]}')
        
for w in test_sentimental_words:
    w_idx = train_dataset.indexer.word_to_index[w]
    l = []
    for i in range(n_words):
        l.append((i, sim[w_idx, i]))
    l = sorted(l, key = lambda x:x[1], reverse = True)
    for i in range(10):
        print (f'{train_dataset.indexer.index_to_word[l[i][0]]}: {l[i][1]}')