# This script runs the Grasser logit and is intended to be uploaded to AWS to be adapted there

## Set up CUDA

In [None]:
import random
import numpy as np
import torch
import pandas as pd

SEED = 30255  # Single seed shared by every RNG below, for reproducibility

# Seed Python, NumPy and PyTorch from the same value so a fresh kernel replays the same run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed CUDA from SEED as well, so changing SEED above changes every
    # generator consistently instead of leaving the GPU on an unrelated value.
    torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic kernels

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


Import torch and other packages

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.utils.data.dataset import random_split
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from collections import Counter
from csv import reader
import time
import matplotlib.pyplot as plt

## Prepare Dataloader

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV paths
# configured below can be read.
# NOTE(review): google.colab presumably does not exist outside Colab (e.g. on
# AWS) -- this cell would need to be skipped or replaced there; confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Different files:
1. full original test and train
2. concatenate originals, drop dups, split train_test
3. get #2 and add scraped and drop dups, then split

In [None]:
# Replace with local csv paths
import os

# Two candidate data roots; PATH selects the one actually used below.
RPATH = './drive/MyDrive/drugproject/3_data/duplicate_exploration'
APATH = './drive/MyDrive/UChicago/courses/Spring 2021/NLP - Amitabh/drugproject/3_data/duplicate_exploration'
PATH = RPATH

# 1. Original train/test files.
TRAIN_FILE, TEST_FILE = (
    os.path.join(PATH, "train_original.csv"),
    os.path.join(PATH, "test_original.csv"),
)
# 2. Originals with duplicates dropped.
TRAIN_FILE2, TEST_FILE2 = (
    os.path.join(PATH, "train_original_clean.csv"),
    os.path.join(PATH, "test_original_clean.csv"),
)
# 3. De-duplicated data plus the scraped reviews.
TRAIN_FILE3, TEST_FILE3 = (
    os.path.join(PATH, "train_enhanced_clean.csv"),
    os.path.join(PATH, "test_enhanced_clean.csv"),
)

# DataLoader settings shared by every experiment.
BATCH_SIZE, SHUFFLE = 64, True

In [None]:
import os
# Sanity check: the cell displays True when the original training CSV is
# reachable from the current working directory.
os.path.exists(TRAIN_FILE)

True

### Create vocab for BoW

In [None]:
tokenizer = get_tokenizer('basic_english')
counter_reviews = Counter()

# Zero-based position of the review text within each CSV row.
# NOTE(review): hard-coded; presumably the same column DrugReviewDataset finds
# through the 'review' header -- confirm against the file.
REVIEW_COL_NUM = 2

# newline="" hands line-break handling to the csv module, so line breaks inside
# quoted review text are parsed as part of the field.
with open(TRAIN_FILE3, 'r', encoding="utf8", newline="") as f:
    csv_reader = reader(f)
    # The first row holds the column headers; skip it so header text is not
    # counted as if it were review tokens.
    next(csv_reader, None)
    for i, row in enumerate(csv_reader):
        tokens = tokenizer(row[REVIEW_COL_NUM])
        counter_reviews.update(tokens)
        if not i % 10000:
            print(f"{i} examples completed")

0 examples completed
10000 examples completed
20000 examples completed
30000 examples completed
40000 examples completed
50000 examples completed
60000 examples completed
70000 examples completed
80000 examples completed
90000 examples completed
100000 examples completed
110000 examples completed
120000 examples completed
130000 examples completed


In [None]:
# Turn the token counts gathered from TRAIN_FILE3 into the vocabulary used by
# the bag-of-words collate function.
# NOTE(review): min_freq=100 presumably drops tokens seen fewer than 100 times;
# the threshold is a tunable magic number -- confirm it is intentional.
vocab_reviews = Vocab(counter_reviews, min_freq=100)

#### Make Collate
Typical collate functions that are passed to dataloaders can only take one argument: batch.  Because we want to give ourselves flexibility to adjust tokenizers, word-embedding classes, and vocab classes, we need to be able to pass additional arguments to our collate functions.  We can get around this by creating the collate function with a factory such as make_collate_bow, which captures those extra arguments and returns a one-argument function.  See collate.ipynb for more details.  An example of using make_collate_bow is below.

In [None]:
# Maps the label strings carried by each example to class indices.
# NOTE(review): "Postive" maps to the same class as "Positive"; presumably a
# misspelling that occurs in the data -- confirm before removing it.
SENTIMENT_CATEGORIES = {
    "Negative":0,
    "Neutral":1,
    "Positive":2,
    "Postive": 2
}

def make_collate_bow(vocab, tokenizer, device='cpu'):
    """
    Build a bag-of-words collate function for a DataLoader.

    vocab: object supporting len(vocab) and vocab.stoi[token] -> column index
    tokenizer: callable mapping a text string to an iterable of tokens
    device: torch.device object (or string) with argument 'cpu' or 'cuda'

    Returns collate_bow(batch), which closes over the three arguments above.
    """

    def collate_bow(batch):
        """
        batch: sequence of (label, text) pairs, label being a key of
        SENTIMENT_CATEGORIES.

        Returns (labels, features) on `device`: labels is an int64 tensor of
        class indices, features a (len(batch), len(vocab)) tensor of token
        frequencies. Each row sums to 1, except rows for texts that produced
        no tokens, which are all zeros.
        """
        labels = [SENTIMENT_CATEGORIES[label] for label, _ in batch]

        # Allocate the whole batch matrix once instead of growing it with
        # torch.cat for every example.
        counts = torch.zeros(len(batch), len(vocab))
        for row_idx, (_, text) in enumerate(batch):
            for token in tokenizer(text):
                counts[row_idx, vocab.stoi[token]] += 1

        # Normalise counts to frequencies. A text with no tokens has a total of
        # zero; dividing by it would give NaN features (and a NaN loss), so
        # treat the total as 1 and leave that row as zeros.
        totals = torch.sum(counts, 1).unsqueeze(1)
        totals[totals == 0] = 1
        text_tensor = counts / totals

        return torch.tensor(labels, dtype=torch.long).to(device), text_tensor.to(device)

    return collate_bow

# Place batches on the device chosen during setup rather than a hard-coded
# 'cuda', so the notebook also runs on machines without a GPU.
collate_fn = make_collate_bow(vocab_reviews, tokenizer, device=device)

## Data viz function

In [None]:
import altair as alt
import pandas as pd

def training_lines(training_accuracies, validation_accuracies, test_accuracy, title=None):
    '''
    Build an Altair chart of accuracy per epoch.

    Inputs:
    training_accuracies: list of length n of the training accuracies obtained at each epoch
    validation_accuracies: list of length n of the validation accuracies obtained at each epoch
    test_accuracy: single test accuracy value, drawn as a dashed horizontal line
    title: optional chart title; an empty title is used when omitted

    Returns the layered chart: training/validation curves plus the test line.
    '''
    epochs = [position + 1 for position in range(len(training_accuracies))]

    def labelled_frame(accuracies, dataset_label):
        # One row per epoch, tagged with the series name used for colouring.
        frame = pd.DataFrame({"accuracy": accuracies, "epoch": epochs})
        frame["Dataset"] = dataset_label
        return frame

    train_df = labelled_frame(training_accuracies, "Training accuracy")
    valid_df = labelled_frame(validation_accuracies, "Validation accuracy")

    # Constant series so the test accuracy spans every epoch on the x axis.
    test_df = pd.DataFrame({"epoch": epochs, "accuracy": [test_accuracy] * len(epochs)})
    test_df['Dataset'] = "Optimum Test accuracy"

    df = pd.concat([train_df, valid_df])
    df['epoch'] = df['epoch'].apply(int)

    # Start the y axis slightly below the lowest value that will be drawn.
    lowest = min(min(training_accuracies), min(validation_accuracies), test_accuracy)
    min_y_axis = lowest - 0.1

    def shared_encoding():
        # Fresh encoding objects per chart, identical for both layers.
        return dict(
            x=alt.X("epoch:Q", axis=alt.Axis(tickMinStep=1)),
            y=alt.Y("accuracy", scale=alt.Scale(domain=[min_y_axis, 1])),
            color='Dataset',
        )

    line_chart = alt.Chart(df).mark_line(point=True).encode(**shared_encoding())
    test_line = alt.Chart(test_df).mark_line(strokeDash=[7,1]).encode(**shared_encoding())

    layered = line_chart + test_line
    return layered.configure_point(size=10).properties(title={'text': title or ""})

## Dataloader

In [None]:
class DrugReviewDataset(Dataset):
    """In-memory dataset of (target, text) pairs read from two columns of a CSV file."""

    def __init__(self, csv_file, x_colname, target_colname):
        """
        Load the text and target columns of csv_file into memory.

        the following are assumed about csv_file:
            - it is UTF-8 encoded
            - headers are in first row
            - the headers include x_colname (the text data) and
              target_colname (the label)

        Blank rows are skipped. Raises ValueError if the file has no header
        row or if either column name is missing from it.
        """
        self.x = []
        self.target = []
        # newline="" hands line-break handling to the csv module, so line
        # breaks inside quoted text are kept as part of the field.
        with open(csv_file, 'r', encoding="utf8", newline="") as f:
            rows = reader(f)
            header = next(rows, None)
            if header is None:
                raise ValueError(f"{csv_file} is empty; expected a header row")

            target_colnum = header.index(target_colname)
            x_colnum = header.index(x_colname)
            # Stream the rows instead of materialising the whole file first.
            for row in rows:
                if not row:  # the csv reader yields [] for a blank line
                    continue
                self.x.append(row[x_colnum])
                self.target.append(row[target_colnum])

    def __len__(self):
        """Number of examples loaded."""
        return len(self.x)

    def __getitem__(self, idx):
        """
        idx is an integer position; the data are stored in plain Python
        lists, so a list of indices is not supported.

        Returns the pair (target, text) for that position.
        """
        example = (self.target[idx], self.x[idx])

        return example

def get_dataloader(data_file, batch_size, shuffle, collate=None, split=None,
                   train_sample=None, valid_sample=None,
                   x_colname='review', target_colname='rating_category'):
    """
    Build DataLoader(s) over a DrugReviewDataset read from data_file.

    data_file: path to input file (should be a csv)
    batch_size: (int) parameter for DataLoader class
    shuffle: (bool) parameter for DataLoader class; only used when split is
        falsy, because the split loaders are driven by samplers instead
    collate: (fn) collate_fn parameter for DataLoader class
    split: (bool) specifies if there is to be a train-validation split on data
    train_sample: sampler for the training loader (used when split is truthy)
    valid_sample: sampler for the validation loader (used when split is truthy)
    x_colname: header of the text column (defaults to 'review')
    target_colname: header of the label column (defaults to 'rating_category')

    Returns (train_dl, valid_dl) when split is truthy, otherwise one DataLoader.
    With split truthy and a sampler left as None, that loader walks the whole
    dataset in order.
    """
    ds = DrugReviewDataset(data_file, x_colname, target_colname)

    if not split:
        return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)

    # Both loaders share one dataset object; the samplers decide which rows
    # each one sees. shuffle stays False because DataLoader does not accept
    # shuffle=True together with an explicit sampler.
    train_dl = DataLoader(ds, batch_size=batch_size, shuffle=False,
                          collate_fn=collate, sampler=train_sample)
    valid_dl = DataLoader(ds, batch_size=batch_size, shuffle=False,
                          collate_fn=collate, sampler=valid_sample)
    return train_dl, valid_dl



Split train and validation

In [None]:
import numpy as np
import pandas as pd

# Hold out the first 20% of rows (by file position) for validation and train
# on the remainder.
validation_split = 0.2
dataset_size = len(pd.read_csv(TRAIN_FILE))
split = int(np.floor(validation_split * dataset_size))  # number of validation rows

indices = list(range(dataset_size))
valid_indices = indices[:split]
train_indices = indices[split:]

# One sampler per subset, built in train/validation order.
train_sampler, valid_sampler = (
    SubsetRandomSampler(subset) for subset in (train_indices, valid_indices)
)

In [None]:
# Train and validation loaders over the original training file; the samplers
# decide which rows each loader draws.
train_dataloader, valid_dataloader = get_dataloader(
    data_file=TRAIN_FILE,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    collate=collate_fn,
    split=True,
    train_sample=train_sampler,
    valid_sample=valid_sampler,
)

In [None]:
# Single loader over the original test file (no train/validation split).
test_dataloader = get_dataloader(
    data_file=TEST_FILE,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    collate=collate_fn,
)

In [None]:
# Default criterion for train_an_epoch; bound as the default argument when the
# function below is defined.
loss_function = torch.nn.CrossEntropyLoss()

def train_an_epoch(optimizer, model, dataloader, loss_function=loss_function):
    """
    Run one optimisation pass over dataloader.

    optimizer: optimizer whose step() is called once per batch
    model: module called on each batch's features; put into train mode first
    dataloader: iterable of (label, text) batches
    loss_function: callable(model_output, label) returning a loss to backpropagate

    Prints the current batch loss every 500 iterations (not at iteration 0).
    Returns None.
    """
    log_interval = 500
    model.train()

    for idx, batch in enumerate(dataloader):
        label, text = batch
        model.zero_grad()
        scores = model(text)
        loss = loss_function(scores, label)
        loss.backward()
        optimizer.step()
        if idx > 0 and idx % log_interval == 0:
            print(f'At iteration {idx} the loss is {loss:.3f}.')


def get_accuracy(model, dataloader):
    """
    Fraction of examples in dataloader whose arg-max model output equals the label.

    model: module called on each batch's features; switched to eval mode and
        left in eval mode afterwards
    dataloader: iterable of (label, features) batches

    Returns correct / total as a float. Raises ZeroDivisionError when the
    dataloader yields no examples.
    """
    model.eval()
    correct, seen = 0, 0
    with torch.no_grad():
        for label, features in dataloader:
            # Call the module itself rather than .forward() so registered
            # hooks run, matching how train_an_epoch invokes the model.
            scores = model(features)
            correct += (scores.argmax(1) == label).sum().item()
            seen += label.size(0)
    return correct / seen

## LOGIT

In [None]:
# Step 3. Create Model Class
class LogisticRegression(torch.nn.Module):
    """Logistic-regression classifier: a single linear layer returning raw class scores."""

    def __init__(self, input_dim, output_dim):
        """
        input_dim: number of features per example
        output_dim: number of output scores per example
        """
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear layer's output for x; no softmax is applied here."""
        return self.linear(x)

In [None]:
# Peek at up to five training batches to confirm tensor shapes and to read the
# feature width the model needs.
for idx, (lt, tt) in enumerate(train_dataloader):
    print(lt.shape, tt.shape)
    # Record the width on every batch so input_d is defined even when the
    # loader yields fewer than five batches.
    input_d = tt.shape[1]
    if idx == 4:
        break

torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])


In [None]:
# Step 4. Set hyperparameters
EPOCHS = 200         # passes over the training loader
lr_rate = 0.001      # learning rate handed to the optimizer
input_dim = input_d  # feature width read from a training batch
output_dim = 3 # number of classes to predict

# Declared here but not referenced by the cells visible in this notebook.
batch_size = 64
n_iters = 3000

In [None]:
# Step 5. Instantiate Loss and Model Class
loss_function = torch.nn.CrossEntropyLoss() # computes softmax and then the cross entropy
model = LogisticRegression(input_dim, output_dim).to(device)

# Step 6. Instantiate Optimizer Class
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr_rate)

In [None]:
# Train the first model (original dataset) for EPOCHS epochs, recording the
# train / validation / test accuracy after every epoch.
# NOTE(review): test accuracy is measured every epoch and max(test_accuracies)
# is what gets plotted afterwards, so the reported test figure is selected on
# the test set itself -- confirm this is intended.
train_accuracies = []
valid_accuracies=[]
test_accuracies=[]
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_an_epoch(optimizer, model, train_dataloader, loss_function)
    # Accuracy on the training sampler after this epoch's updates.
    t_accuracy = get_accuracy(model, train_dataloader)
    train_accuracies.append(t_accuracy)
    print("Train accuracy:", t_accuracy)
    v_accuracy = get_accuracy(model, valid_dataloader)
    valid_accuracies.append(v_accuracy)
    test_accuracy = get_accuracy(model, test_dataloader)
    print("Test set accuracy is:{}".format(test_accuracy))
    test_accuracies.append(test_accuracy)
    # The elapsed time covers training plus all three evaluation passes.
    time_taken = time.time() - epoch_start_time
    print(f'Epoch: {epoch}, time taken: {time_taken:.1f}s, validation accuracy: {v_accuracy:.3f}.')

At iteration 500 the loss is 0.909.
At iteration 1000 the loss is 0.838.
At iteration 1500 the loss is 0.880.
At iteration 2000 the loss is 0.750.
Train accuracy: 0.6626342627753065
Test set accuracy is:0.6593200163672209
Epoch: 1, time taken: 327.6s, validation accuracy: 0.663.
At iteration 500 the loss is 0.768.
At iteration 1000 the loss is 0.800.
At iteration 1500 the loss is 0.865.
At iteration 2000 the loss is 0.767.
Train accuracy: 0.6651838993164804
Test set accuracy is:0.6613845180969385
Epoch: 2, time taken: 326.5s, validation accuracy: 0.665.
At iteration 500 the loss is 0.815.
At iteration 1000 the loss is 0.821.
At iteration 1500 the loss is 0.802.
At iteration 2000 the loss is 0.801.
Train accuracy: 0.6705931586044421
Test set accuracy is:0.6667596622400773
Epoch: 3, time taken: 329.6s, validation accuracy: 0.670.
At iteration 500 the loss is 0.817.
At iteration 1000 the loss is 0.843.
At iteration 1500 the loss is 0.787.
At iteration 2000 the loss is 0.666.
Train accurac

Epoch 194 Test set accuracy is:0.7815719971729346, begins to fall after

In [None]:
# Plot the per-epoch curves for the first model; the dashed line marks the
# highest test accuracy reached in any epoch.
training_lines(
    train_accuracies,
    valid_accuracies,
    test_accuracy=max(test_accuracies),
    title="Logistic Regression with original dataset",
)

Second Model using original dataset without duplicates

In [None]:
# Same positional split as before, now sized to the de-duplicated training
# file: the first validation_split share of rows is held out.
dataset_size = len(pd.read_csv(TRAIN_FILE2))
split = int(np.floor(validation_split * dataset_size))  # number of validation rows

indices = list(range(dataset_size))
valid_indices = indices[:split]
train_indices = indices[split:]

train_sampler2, valid_sampler2 = (
    SubsetRandomSampler(subset) for subset in (train_indices, valid_indices)
)

In [None]:
# Train/validation loaders over the de-duplicated training file ...
train_dataloader2, valid_dataloader2 = get_dataloader(
    data_file=TRAIN_FILE2,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    collate=collate_fn,
    split=True,
    train_sample=train_sampler2,
    valid_sample=valid_sampler2,
)
# ... and a single loader over the matching test file.
test_dataloader2 = get_dataloader(
    data_file=TEST_FILE2,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    collate=collate_fn,
)

In [None]:
# Peek at up to five batches of the second training loader to confirm shapes
# and read the feature width.
for idx, (lt, tt) in enumerate(train_dataloader2):
    print(lt.shape, tt.shape)
    # Record the width on every batch so input_d reflects this loader even
    # when it yields fewer than five batches.
    input_d = tt.shape[1]
    if idx == 4:
        break
input_dim = input_d

torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])


started 16:09. If one epoch takes 205 seconds, then 200 epochs take roughly 11.5 hours in total. Finish at about 3:39

In [None]:
# Second experiment: a fresh model and optimizer trained on the de-duplicated
# original dataset.
model2 = LogisticRegression(input_dim, output_dim).to(device)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=lr_rate)
loss_function = torch.nn.CrossEntropyLoss()
EPOCHS = 200
max_test_acc = 0  # best test accuracy seen so far in this run

train_accuracies2 = []
valid_accuracies2=[]
test_accuracies2=[]
# NOTE(review): test accuracy is evaluated every epoch and its running maximum
# is tracked, so the "highest test acc" is selected on the test set itself --
# confirm this is intended.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_an_epoch(optimizer2, model2, train_dataloader2, loss_function)
    t_accuracy = get_accuracy(model2, train_dataloader2)
    train_accuracies2.append(t_accuracy)
    v_accuracy = get_accuracy(model2, valid_dataloader2)
    valid_accuracies2.append(v_accuracy)
    test_accuracy = get_accuracy(model2, test_dataloader2)
    print("Test set accuracy is:{}".format(test_accuracy))
    # Announce every epoch that sets a new best test accuracy.
    if test_accuracy > max_test_acc:
        max_test_acc = test_accuracy
        print("Epoch {} new highest test acc".format(epoch))
    test_accuracies2.append(test_accuracy)
    # The elapsed time covers training plus all three evaluation passes.
    time_taken = time.time() - epoch_start_time
    print(f'Epoch: {epoch}, time taken: {time_taken:.1f}s, validation accuracy: {v_accuracy:.3f}.')

At iteration 500 the loss is 0.784.
At iteration 1000 the loss is 0.875.
Test set accuracy is:0.662359900373599
Epoch 1 new highest test acc
Epoch: 1, time taken: 203.0s, validation accuracy: 0.664.
At iteration 500 the loss is 0.894.
At iteration 1000 the loss is 0.905.
Test set accuracy is:0.662640099626401
Epoch 2 new highest test acc
Epoch: 2, time taken: 204.3s, validation accuracy: 0.665.
At iteration 500 the loss is 0.994.
At iteration 1000 the loss is 0.747.
Test set accuracy is:0.6635429638854297
Epoch 3 new highest test acc
Epoch: 3, time taken: 204.9s, validation accuracy: 0.666.
At iteration 500 the loss is 0.858.
At iteration 1000 the loss is 0.699.
Test set accuracy is:0.6665940224159402
Epoch 4 new highest test acc
Epoch: 4, time taken: 206.3s, validation accuracy: 0.669.
At iteration 500 the loss is 0.734.
At iteration 1000 the loss is 0.816.
Test set accuracy is:0.6701743462017434
Epoch 5 new highest test acc
Epoch: 5, time taken: 207.2s, validation accuracy: 0.673.
At

Epoch 200 Test set accuracy is:0.7752490660024907

In [None]:
# Plot the per-epoch curves for the second model; the dashed line marks the
# highest test accuracy reached in any epoch.
training_lines(
    train_accuracies2,
    valid_accuracies2,
    test_accuracy=max(test_accuracies2),
    title="Logistic Regression with original cleaned dataset",
)

Third model with cleaned dataset and newly scraped data

In [None]:
# Same positional split again, sized to the enhanced (cleaned + scraped)
# training file: the first validation_split share of rows is held out.
dataset_size = len(pd.read_csv(TRAIN_FILE3))
split = int(np.floor(validation_split * dataset_size))  # number of validation rows

indices = list(range(dataset_size))
valid_indices = indices[:split]
train_indices = indices[split:]

train_sampler3, valid_sampler3 = (
    SubsetRandomSampler(subset) for subset in (train_indices, valid_indices)
)

In [None]:
# Train/validation loaders over the enhanced training file ...
train_dataloader3, valid_dataloader3 = get_dataloader(
    data_file=TRAIN_FILE3,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    collate=collate_fn,
    split=True,
    train_sample=train_sampler3,
    valid_sample=valid_sampler3,
)
# ... and a single loader over the matching test file.
test_dataloader3 = get_dataloader(
    data_file=TEST_FILE3,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    collate=collate_fn,
)

In [None]:
# Peek at up to five batches of the third training loader to confirm shapes
# and read the feature width.
for idx, (lt, tt) in enumerate(train_dataloader3):
    print(lt.shape, tt.shape)
    # Record the width on every batch so input_d reflects this loader even
    # when it yields fewer than five batches.
    input_d = tt.shape[1]
    if idx == 4:
        break
input_dim = input_d

torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])
torch.Size([64]) torch.Size([64, 4683])


started 6:40 ish

In [None]:
# Third experiment: a fresh model and optimizer trained on the cleaned dataset
# plus the scraped reviews.
# loss_function is not redefined in this cell; the loop uses whatever
# module-level loss_function an earlier cell created.
model3 = LogisticRegression(input_dim, output_dim).to(device)
optimizer3 = torch.optim.Adam(model3.parameters(), lr=lr_rate)
EPOCHS = 200
max_test_acc = 0  # best test accuracy seen so far in this run

train_accuracies3 = []
valid_accuracies3 = []
test_accuracies3 = []
# NOTE(review): test accuracy is evaluated every epoch and its running maximum
# is tracked, so the "highest test acc" is selected on the test set itself --
# confirm this is intended.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_an_epoch(optimizer3, model3, train_dataloader3, loss_function)
    t_accuracy = get_accuracy(model3, train_dataloader3)
    train_accuracies3.append(t_accuracy)
    v_accuracy = get_accuracy(model3, valid_dataloader3)
    valid_accuracies3.append(v_accuracy)
    test_accuracy = get_accuracy(model3, test_dataloader3)
    print("Test set accuracy is:{}".format(test_accuracy))
    # Announce every epoch that sets a new best test accuracy.
    if test_accuracy > max_test_acc:
        max_test_acc = test_accuracy
        print("Epoch {} new highest test acc".format(epoch))
    test_accuracies3.append(test_accuracy)
    # The elapsed time covers training plus all three evaluation passes.
    time_taken = time.time() - epoch_start_time
    print(f'Epoch: {epoch}, time taken: {time_taken:.1f}s, validation accuracy: {v_accuracy:.3f}.')

At iteration 500 the loss is 0.865.
At iteration 1000 the loss is 0.760.
At iteration 1500 the loss is 0.774.
Test set accuracy is:0.6199351351351351
Epoch 1 new highest test acc
Epoch: 1, time taken: 301.5s, validation accuracy: 0.622.
At iteration 500 the loss is 0.963.
At iteration 1000 the loss is 0.775.
At iteration 1500 the loss is 0.867.
Test set accuracy is:0.623372972972973
Epoch 2 new highest test acc
Epoch: 2, time taken: 316.9s, validation accuracy: 0.625.
At iteration 500 the loss is 0.843.
At iteration 1000 the loss is 0.754.
At iteration 1500 the loss is 0.885.
Test set accuracy is:0.6328648648648648
Epoch 3 new highest test acc
Epoch: 3, time taken: 319.0s, validation accuracy: 0.634.
At iteration 500 the loss is 0.854.
At iteration 1000 the loss is 0.736.
At iteration 1500 the loss is 0.746.
Test set accuracy is:0.6469621621621622
Epoch 4 new highest test acc
Epoch: 4, time taken: 321.4s, validation accuracy: 0.648.
At iteration 500 the loss is 0.872.
At iteration 1000

In [None]:
 training_lines(train_accuracies3, valid_accuracies3, max(test_accuracies3), "Logistic Regression with cleaned and enhanced dataset")

Highest test set accuracy epoch 161: 0.76