# Lab 6
# Stephen and Zahra
# 3/30/23

Download pre-trained Word2Vec model


In [1]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

corpus = api.load('text8')
model = Word2Vec(corpus)



In [14]:
import matplotlib.pyplot as plt

Getting embedding of a sentence

In [2]:
import numpy as np

def get_sentence_embedding(model, text):
  # This method takes in the trained model and the input sentence
  # and returns the embedding of the sentence as the average embedding
  # of its words
  words = text.split(" ")
  count = 0
  for i in range(1, len(words)):
    try:
      if count == 0:
        vector = model.wv[words[i]]
      else: 
        vector = np.copy(vector+model.wv[words[i]])
      count+=1
    except:
      continue
  return vector/count

# Sample code to extract vector for a sentence
get_sentence_embedding(model, "test text embedding")

array([ 0.7832766 , -0.96575844,  0.47220847, -0.3736778 , -0.02325084,
       -0.7958983 ,  0.4800855 , -0.68430334,  0.01864947,  0.25354746,
        0.4675374 , -0.9785528 , -1.2125181 , -0.6186081 , -0.5388376 ,
        1.8834221 ,  1.7704828 , -1.0579275 ,  1.2803311 , -0.7503278 ,
       -1.9556098 , -0.7494621 ,  0.30723435, -0.5918319 , -0.2440902 ,
        0.6957924 , -1.1008555 , -0.8925937 , -0.52666163, -0.6116081 ,
        0.3110509 ,  0.19051307, -0.24196912, -0.22351381,  0.14732872,
        1.7896966 ,  0.39720222, -0.67234474, -0.12932627,  1.011306  ,
       -0.48131087, -1.2953782 , -0.52418375,  0.43842018, -0.27650204,
       -0.92612255, -0.5635319 , -0.2362394 , -1.7477231 , -0.7285474 ,
        0.74319345,  0.27074164, -0.6248669 , -0.6571851 , -1.0153799 ,
       -1.1947662 ,  0.2783715 , -0.05961734,  0.9347939 ,  0.06033426,
        0.43768364, -0.6868789 ,  0.91989005,  1.1135072 ,  0.42156565,
       -0.6620511 ,  0.7229523 ,  0.8793481 ,  0.08634901,  0.49

Reading TSV file and saving embeddings

In [3]:
import pandas as pd

def read_tweets_get_vectors(tweet_file_path):
  # This method takes in the file path for the twitter file, and return a
  # dicationary of dictionaries. In the first dictionary the keys are the
  # tweet labels (3 classes), and the values are another dictionary with
  # tweet id as the key and values are tuple of (vector, tweet text)
    df = pd.read_csv(tweet_file_path, sep=',', header=0)
    dic_result = {}
    df1 = df[['tweet_id', 'text', 'airline_sentiment']]
    for index in range(len(df1)):
        try:
            vetor_rep = get_sentence_embedding(model, df.loc[index, "text"].lower())
            label = df.loc[index, "airline_sentiment"]
            tweet_id = df.loc[index, "tweet_id"]
            if label in dic_result:
                dic_result[label][tweet_id] = (vetor_rep, df.loc[index, "text"].lower())
            else:
                dic_result[label] = {tweet_id: (vetor_rep, df.loc[index, "text"].lower())}
        except:
            pass
    return dic_result

twitter_data = read_tweets_get_vectors("Tweets.csv")
for key in twitter_data.keys():
  print(key + "\t\t number of instances: " + str(len(twitter_data[key])))

neutral		 number of instances: 3005
positive		 number of instances: 2263
negative		 number of instances: 8997


Code to generate training, validation, and test sets

In [4]:
import random

def split_data(twitter_data):
  # takes in the dictionary from the previous step and generate
  # the training, validation, and test sets. Note that the labels 
  # are represented as one-hot codings.
    training_x = []
    training_y = []

    validation_x = []
    validation_y = []

    test_x = []
    test_y = []

    for label in twitter_data:

        # labels are indicated as one hot coding [negative, neutral, positive]
        if label == "negative":
            n_label = [1, 0, 0]
        elif label == "neutral":
            n_label = [0, 1, 0]
        else:
            n_label = [0, 0, 1]
        temp_dic = twitter_data[label]
        lst_tweet_ids = list(temp_dic.keys())
        #### Splitting by 80-10-10
        ## Note that you could alternatively use sklearn split method
        train_length = int(len(lst_tweet_ids)*0.8)
        train_ids = lst_tweet_ids[ :train_length]
        remaining = lst_tweet_ids[train_length:]
        test_lenght = int(len(remaining)*0.5)
        test_ids = remaining[:test_lenght]
        validation_id = remaining[test_lenght:]

        for tweet_id in train_ids:
            training_x.append(temp_dic[tweet_id][0])
            training_y.append(n_label)
        for tweet_id in validation_id:
            validation_x.append(temp_dic[tweet_id][0])
            validation_y.append(n_label)
        for tweet_id in test_ids:
            test_x.append(temp_dic[tweet_id][0])
            test_y.append(n_label)

    # The reason we apply this shuffling is to make sure 
    # when passing batches to the network, we see different items 
    c = list(zip(training_x, training_y))
    random.shuffle(c)
    training_x, training_y = zip(*c)

    return training_x, training_y, validation_x, validation_y, test_x, test_y

# Sample usage
training_x, training_y, validation_x, validation_y, test_x, test_y = split_data(twitter_data)

Here goes your code for your Feedfoward network Design

In [5]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch


class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Define layers

        self.layer_1 = nn.Linear(input_dim, hidden_dim_1)

        self.relu_1 = nn.ReLU()

        self.layer_2 = nn.Linear(hidden_dim_1, hidden_dim_2)

        self.relu_2 = nn.ReLU()

        self.layer_3 = nn.Linear(hidden_dim_2, output_dim)

    def forward(self, x):
        # Your network forward pass

        # modify this line

        out = self.layer_1(x)
        out = self.relu_1(out)

        out = self.layer_2(out)
        out = self.relu_2(out)

        out = self.layer_3(out)

        return torch.sigmoid(out)

In [15]:
def plot_train_val_losses(train_losses, val_losses):
    # Plot the training and validation losses
    plt.plot(train_losses, label='Train')
    plt.plot(val_losses, label='Validation')
    plt.title('Training and Validation Losses')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

Training the network
Define a class, with properties such as size of hidden layers
loss function, optimizer, training method, test method, and accuracy

In [19]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# here goes your parameters
# sample code to define your model 
#model = FeedforwardNeuralNetModel(input_dim, hidden_dim_1, out_dim)
#model.to(devide)
input_dim = 100
hidden_dim_1 = 16
hidden_dim_2 = 32
hidden_dim_3 = 32
output_dim = 3
num_epochs = 10000
model = FeedforwardNeuralNetModel(input_dim, hidden_dim_1, hidden_dim_2, 
                                  hidden_dim_3, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1e-3)
model.to(device)
criterion = criterion.to(device)


def calculate_accuracy(y_true, y_pred):
  # this method will be used to calculate the accuracy of your model
    correct = (y_true.argmax (dim = 1) == y_pred.argmax (dim = 1)).float()
    acc = correct.sum() / len(correct)
    return acc
def training(tfidfX_train, Y_train, tfidfX_val, Y_val):
  # this method will be used for training your model
  # inputs are the training and validation sets
  # You can define batch size of your choice
  batch_size = 2000
  #print(type(tfidfX_train))
  X_train_mini_batches = torch.split(tfidfX_train, batch_size)
  Y_train_mini_batches = torch.split(Y_train, batch_size)
  train_losses = []
  train_accuracies = []
  val_losses = []
  val_accuracies = []
  best_accuracy = 0
  for epoch in tqdm(range(num_epochs)):
      epoch_loss = 0
      epoch_accuracy = 0
      validation_loss = 0
      val_accuracy = 0
      for X_train_mini_batch, Y_train_mini_batch in zip(X_train_mini_batches, Y_train_mini_batches):
          X_train_mini_batch = X_train_mini_batch.to(device)
          Y_train_mini_batch = Y_train_mini_batch.to(device)
          # Continue code here to train the network
          # here check your validation set
          # you have to save the model with the best loss or maybe accuracy?
          train_prediction = model.forward(X_train_mini_batch.float())
          train_prediction = torch.squeeze(train_prediction)
          train_loss = criterion(train_prediction, Y_train_mini_batch.float())
          optimizer.zero_grad()
          train_loss.backward()
          optimizer.step()
          epoch_loss += train_loss.item()
          epoch_accuracy += calculate_accuracy(Y_train_mini_batch, train_prediction)

      tfidfX_val = tfidfX_val.to(device)
      Y_val = Y_val.to(device)
      val_prediction = model.forward(tfidfX_val.float())
      val_prediction = torch.squeeze(val_prediction)
      val_loss = criterion(val_prediction, Y_val.float())
      validation_loss = val_loss.item()
      val_accuracy = calculate_accuracy(Y_val, val_prediction)
      if val_accuracy > best_accuracy:
        torch.save(model.state_dict(), "best_model_state.bin")
        best_accuracy = val_accuracy
      epoch_loss /= len(X_train_mini_batches)
      epoch_accuracy /= len(X_train_mini_batches)
      val_losses.append(validation_loss)
      train_losses.append(epoch_loss)
      train_accuracies.append(epoch_accuracy)

      #plot_train_val_losses(train_losses, val_losses)

      val_accuracies.append(val_accuracy)

In [20]:
tfidfX_train = torch.tensor(training_x)
Y_train = torch.tensor(training_y)
tfidfX_val = torch.tensor(validation_x)
Y_val = torch.tensor(validation_y)

print(tfidfX_train.shape)
print(Y_train.shape)
training(tfidfX_train, Y_train, tfidfX_val, Y_val)

torch.Size([11411, 100])
torch.Size([11411, 3])


100%|██████████| 10000/10000 [02:46<00:00, 60.05it/s]


In [11]:
from sklearn.metrics import classification_report
def test(tfidfX_test, Y_test):
  tfidfX_test = tfidfX_test.to(device)
  Y_test = Y_test.to(device)

  #print(tfidfX_test.shape)
  #print(Y_test.shape)

  test_prediction = model.forward(tfidfX_test.float())
  test_prediction = torch.squeeze(test_prediction)

  test_accuracy = calculate_accuracy(Y_test, test_prediction)

  print("Test accuracy:", round(test_accuracy.item(),4), "\n")

  #test_prediction = test_prediction.to(device)
  #test_prediction = test_prediction.ge(.5).view(-1).cpu()
  Y_test = Y_test.cpu()

  a = test_prediction.argmax(dim=1).numpy()
  b = Y_test.argmax(dim=1).numpy()

  print(classification_report(b, a))

In [12]:
tfidfX_test = torch.tensor(test_x)
Y_test = torch.tensor(test_y)

test(tfidfX_test, Y_test)

Test accuracy: 0.6311 

              precision    recall  f1-score   support

           0       0.63      1.00      0.77       900
           1       0.00      0.00      0.00       300
           2       0.00      0.00      0.00       226

    accuracy                           0.63      1426
   macro avg       0.21      0.33      0.26      1426
weighted avg       0.40      0.63      0.49      1426



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
