<a href="https://colab.research.google.com/github/ashtheflash1212/Sentiment-Analysis-Model/blob/main/Sentiment_Analysis_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

Corresponding sentiment as a number between -1 and 1 (completely negative to completely positive).


First, we'll download the raw text file.

In [1]:
!git clone https://github.com/gptandchill/sentiment-analysis
%cd sentiment-analysis

fatal: destination path 'sentiment-analysis' already exists and is not an empty directory.
/content/sentiment-analysis


In [2]:
import torch
import torch.nn as nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

list_of_strings = []
list_of_labels = []

import csv
with open('EcoPreprocessed.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
      list_of_strings.append(row[1])
      list_of_labels.append(float(row[2]))


In [3]:
def get_dataset(list_of_strings):

    # First let's get the total set of words
    # Ex. What a movie, The movie sucks, The movie is disappointing
    words = set()
    for sentence in list_of_strings:
        for word in sentence.split():
            words.add(word)
    # words = What a movie The movie sucks The movie is disappointing
    vocab_size = len(words)

    # Now let's build a mapping
    sorted_list = sorted(list(words))
    # words sorted in lexicological order
    word_to_int = {}
    for i, c in enumerate(sorted_list):
        word_to_int[c] = i + 1
    # words mapped to an index starting from 1, Ex. "first word" : 1, "second word" : 2
    # Write encode() which is used to build the dataset

    def encode(sentence):
        integers = []
        for word in sentence.split():
        # What a movie
            integers.append(word_to_int[word])
            # finds the 'index' mapped to the sentence so for ex. "What", "a", "movie" could have indexes of "4853", "2", "1345"
        return integers

    var_len_tensors = []
    for sentence in list_of_strings:
        var_len_tensors.append(torch.tensor(encode(sentence)))
        # turning the integers array into a tensor

    return vocab_size + 1, nn.utils.rnn.pad_sequence(var_len_tensors, batch_first = True), word_to_int
    # .pad_sequence, to make sure the matrix is rectangular so padded with 0

In [4]:
vocab_size, training_dataset, word_to_int = get_dataset(list_of_strings)
training_labels = torch.unsqueeze(torch.tensor(list_of_labels), dim = -1)

In [5]:
class EmotionPredictor(nn.Module):
    def __init__(self, vocabulary_size: int, embedding_dimension: int):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocabulary_size, embedding_dimension)
        self.linear_layer = nn.Linear(embedding_dimension, 1)
        self.tanh = nn.Tanh() # instead of sigmoid here to give output fro -1 to 1 instead of from 0 to 1

    def forward(self, x):
        embeddings = self.embedding_layer(x)
        averaged = torch.mean(embeddings, axis = 1)
        projected = self.linear_layer(averaged)
        return self.tanh(projected)

In [31]:
embedding_dimension = 256
model = EmotionPredictor(vocab_size, embedding_dimension)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters()) # does gradient descent, updating the "w's" and also updating the learning rate

for i in range(1000):
  randperm = torch.randperm(len(training_dataset))
  training_dataset, training_labels = training_dataset[randperm], training_labels[randperm]
  pred = model(training_dataset) # calls forward method
  optimizer.zero_grad() # recalculates derivatives to update weights for each iteration
  loss = loss_function(pred, training_labels)
  if i % 100 == 0:
    print(loss.item())
  loss.backward() # calculates every derivative to perform gradient descent for use
  optimizer.step() # new_w = old_w - derivative * learning_rate, trying to minimize loss after calculating derivatives previously

0.6412087082862854
0.12701615691184998
0.12042686343193054
0.11218445003032684
0.10190583020448685
0.09074810147285461
0.08028310537338257
0.07125940173864365
0.06369207054376602
0.057357057929039


Some new examples...


In [35]:
example_one = " worst movie ever "

example_two = "best movie ever"

example_three = "weird but funny movie"

example_four = "horrible movie"



examples = [example_one] + [example_two] + [example_three] + [example_four]

# Let's encode these strings as numbers using the dictionary from earlier
var_len = []
for example in examples:
  int_version = []
  for word in example.split():
    int_version.append(word_to_int[word])
  var_len.append(torch.tensor(int_version))

testing_tensor = torch.nn.utils.rnn.pad_sequence(var_len, batch_first=True)
model.eval()

print(model(testing_tensor).tolist())


[[-0.9999999403953552], [0.9999993443489075], [0.5148182511329651], [-0.9616370797157288]]
