<a href="https://colab.research.google.com/github/axel-sirota/implement-nlp-word-embedding/blob/main/module3/Module3_Demo4_UsingGlove_For_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysing Sentiment

Let's first import everything and load the dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob, Word
import nltk
import torch
from torch import nn
import seaborn as sns
nltk.download('punkt')

%matplotlib inline
sns.set(rc={'figure.figsize':(20,20)})
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
fi
if [ ! -f glove.6B.100d.txt ]; then
  wget -O glove.6B.100d.txt https://www.dropbox.com/s/dl1vswq2sz5f1ws/glove.6B.100d.txt?dl=0
fi

Writing get_data.sh


In [3]:
!bash get_data.sh


--2022-05-25 20:26:16--  https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8091185 (7.7M) [text/plain]
Saving to: ‘yelp.csv’


2022-05-25 20:26:16 (88.9 MB/s) - ‘yelp.csv’ saved [8091185/8091185]

--2022-05-25 20:26:16--  https://www.dropbox.com/s/dl1vswq2sz5f1ws/glove.6B.100d.txt?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/dl1vswq2sz5f1ws/glove.6B.100d.txt [following]
--2022-05-25 20:26:17--  https://www.dropbox.com/s/raw/dl1vswq2sz5f1ws/glove.6B.100d.txt

In [4]:
CORPUS_SIZE = 500
EMBEDDING_DIM = 100
path = 'yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)].reset_index(drop=True)[:CORPUS_SIZE]
yelp_best_worst


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
3,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0
4,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7,4
...,...,...,...,...,...,...,...,...,...,...
495,ScvhAJe5lIJ7_SIvNTDfgg,2012-08-24,OT9779o629UEC8gmI8Fa4g,5,So yummy!! My favorite Mexican place!! Love th...,review,I7zvmDRYtsLUHsLi50wDNA,1,1,0
496,YSesarsmfOIKwx6eiRqFZQ,2009-04-30,x6b0jru4OlUzfzeoNxvZEg,5,I have been taking vehicles to this place for ...,review,1f8_7NDng8w5CV7bshe63A,0,1,0
497,5nahZe5bBYUbFWgEfwoNOA,2011-03-28,n-MKaPksVzxBoTlda0gYzA,5,Thank you Sunflower for having a decent select...,review,BfetZm9fa0zqyAFn8vo_6w,0,0,0
498,gUt-pPUpOVVhaCFC8-E4yQ,2009-01-21,fZ6ktQYONjrOcK4SjTxS8w,5,Private Karaoke at Geisha A Go Go is the coole...,review,8gM0VBG-5vlS7teGszclKQ,2,2,1


## Doing the train_test split and defining model

In [5]:
# convert glove to word2vec format
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.6B.100d.txt", word2vec_output_file="emb_word2vec_format.txt")

(400001, 100)

In [6]:
import gensim
import torch
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('emb_word2vec_format.txt')

In [7]:
weights = torch.FloatTensor(word2vec_model.vectors).to(device)
tokenizer = lambda x: TextBlob(x).words

In [8]:
def get_maximum_review_length(df):
  maximum = 0
  for ix, row in df.iterrows():
    candidate = len(tokenizer(row.text))
    if candidate > maximum:
      maximum = candidate
  return maximum

In [9]:
maximum = get_maximum_review_length(yelp_best_worst)

In [10]:
weights.shape

torch.Size([400001, 100])

In [11]:
vocab_size = len(word2vec_model.index2word)

In [12]:
X_pre_split = torch.zeros(len(yelp_best_worst), maximum).type(torch.LongTensor).to(device)
for index, row in yelp_best_worst.iterrows():
  ix = 0
  for word in tokenizer(row.text):
    if word not in word2vec_model:
      representation = 0
    else:
      representation = word2vec_model.index2word.index(word)
    X_pre_split[index, ix] = representation
    ix += 1

In [13]:
X_pre_split[5]

tensor([    0,   970,    26,  3006,  9870,    17,  1174,    13,     0,  8172,
            0, 13921,  1410,    17,   181,     4,  4603,     0,    17,     0,
         7635, 13174, 40437,     5,     7, 20065,     0,   242,    13,   207,
           81,    86,    84,  1078,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [14]:
from sklearn.model_selection import train_test_split
X = X_pre_split
y = yelp_best_worst.stars.map({1:0, 5:1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [15]:
y_train = torch.Tensor(y_train.values).type(torch.LongTensor).to(device)
y_test = torch.Tensor(y_test.values).type(torch.LongTensor).to(device)

In [16]:
class SentimentPretrained(nn.Module):
    def __init__(self, embedding_dim):
        super(SentimentPretrained, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding.from_pretrained(weights)
        self.linear = nn.Linear(embedding_dim, 2)
        self.activation_function = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = torch.sum(self.embeddings(inputs), dim=1).squeeze()
        out = self.linear(embeds)
        out = self.activation_function(out)
        return out

In [17]:
model = SentimentPretrained(EMBEDDING_DIM).to(device)

In [18]:
def loss(y_pred, y):
  return nn.functional.nll_loss(y_pred, y)

def metric(y_pred, y):  # -> accuracy
  return (1 / len(y)) * ((y_pred.argmax(dim = 1) == y).sum())

optimizer = torch.optim.AdamW(model.parameters())

## Let's verify the metric makes sense

In [19]:
X_train[5]

tensor([    0,  1409,  1290,  2821,   285,   197,   219,    44, 22078,    32,
            0,   977,    20,    34,    20,   119,    70,    88,  1096,    10,
          285,     0, 12611, 20890,   117,   645,    81,    86,   169,    22,
            0,    34,  1495,     0,   626,     0,   134,   175,     7,     0,
            3,     0,     0,     0,   287,     0,   565,     4,  4374,   117,
           36,   120,   130,  4018, 12611,     0,     0,    40,     0,  5795,
           17, 60128, 15235,    13,     7, 71585,    17,   232,  1051,     3,
        12617,    48,    15, 30580,     5,     0,    68,    15, 38877,     0,
           20,    15,     0, 71585,  5872,     0,   269,     0, 30580, 12617,
           15,   116,    63,    34,     0,   346,    10,  1085,     0, 38877,
           15,    36,   116,    63,    20, 20890,   117,   645,     0,    40,
           22,     7, 13710,    42,    15,     6,     7, 36472,     0,    15,
           70,  1096,     4, 36887,   163,    59,     0,   238, 

In [20]:
weights.shape

torch.Size([400001, 100])

In [21]:
nn.Embedding.from_pretrained(weights)(X_train)

tensor([[[-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [ 0.5651,  0.5322, -0.4899,  ..., -0.1709,  1.2975,  0.0952],
         [-0.0720,  0.2313,  0.0237,  ..., -0.7189,  0.8689,  0.1954],
         ...,
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706]],

        [[-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.5706,  0.4418,  0.7010,  ..., -0.6610,  0.4720,  0.3725],
         ...,
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706]],

        [[-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.1230,  0.2373,  0.6737,  ...,  0

In [22]:
y_train_pred = model(X_train).to(device)
y_train_pred.argmax(dim = 1)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [23]:
y_train

tensor([1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
        1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
        0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
        1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,

In [24]:
(y_train_pred.argmax(dim = 1) == y_train).sum()

tensor(313, device='cuda:0')

In [25]:
metric(y_train_pred, y_train[:CORPUS_SIZE])

tensor(0.7825, device='cuda:0')

In [26]:
del y_train_pred

## The training routine

In [27]:
optimizer = torch.optim.AdamW(model.parameters())

In [28]:
epochs = 1000
for i in range(epochs):
  y_pred = model(X_train)
  xe = loss(y_pred, y_train)
  accuracy = metric(y_pred, y_train)
  xe.backward()
  if i % 100 == 0:
    print("Loss: ", xe, " Accuracy ", accuracy.data.item())
  optimizer.step()
  optimizer.zero_grad()

Loss:  tensor(39.9376, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7824999690055847
Loss:  tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7724999785423279
Loss:  tensor(0.8219, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7774999737739563
Loss:  tensor(0.7209, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7849999666213989
Loss:  tensor(0.6424, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.8025000095367432
Loss:  tensor(0.6436, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7799999713897705
Loss:  tensor(0.6014, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7849999666213989
Loss:  tensor(0.5662, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7999999523162842
Loss:  tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7999999523162842
Loss:  tensor(0.5108, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.8025000095367432


In [29]:
y_test_pred = model(X_test)
print(f'Model accuracy is {metric(y_test_pred, y_test)}')

Model accuracy is 0.8299999833106995


# Some manual validation

In [42]:
review = np.array(["This place was fantastic", "I had such a bad time"])
X_val = torch.zeros(len(review), maximum).type(torch.LongTensor).to(device)
for index, text in pd.Series(review).iteritems():
  ix = 0
  for word in tokenizer(text):
    if word not in word2vec_model:
      representation = 0
    else:
      representation = word2vec_model.index2word.index(word)
    X_val[index, ix] = representation
    ix += 1
X_val

tensor([[  0, 241,  15,  ...,   0,   0,   0],
        [  0,  40, 125,  ...,   0,   0,   0]], device='cuda:0')

In [44]:
prediction = model(X_val)
prediction.argmax(dim=1)

tensor([1, 0], device='cuda:0')

Therefore, the model predicted correctly that the first review was positive and the second negative!