<a href="https://colab.research.google.com/github/axel-sirota/implement-nlp-word-embedding/blob/main/module3/Module3_Demo4_UsingGlove_For_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysing Sentiment

Let's first import everything and load the dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob, Word
import nltk
import torch
from torch import nn
import seaborn as sns
nltk.download('punkt')

%matplotlib inline
sns.set(rc={'figure.figsize':(20,20)})
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
fi
if [ ! -f glove.6B.100d.txt ]; then
  wget -O glove.6B.100d.txt https://www.dropbox.com/s/dl1vswq2sz5f1ws/glove.6B.100d.txt?dl=0
fi

Writing get_data.sh


In [3]:
!bash get_data.sh


--2022-05-28 18:11:33--  https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8091185 (7.7M) [text/plain]
Saving to: ‘yelp.csv’


2022-05-28 18:11:34 (191 MB/s) - ‘yelp.csv’ saved [8091185/8091185]

--2022-05-28 18:11:34--  https://www.dropbox.com/s/dl1vswq2sz5f1ws/glove.6B.100d.txt?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6021:18::a27d:4112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/dl1vswq2sz5f1ws/glove.6B.100d.txt [following]
--2022-05-28 18:11:34--  https://www.dropbox.com/s/raw/dl1vswq2sz5f1ws/glove.6B.100d.t

In [4]:
CORPUS_SIZE = 10000
EMBEDDING_DIM = 100
path = 'yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)].reset_index(drop=True)[:CORPUS_SIZE]
yelp_best_worst


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
3,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0
4,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7,4
...,...,...,...,...,...,...,...,...,...,...
4081,R8VwdLyvsp9iybNqRvm94g,2011-10-03,pcEeHdAJPoFNF23es0kKWg,5,Yes I do rock the hipster joints. I dig this ...,review,b92Y3tyWTQQZ5FLifex62Q,1,1,1
4082,WJ5mq4EiWYAA4Vif0xDfdg,2011-12-05,EuHX-39FR7tyyG1ElvN1Jw,5,Only 4 stars? \n\n(A few notes: The folks that...,review,hTau-iNZFwoNsPCaiIUTEA,1,1,0
4083,f96lWMIAUhYIYy9gOktivQ,2009-03-10,YF17z7HWlMj6aezZc-pVEw,5,I'm not normally one to jump at reviewing a ch...,review,W_QXYA7A0IhMrvbckz7eVg,2,3,2
4084,L3BSpFvxcNf3T_teitgt6A,2012-03-19,0nxb1gIGFgk3WbC5zwhKZg,5,Let's see...what is there NOT to like about Su...,review,OzOZv-Knlw3oz9K5Kh5S6A,1,2,1


## Doing the train_test split and defining model

In [5]:
# convert glove to word2vec format
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.6B.100d.txt", word2vec_output_file="emb_word2vec_format.txt")

(400001, 100)

In [6]:
import gensim
import torch
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('emb_word2vec_format.txt')

In [7]:
weights = torch.FloatTensor(word2vec_model.vectors).to(device)
tokenizer = lambda x: TextBlob(x).words

In [8]:
def get_maximum_review_length(df):
  maximum = 0
  for ix, row in df.iterrows():
    candidate = len(tokenizer(row.text))
    if candidate > maximum:
      maximum = candidate
  return maximum

In [9]:
maximum = get_maximum_review_length(yelp_best_worst)

In [10]:
weights.shape

torch.Size([400001, 100])

In [11]:
vocab_size = len(word2vec_model.index2word)

In [12]:
X_pre_split = torch.zeros(len(yelp_best_worst), maximum).type(torch.LongTensor).to(device)
for index, row in yelp_best_worst.iterrows():
  ix = 0
  for word in tokenizer(row.text):
    if word not in word2vec_model:
      representation = 0
    else:
      representation = word2vec_model.index2word.index(word)
    X_pre_split[index, ix] = representation
    ix += 1

In [13]:
X_pre_split[5]

tensor([    0,   970,    26,  3006,  9870,    17,  1174,    13,     0,  8172,
            0, 13921,  1410,    17,   181,     4,  4603,     0,    17,     0,
         7635, 13174, 40437,     5,     7, 20065,     0,   242,    13,   207,
           81,    86,    84,  1078,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [14]:
from sklearn.model_selection import train_test_split
X = X_pre_split
y = yelp_best_worst.stars.map({1:0, 5:1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [15]:
y_train = torch.Tensor(y_train.values).type(torch.LongTensor).to(device)
y_test = torch.Tensor(y_test.values).type(torch.LongTensor).to(device)

In [16]:
class SentimentPretrained(nn.Module):
    def __init__(self, embedding_dim):
        super(SentimentPretrained, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding.from_pretrained(weights)
        self.linear = nn.Linear(embedding_dim, 2)
        self.activation_function = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = torch.sum(self.embeddings(inputs), dim=1).squeeze()
        out = self.linear(embeds)
        out = self.activation_function(out)
        return out

In [17]:
model = SentimentPretrained(EMBEDDING_DIM).to(device)

In [18]:
def loss(y_pred, y):
  return nn.functional.nll_loss(y_pred, y)

def metric(y_pred, y):  # -> accuracy
  return (1 / len(y)) * ((y_pred.argmax(dim = 1) == y).sum())

optimizer = torch.optim.AdamW(model.parameters())

## Let's verify the metric makes sense

In [19]:
X_train[5]

tensor([    0,    14,    22,   442, 15968,  2227,  2905,     5,   120,   575,
            0,   296,   433,   285,   303,     4, 19761,  3261,     6,     0,
          621,    19,     7,     0, 14748,  1856,     0,   769,    58,  3995,
            0,  3904,     5,    17,   219,  1247,    20,     9,  7014,  8208,
           17, 15606,    34,    36,     0, 31410,   403,     5,   151,   439,
           39,    33,     7,   682,     0,    38,  1432,   131,    84,   357,
         2194,     4,   960,   550,     5,   578,   654,  4537,   983,   392,
            0,  1738,  7210,  1221,     0,     0,    58,   416,     0,    20,
           40,    29,  4546, 73806,     0, 54286,  2262,    25,     0,  6477,
           42,    15,     7, 49171,  4363,  2365,     0,  1015,   561,     0,
         2069,    15,  2215,  2012,     6,  1469,     5,  1321,     0,    15,
          372,   317,  4287,    61,     0, 11544, 22469,    12,    39,    35,
            0,  1469,     3, 59756,     5,     0,  1321,    35, 

In [20]:
weights.shape

torch.Size([400001, 100])

In [21]:
nn.Embedding.from_pretrained(weights)(X_train)

tensor([[[-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [ 0.0587,  0.4027,  0.3863,  ..., -0.3597,  0.4372,  0.1012],
         ...,
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706]],

        [[-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [ 0.6326, -0.1272, -0.0842,  ..., -0.3097,  0.2238,  0.0382],
         [-0.2709,  0.0440, -0.0203,  ..., -0.4923,  0.6369,  0.2364],
         ...,
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
         [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706]],

        [[-0.2332,  0.3689,  0.0283,  ..., -0.2388,  0.5863, -0.7346],
         [ 0.0420,  0.5666,  0.5549,  ...,  0

In [22]:
y_train_pred = model(X_train).to(device)
y_train_pred.argmax(dim = 1)

tensor([1, 1, 1,  ..., 1, 1, 1], device='cuda:0')

In [23]:
y_train

tensor([1, 1, 0,  ..., 1, 0, 1], device='cuda:0')

In [24]:
(y_train_pred.argmax(dim = 1) == y_train).sum()

tensor(2686, device='cuda:0')

In [25]:
metric(y_train_pred, y_train[:CORPUS_SIZE])

tensor(0.8219, device='cuda:0')

In [26]:
del y_train_pred

## The training routine

In [27]:
optimizer = torch.optim.AdamW(model.parameters())

In [28]:
epochs = 1000
for i in range(epochs):
  y_pred = model(X_train)
  xe = loss(y_pred, y_train)
  accuracy = metric(y_pred, y_train)
  xe.backward()
  if i % 100 == 0:
    print("Loss: ", xe, " Accuracy ", accuracy.data.item())
  optimizer.step()
  optimizer.zero_grad()

Loss:  tensor(27.1740, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.8219094276428223
Loss:  tensor(1.0527, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7698898315429688
Loss:  tensor(0.9909, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7851897478103638
Loss:  tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7922276854515076
Loss:  tensor(0.8647, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.7998776435852051
Loss:  tensor(0.8768, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.8240514397621155
Loss:  tensor(0.8625, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.825581431388855
Loss:  tensor(0.8360, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.8307833671569824
Loss:  tensor(0.7753, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.8341493606567383
Loss:  tensor(0.7348, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  0.8375152945518494


In [29]:
y_test_pred = model(X_test)
print(f'Model accuracy is {metric(y_test_pred, y_test)}')

Model accuracy is 0.7958435416221619


# Some manual validation

In [30]:
review = np.array(["This place was fantastic", "I had such a bad time"])
X_val = torch.zeros(len(review), maximum).type(torch.LongTensor).to(device)
for index, text in pd.Series(review).iteritems():
  ix = 0
  for word in tokenizer(text):
    if word not in word2vec_model:
      representation = 0
    else:
      representation = word2vec_model.index2word.index(word)
    X_val[index, ix] = representation
    ix += 1
X_val

tensor([[  0, 241,  15,  ...,   0,   0,   0],
        [  0,  40, 125,  ...,   0,   0,   0]], device='cuda:0')

In [31]:
prediction = model(X_val)
prediction.argmax(dim=1)

tensor([1, 1], device='cuda:0')

Therefore, the model predicted correctly that the first review was positive and the second negative!