# SentiPy Tutorial

### *An Application for Twitter Sentiment Analytics*

**SentiPy** provides models to analyze users' sentiment from tweets. The models are based on **Word Embeddings** and **Convolutional Neural Networks** (CNNs).

In [33]:
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.style.use('seaborn-darkgrid')

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Model Implementation

## 1.1. Preprocessing the data

The dataset used for this model is taken from the [Sentiment140](http://www.sentiment140.com/) dataset, which is composed of 1.6 million tweets. The dataset is available from [Stanford University](http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip). More help can be found at [this link](http://help.sentiment140.com/for-students/).

The custom torch dataset *Sentiment140* extracts tweets and labels from the CSV dataset, and downloads it if you don't already have it. Because the *Sentiment140* class inherits from the Dataset class, you have access to `splits`, `iters` and the other default methods.

In [34]:
import torch
from torchtext import data

from sentipy.datasets import Sentiment140
from sentipy.tokenizer import tokenizer_tweets

In [35]:
import random

import numpy as np

# Seed every random number generator the pipeline can draw from so that a
# "Restart & Run All" reproduces the same split, shuffling and initial weights.
SEED = 2020
# Python's own RNG must be seeded too: torchtext's legacy RandomShuffler (used by
# Dataset.split and by the iterators) takes its default state from `random`.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2303cdc8490>

In [36]:
# Field definitions: tweets go through the project tokenizer and are batched
# batch-first; each raw label is integer-halved and the field uses torch.float.
TEXT = data.Field(batch_first=True, tokenize=tokenizer_tweets)
LABEL = data.LabelField(dtype=torch.float, preprocessing=lambda raw_label: raw_label // 2)

# Load 50000 shuffled, non-neutral tweets as train/test, then carve a validation
# set out of the training part (split ratio 0.8 - presumably 80% train; confirm).
train_data, test_data = Sentiment140.splits(
    TEXT,
    LABEL,
    keepneutral=False,
    size=50000,
    shuffle=True,
)
train_data, valid_data = train_data.split(0.8)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Show the labels of the first ten training examples, then every attribute
# of the very first example.
print("Labels")
first_ten_labels = [vars(train_data[position])["label"] for position in range(10)]
print(first_ten_labels)

first_example_fields = vars(train_data[0])
print(first_example_fields)

Now, let's take a look at the loaded dataset.

In [None]:
# Report how many examples ended up in the training and the test split.
train_size = len(train_data)
print(f'Number of training examples: {train_size}')
test_size = len(test_data)
print(f'Number of testing examples: {test_size}')

In [7]:
# Training + validation together should add up to the requested dataset size.
combined_train_size = len(train_data) + len(valid_data)
print("Sanity Check\nTraining dataset length: {}".format(combined_train_size))

# Dump all attributes of the first training example.
first_example = vars(train_data[0])
print("\nExample n°0: {}".format(first_example))

Sanity Check
Training dataset length: 50000

Example n°0: {'label': 2, 'id': 1759774113, 'date': 'Sun May 10 19:56:48 PDT 2009', 'query': 'NO_QUERY', 'user': 'melux', 'text': ['playing', 'medal', 'of', 'honor', 'with', 'my', 'brother']}


## 1.2. Processing the data

In [8]:
VOCAB_SIZE = 20000

# Build the text vocabulary (at most VOCAB_SIZE tokens, taken from the training
# data only) with pretrained GloVe vectors attached; unk_init is handed
# torch.Tensor.normal_ for tokens that have no pretrained vector.
# Alternative embedding: vectors = "glove.twitter.27B.100d"
text_vocab_options = dict(
    max_size=VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
TEXT.build_vocab(train_data, **text_vocab_options)

# The label vocabulary is built from the same training data.
LABEL.build_vocab(train_data)

In [9]:
# Show the label -> index mapping produced by LABEL.build_vocab.
label_to_index = LABEL.vocab.stoi
print(label_to_index)

defaultdict(None, {0: 0, 2: 1})


In [10]:
# The 20 most frequent tokens of the training data, with their counts.
most_frequent_tokens = TEXT.vocab.freqs.most_common(20)
print(most_frequent_tokens)

[('!', 21435), ('.', 20468), ('<user>', 19848), ('i', 18989), ('to', 14024), ('the', 13180), (',', 11682), ('a', 9600), ('my', 7758), ('and', 7597), ('...', 6991), ('you', 6784), ('?', 5904), ('is', 5854), ('it', 5826), ('in', 5418), ('for', 5328), ('of', 4635), ('on', 4174), ('me', 4099)]


In [11]:
# First ten entries of the index -> token table.
leading_vocab_tokens = TEXT.vocab.itos[:10]
print(leading_vocab_tokens)

['<unk>', '<pad>', '!', '.', '<user>', 'i', 'to', 'the', ',', 'a']


In [12]:
# Embedding matrix attached to the text vocabulary (one row per vocabulary entry).
glove_vectors = TEXT.vocab.vectors
vocabulary_rows = len(glove_vectors)
print(glove_vectors)
print(f'There are {vocabulary_rows} words in the vocabulary\n')

# Look up the row of the "<user>" token and show its first ten components.
idx = TEXT.vocab.itos.index("<user>")
user_vector = glove_vectors[idx]
print(f"vector: {user_vector[:10]}, size: {len(user_vector)}")

tensor([[-0.0087, -0.3213, -1.2899,  ...,  0.9333, -0.6701,  0.2908],
        [ 1.1236,  0.8868,  1.1304,  ..., -0.9977,  0.0336, -0.3832],
        [ 0.3847,  0.4935,  0.4910,  ...,  0.0263,  0.3905,  0.5222],
        ...,
        [-0.2991, -0.3692,  0.8557,  ...,  0.2474, -0.6483,  0.5548],
        [ 0.9752, -1.0016, -1.3873,  ...,  0.6071, -0.0471,  0.3319],
        [ 0.3994,  0.9867,  0.4112,  ...,  0.6024, -0.5983,  0.2528]])
There are 20002 words in the vocabulary

vector: tensor([-1.2655, -0.7780,  0.6882, -0.6532,  0.4089, -0.5350, -1.7817, -0.2428,
         0.5133,  1.1793]), size: 100


In [13]:
BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# All three iterators share the same settings, so the options are spelled once.
iterator_options = dict(shuffle=True, batch_size=BATCH_SIZE, device=device)
train_iterator = data.BucketIterator(train_data, **iterator_options)
valid_iterator = data.BucketIterator(valid_data, **iterator_options)
test_iterator = data.BucketIterator(test_data, **iterator_options)

In [14]:
# Raw labels of the first ten validation examples ...
first_valid_labels = [vars(valid_data[position])["label"] for position in range(10)]
print(first_valid_labels)

# ... compared with the label tensor of the first batch yielded by the training
# iterator and then by the test iterator.
for iterator in (train_iterator, test_iterator):
    for idx, batch in enumerate(iterator):
        print(batch.label)
        break

[0, 2, 2, 0, 0, 0, 2, 2, 0, 2]
tensor([1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0.,
        0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0.,
        1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1.,
        1., 0., 1., 1., 1., 0., 0., 1., 1., 0.])
tensor([0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0.,
        0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0.,
        1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
        0., 0., 1., 1., 1., 0., 1., 1., 1., 1.])


## 1.3. Creating the model

In [15]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sentipy.model import CNN

# Hyperparameters

# -- architecture ------------------------------------------------------------
EMBEDDING_DIM = 100              # matches the 100-d GloVe vectors used for the vocabulary
N_FITLERS = 100                  # (sic) name kept as-is: the model cell refers to it
FILTER_SIZES = [2, 3, 4, 5, 8]
OUTPUT_DIM = 2
DROPOUT = 0.5
ACTIVATION_LAYER = F.relu
ACTIVATION_OUTPUT = F.sigmoid

# -- optimisation ------------------------------------------------------------
LR = 1e-1
WEIGHT_DECAY = 1e-3
EPOCHS = 50                      # NOTE(review): the training cell passes epochs = 10 explicitly

In [16]:
# Model: the input dimension is the vocabulary size, and the padding index is
# looked up in the text vocabulary.
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = CNN(
    INPUT_DIM,
    EMBEDDING_DIM,
    N_FITLERS,
    FILTER_SIZES,
    OUTPUT_DIM,
    DROPOUT,
    pad_idx=PAD_IDX,
    activation_layer=ACTIVATION_LAYER,
    activation_output=ACTIVATION_OUTPUT,
)
# NOTE(review): nothing in this cell copies TEXT.vocab.vectors into the model -
# confirm whether CNN / Performer loads the pretrained embeddings.

# Optimization: Adadelta over all model parameters with a cross-entropy loss.
# NOTE(review): nn.CrossEntropyLoss applies log-softmax itself while the output
# activation configured above is a sigmoid - confirm this combination is intended.
optimizer = optim.Adadelta(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()

# Move both the network and the loss to the selected device.
model, criterion = model.to(device), criterion.to(device)

## 1.4. Training the model

In [17]:
from sentipy.optim.performer import Performer

# Wrap the model, loss and optimizer in the project's Performer helper and train
# on the training iterator while reporting statistics on the validation iterator.
performer = Performer(model, criterion, optimizer)
# NOTE(review): 10 epochs are requested here although the hyperparameter cell
# defines EPOCHS = 50 - confirm which value is intended.
performer.run(train_iterator, valid_iterator, epochs = 10)

Epoch     :   1/10




Stats Training     | Loss: 0.668 | Acc: 59.40% | Prec.: 59.40% | Rec.: 59.41% | F1: 59.40%
Stats Validation   | Loss: 0.638 | Acc: 64.84% | Prec.: 64.79% | Rec.: 64.79% | F1: 64.79%

Epoch     :   2/10
Stats Training     | Loss: 0.636 | Acc: 64.63% | Prec.: 64.63% | Rec.: 64.64% | F1: 64.63%
Stats Validation   | Loss: 0.620 | Acc: 67.12% | Prec.: 67.07% | Rec.: 67.22% | F1: 67.07%

Epoch     :   3/10
Stats Training     | Loss: 0.619 | Acc: 66.89% | Prec.: 66.89% | Rec.: 66.90% | F1: 66.89%
Stats Validation   | Loss: 0.610 | Acc: 68.06% | Prec.: 67.99% | Rec.: 68.60% | F1: 67.99%

Epoch     :   4/10
Stats Training     | Loss: 0.605 | Acc: 68.77% | Prec.: 68.77% | Rec.: 68.78% | F1: 68.77%
Stats Validation   | Loss: 0.600 | Acc: 69.47% | Prec.: 69.58% | Rec.: 69.60% | F1: 69.58%

Epoch     :   5/10
Stats Training     | Loss: 0.596 | Acc: 69.95% | Prec.: 69.95% | Rec.: 69.95% | F1: 69.95%
Stats Validation   | Loss: 0.593 | Acc: 70.48% | Prec.: 70.48% | Rec.: 70.50% | F1: 70.48%

Epoch    

## 1.5. Testing the model

In [18]:
# Evaluate the trained model on the test iterator; the value returned by test()
# is displayed as the cell output.
performer.test(test_iterator)



0.6783168812592825

In [23]:
# Display the metrics stored on the performer (presumably by the latest test() call).
# NOTE(review): execution counts are out of order in this notebook, so these
# numbers may not belong to the test() cell directly above.
performer.results_test

{'loss': 0.5598957190910975,
 'accuracy': 0.7533660233020782,
 'precision': 0.7493150684931507,
 'recall': 0.7527777777777778,
 'macro_f1': 0.750273556231003,
 'confusion_matrix': [[51, 17], [22, 68]]}

In [31]:
# Evaluate on the test iterator again, this time passing thresholds=(.8, .8) and
# addneutral=True (their exact semantics live in Performer.test, not shown here -
# presumably low-confidence predictions are treated as neutral; confirm).
performer.test(test_iterator, thresholds=(.8, .8), addneutral=True)



0.5519537081321081

In [32]:
# Display the metrics stored on the performer after the thresholded evaluation
# (the 'accuracy' entry matches the value returned by that test() call).
performer.results_test

{'loss': 0.7613080143928528,
 'accuracy': 0.5519537081321081,
 'precision': 0.5580808080808081,
 'recall': 0.5580528846153846,
 'macro_f1': 0.5580332992727054,
 'confusion_matrix': [[35, 29], [28, 37]]}

In [20]:
# Run the same thresholded evaluation on the training iterator, for comparison
# with the score obtained on the test iterator.
performer.test(train_iterator, thresholds=(.8, .8), addneutral=True)



0.8859244243621827

In [21]:
# Display the metrics stored on the performer after evaluating on the training
# iterator (the 'accuracy' entry matches the value returned by that test() call).
performer.results_test

{'loss': 0.42733731942176817,
 'accuracy': 0.8859244243621827,
 'precision': 0.885114377426824,
 'recall': 0.8859048600187107,
 'macro_f1': 0.8853957452063368,
 'confusion_matrix': [[10125, 1367], [1121, 9141]]}

In [24]:
# Same loading steps as the first data cell, but with a 10000-tweet sample.
# Fields: project tokenizer with batch-first batching; raw labels are
# integer-halved and the label field uses torch.float.
TEXT = data.Field(batch_first=True, tokenize=tokenizer_tweets)
LABEL = data.LabelField(dtype=torch.float, preprocessing=lambda raw_label: raw_label // 2)

# Load shuffled, non-neutral tweets as train/test, then split a validation set
# off the training part (split ratio 0.8).
train_data, test_data = Sentiment140.splits(
    TEXT,
    LABEL,
    keepneutral=False,
    size=10000,
    shuffle=True,
)
train_data, valid_data = train_data.split(0.8)

In [25]:
VOCAB_SIZE = 20000

# Rebuild the vocabularies on the freshly loaded training data, again with the
# pretrained GloVe vectors attached.
# NOTE(review): a rebuilt vocabulary can map tokens to different indices than the
# one the already-trained `model` saw - confirm that evaluating the old model
# on data encoded with this vocabulary is intended.
# Alternative embedding: vectors = "glove.twitter.27B.100d"
text_vocab_options = dict(
    max_size=VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
TEXT.build_vocab(train_data, **text_vocab_options)
LABEL.build_vocab(train_data)

In [26]:
BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recreate the three iterators over the reloaded splits with shared settings.
iterator_options = dict(shuffle=True, batch_size=BATCH_SIZE, device=device)
train_iterator = data.BucketIterator(train_data, **iterator_options)
valid_iterator = data.BucketIterator(valid_data, **iterator_options)
test_iterator = data.BucketIterator(test_data, **iterator_options)

In [30]:
# Thresholded evaluation on the test iterator built from the reloaded
# 10000-tweet sample; the returned value is displayed as the cell output.
performer.test(test_iterator, thresholds=(.8, .8), addneutral=True)



0.5539246648550034