Tweet sentiment extraction using PyTorch from Kaggle: https://www.kaggle.com/competitions/tweet-sentiment-extraction

In [1]:
import os
import re
import shutil
import string

from collections import Counter

import pandas as pd
import numpy as np

import sklearn

from sklearn.model_selection import train_test_split

Functions to remove emoji and url and clean text

In [2]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F" #emoticons
                               u"\U0001F300-\U0001F5FF" #symbols and pictographs
                               u"\U0001F680-\U0001F6FF" #transport and map symbols
                               u"\U0001F1E0-\U0001F1FF" #flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_url(text):
    url_pattern = re.compile('https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub(r'', text) #converting return value from list to string


def clean_text(text):
    delete_dict = {sp_character: '' for sp_character in string.punctuation}
    delete_dict[' '] = ' '
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    
    textArr = text1.split()
    text2 = ' '.join([w for w in textArr if (not w.isdigit() and (not w.isdigit() and len(w) > 2))])
    
    return text2.lower()

In [3]:
def get_sentiment(sentiment):
    if sentiment == 'positive':
        return 2
    elif sentiment == 'negative':
        return 1
    else:
        return 0

In [4]:
train_data = pd.read_csv('datasets/train.csv')
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
train_data.dropna(axis=0, inplace=True)

In [6]:
train_data['num_words'] = train_data['text'].apply(lambda x: len(str(x).split()))
mask = train_data['num_words'] > 2
train_data = train_data[mask]

train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment,num_words
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,7
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,10
2,088c60f138,my boss is bullying me...,bullying me,negative,5
3,9642c003ef,what interview! leave me alone,leave me alone,negative,5
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,14


In [7]:
max_train_sentence_len = train_data['num_words'].max()

In [8]:
train_data['text'] = train_data['text'].apply(remove_emoji)
train_data['text'] = train_data['text'].apply(remove_url)
train_data['text'] = train_data['text'].apply(clean_text)

train_data['label'] = train_data['sentiment'].apply(get_sentiment)

In [9]:
test_data = pd.read_csv('datasets/test.csv')
test_data.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [10]:
test_data.dropna(axis=0, inplace=True)

In [11]:
test_data['num_words'] = test_data['text'].apply(lambda x: len(str(x).split()))
mask = test_data['num_words'] > 2
test_data = test_data[mask]

max_test_sentence_len = test_data['num_words'].max()

In [12]:
test_data['text'] = test_data['text'].apply(remove_emoji)
test_data['text'] = test_data['text'].apply(remove_url)
test_data['text'] = test_data['text'].apply(clean_text)

test_data['label'] = test_data['sentiment'].apply(get_sentiment)

In [13]:
train_data.head(10)

Unnamed: 0,textID,text,selected_text,sentiment,num_words,label
0,cb774db0d1,have responded were going,"I`d have responded, if I were going",neutral,7,0
1,549e992a42,sooo sad will miss you here san diego,Sooo SAD,negative,10,1
2,088c60f138,boss bullying,bullying me,negative,5,1
3,9642c003ef,what interview leave alone,leave me alone,negative,5,1
4,358bd9e861,sons why couldnt they put them the releases al...,"Sons of ****,",negative,14,1
5,28b57f3990,some shameless plugging for the best rangers f...,http://www.dothebouncy.com/smf - some shameles...,neutral,12,0
6,6e0c6d75b1,2am feedings for the baby are fun when all smi...,fun,positive,14,2
8,e050245fbd,both you,Both of you,neutral,3,0
9,fc2cbefa9d,journey wow just became cooler hehe that possible,Wow... u just became cooler.,positive,10,2
10,2339a9b08b,much love hopeful reckon the chances are minim...,"as much as i love to be hopeful, i reckon the ...",neutral,23,0


In [14]:
test_data.head(10)

Unnamed: 0,textID,text,sentiment,num_words,label
0,f87dea47db,last session the day,neutral,6,0
1,96d74cb729,shanghai also really exciting precisely skyscr...,positive,15,2
2,eee518ae67,recession hit veronique branquinho she has qui...,negative,13,1
4,33987a8ee5,like,positive,5,2
5,726e501993,thats great weee visitors,positive,4,2
6,261932614e,think everyone hates here lol,negative,8,1
7,afa11da83f,soooooo wish could but school and myspace comp...,negative,13,1
8,e64208b4ef,and within short time the last clue all them,neutral,12,0
9,37bcad24ca,what did you get day alright havent done anyth...,neutral,18,0
10,24c92644a4,bike was put holdshould have known that argh t...,negative,12,1


In [15]:
x_train, x_val, y_train, y_val = train_test_split(train_data['text'].tolist(),\
                                                       train_data['label'].tolist(),\
                                                       test_size=0.2,\
                                                       stratify=train_data['label'].tolist(),\
                                                       random_state=0)

print('Class distributions:')
print(Counter(y_train))
print(Counter(y_val))
print(Counter(test_data['label'].tolist()))

Class distributions:
Counter({0: 8563, 2: 6700, 1: 6138})
Counter({0: 2141, 2: 1675, 1: 1535})
Counter({0: 1376, 2: 1075, 1: 983})


In [16]:
#Creating iterators --- PyTorch tutorial

train = list(zip(y_train, x_train))
val = list(zip(y_val, x_val))
test = list(zip(test_data['label'].tolist(), test_data['text'].tolist()))

Using PyTorch for training

In [17]:
import torch
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Creating Vocabulary on training data

In [18]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokeniser = get_tokenizer('basic_english')
train_iter = train

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokeniser(text)
        
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [19]:
text_pipeline = lambda x: vocab(tokeniser(x))
label_pipeline = lambda x: int(x)

In [20]:
text_pipeline('here is an example')

[62, 0, 0, 12881]

In [21]:
label_pipeline('0')

0

In [22]:
#Create batches in Dataloader

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for(_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list =torch.cat(text_list)
    return label_list, text_list, offsets

Creating text Classification Model

In [23]:
from torch import nn
import torch.nn.functional as F

class TextClassificationModel(nn.Module):
    
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc1 = nn.Linear(embed_dim, 64)
        self.fc2 = nn.Linear(64, 16)
        self.fc3 = nn.Linear(16, num_class)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()
        self.fc3.weight.data.uniform_(-initrange, initrange)
        self.fc3.bias.data.zero_()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        x = F.relu(self.fc1(embedded))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [24]:
train_iter = train
num_class = len(set([label for (label, text) in train_iter]))
print(num_class)

vocab_size = len(vocab)
emsize = 128

3


In [25]:
model = TextClassificationModel(vocab_size, emsize, num_class)

In [26]:
import time

def training(dataloader):
    model.train()
    total_acc, total_count = 0.0, 0.0
    log_interval = 500
    start_time = time.time()
    
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimiser.zero_grad()
        pred = model(text, offsets)
        loss = criterion(pred, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimiser.step()
        total_acc += (pred.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                 '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader), total_acc/total_count))
            total_acc, total_acc = 0.0, 0.0
            start_time = time.time()
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0.0, 0.0
    
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            pred = model(text, offsets)
            loss = criterion(pred, label)
            total_acc += (pred.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [27]:
#from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

#Hyperparameters
EPOCHS = 10 #epochs
LR = 10 #learning rate
BATCH_SIZE = 16 #batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimiser = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimiser, 1.0, gamma=0.1)
total_acc = None

iter_train = train
iter_test = test
iter_val = val


train_dataloader = DataLoader(iter_train,
                             batch_size=BATCH_SIZE,
                             shuffle=True,
                             collate_fn=collate_batch)

val_dataloader = DataLoader(iter_val,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            collate_fn=collate_batch)

test_dataloader = DataLoader(iter_test,
                             batch_size=BATCH_SIZE,
                             shuffle=True,
                             collate_fn=collate_batch)

for epoch in range (1, EPOCHS + 1):
    epoch_start_time = time.time()
    training(train_dataloader)
    acc_val = evaluate(val_dataloader)
    if total_acc is not None and total_acc > acc_val:
        scheduler.step()
    else:
        total_acc = acc_val
    
    print('-' * 65)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
         'val accuracy {:8.3f}'.format(epoch, time.time() - epoch_start_time, acc_val))
    print('-' * 65)

| epoch   1 |   500/ 1338 batches | accuracy    0.461
| epoch   1 |  1000/ 1338 batches | accuracy    0.264
-----------------------------------------------------------------
| end of epoch   1 | time:  2.80s | val accuracy    0.469
-----------------------------------------------------------------
| epoch   2 |   500/ 1338 batches | accuracy    0.506
| epoch   2 |  1000/ 1338 batches | accuracy    0.255
-----------------------------------------------------------------
| end of epoch   2 | time:  2.77s | val accuracy    0.516
-----------------------------------------------------------------
| epoch   3 |   500/ 1338 batches | accuracy    0.525
| epoch   3 |  1000/ 1338 batches | accuracy    0.256
-----------------------------------------------------------------
| end of epoch   3 | time:  1.87s | val accuracy    0.519
-----------------------------------------------------------------
| epoch   4 |   500/ 1338 batches | accuracy    0.541
| epoch   4 |  1000/ 1338 batches | accuracy    0.26

Checking the results on test data

In [28]:
test_acc = evaluate(test_dataloader)
print('Test accuracy {:8.3f}'.format(test_acc))

Test accuracy    0.608


In [29]:
sentiment_label = {2:"Positive",
                   1: "Negative",
                   0: "Neutral"
                  }

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() 
ex_text_str = "soooooo wish i could, but im in school and myspace is completely blocked"


print("This is a %s tweet" %sentiment_label[predict(ex_text_str, text_pipeline)])

This is a Neutral tweet
