In [1]:
import pandas as pd
import numpy as np
import math
import time
import os

import torch
import torch.nn as nn
import torchvision
from torchvision import datasets, transforms

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer

from torchtext import data, datasets
from torchtext.vocab import GloVe

import torch.nn.functional as F

import re

In [2]:
train=pd.read_csv("data/train.csv")
evaluation=pd.read_csv("data/evaluation.csv")

Sentiment classifier v1

In [4]:
def load_train_test_imdb_data(data_dir):
    """Loads the IMDB train/test datasets from a folder path.
    Input:
    data_dir: path to the "aclImdb" folder.
    
    Returns:
    train/test datasets as pandas dataframes.
    """

    data = {}
    for split in ["train", "test"]:
        data[split] = []
        for sentiment in ["neg", "pos"]:
            score = 1 if sentiment == "pos" else 0

            path = os.path.join(data_dir, split, sentiment)
            file_names = os.listdir(path)
            for f_name in file_names:
                with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    review = f.read()
                    data[split].append([review, score])

    np.random.shuffle(data["train"])        
    data["train"] = pd.DataFrame(data["train"],
                                 columns=['text', 'sentiment'])

    np.random.shuffle(data["test"])
    data["test"] = pd.DataFrame(data["test"],
                                columns=['text', 'sentiment'])

    return data["train"], data["test"]

In [5]:
train_data, test_data = load_train_test_imdb_data(data_dir="aclImdb/")

In [22]:

twitter_data = pd.read_csv("sentiment_data_set/training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", header=None)
twitter_data.columns = ["sentiment", "time", "date", "query", "username", "text"]
nb_tweets = len(twitter_data["text"])
print(f'{nb_tweets} tweets.')


1600000 tweets.


In [45]:
import random

indices = random.sample(range(nb_tweets), 1000000)

twitter_train_data = twitter_data.iloc[indices[:500000]]
nb_train_tweets = len(twitter_train_data["text"])
print(f'{nb_train_tweets} train tweets.')

twitter_test_data = twitter_data.iloc[indices[500000:]]
nb_test_tweets = len(twitter_test_data["text"])
print(f'{nb_test_tweets} test tweets.')


500000 train tweets.
500000 test tweets.


In [10]:
def remove_url(text):
    
    while "http" in text :
        start_url = 0
        while (start_url<len(text)-3 and text[start_url:start_url+4] != "http") :
            start_url += 1
        end_url = start_url + 4
        while (end_url<len(text) and text[end_url] != " ") :
            end_url += 1
            
        text=text[:start_url]+text[end_url+1:]
        
    return text
    

def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    #remove urls
    text = remove_url(text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

In [46]:
# Transform each text into a vector of word counts
vectorizer = CountVectorizer(stop_words="english",preprocessor=clean_text)

training_features = vectorizer.fit_transform(twitter_train_data["text"])    
test_features = vectorizer.transform(twitter_test_data["text"])

# Training
model = LinearSVC()
model.fit(training_features, twitter_train_data["sentiment"])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(twitter_test_data["sentiment"], y_pred)

print("Accuracy on the twitter dataset: {:.2f}".format(acc*100))

Accuracy on the twitter dataset: 75.72


In [47]:
covid_tweet_features = vectorizer.transform(train["text"])
covid_tweet_sentiment_pred = model.predict(covid_tweet_features)
train["sentiment"] = covid_tweet_sentiment_pred
train

Unnamed: 0,id,timestamp,retweet_count,user_verified,user_statuses_count,user_followers_count,user_friends_count,user_mentions,urls,hashtags,text,sentiment
0,0,1588696955143,0,False,68460,1101,1226,,,,Smh I give up,0
1,1,1588464948124,0,False,309,51,202,,,,"Most of us are Human Beings, but I think you m...",0
2,2,1588634673360,0,False,3241,1675,2325,,,,"Old dirty tricks Trump, at it again...like we ...",4
3,3,1588433158672,0,False,32327,667,304,,,,Seriously..... I worked 86 hours my last check...,0
4,4,1588582751599,0,False,581,42,127,,,,May ALMIGHTY ALLAH have mercy on us all. Only ...,4
...,...,...,...,...,...,...,...,...,...,...,...,...
665772,665772,1588412684317,0,False,65355,1984,1902,,,,18 months dawg? Come on man...,0
665773,665773,1588324521711,1,False,1807,2029,347,StanfordEMED,twitter.com/i/web/status/1…,COVID19,Thank you to all of the nurses in our @Stanfor...,4
665774,665774,1588353174952,8,False,888,85,257,,twitter.com/i/web/status/1…,,"'Post it' pearls for Palliative, End of Life a...",4
665775,665775,1588691378352,0,False,452,38,91,,,,His facial expressions are kind of looking for...,4


Sentiment classifier v2 (didn't use it on our covid data for now)

In [7]:
TEXT = data.Field(lower=True, stop_words=["the", "a", "to", "in", "at", "that"]) #we can remove some stopwords
LABEL = data.Field(sequential=False, unk_token=None) #labels are textual so we need to process them too
train1, val, test1 = datasets.SST.splits(TEXT, LABEL)

In [10]:
TEXT.build_vocab(train1, max_size=10000,min_freq=5, vectors=GloVe(name='6B', dim=100)) #convert words to indices, attach vectors to indices
LABEL.build_vocab(train1) #need to convert the textual labels to integers

.vector_cache/glove.6B.zip: 862MB [07:42, 1.86MB/s]                               
100%|█████████▉| 399999/400000 [00:24<00:00, 16285.04it/s]


In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #if you have a GPU with CUDA installed, this may speed up computation

In [22]:
train_iter, test_iter = data.BucketIterator.splits((train1, test1), batch_size=32, device=device)

print(train1[0].text, train1[0].label)

#print vocabulary information:
print('len(TEXT.vocab)', len(TEXT.vocab))
print('Most frequent terms', TEXT.vocab.freqs.most_common(10))
print('Conversion of labels to numeric:', LABEL.vocab.stoi)

['rock', 'is', 'destined', 'be', '21st', 'century', "'s", 'new', '``', 'conan', "''", 'and', 'he', "'s", 'going', 'make', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'] positive
len(TEXT.vocab) 3422
Most frequent terms [('.', 8024), (',', 7131), ('and', 4473), ('of', 4396), ('is', 2561), ("'s", 2544), ('it', 2422), ('as', 1296), ('but', 1172), ('film', 1162)]
Conversion of labels to numeric: defaultdict(None, {'positive': 0, 'negative': 1, 'neutral': 2})


In [24]:
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        """
        vocab_size: (int) size of the vocabulary - required by embeddings
        embed_dim: (int) size of embeddings
        num_class: (int) number of classes
        """
        super().__init__()
        #enter here your code
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim,embed_dim,num_layers=2, bidirectional=True)
        self.fc = nn.Linear(embed_dim*2, num_class)
        self.softmax = nn.LogSoftmax(dim=-1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, text):
        r"""
        Arguments:
            text: 1-D tensor representing a bag of text tensors
        """
        #ENTER HERE YOUR CODE
        text=self.embedding(text)
        output, (hidden, cell) = self.lstm(text)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        
        x=self.fc(hidden)
        
        return self.softmax(x)

In [31]:
model = SentimentClassifier(3422,100,3)
optimizer = torch.optim.RMSprop(model.parameters(),lr=1e-3)

model.train() #set train mode
for epoch in range(5):
    running_loss = 0.0
    running_acc = 0
    for batch in train_iter:
        text =batch.text
        target=batch.label 
        optimizer.zero_grad()
        output = model(text)
        loss = F.nll_loss(output,target)
        running_loss +=loss
        preds = output.data.max(dim=1,keepdim=True)[1]
        running_acc += preds.eq(target.data.view_as(preds)).cpu().sum()
        loss.backward()
        optimizer.step()
        loss = running_loss/len(train)

        accuracy = 100. * running_acc/len(train)
    print(f'training loss is {loss:{5}.{2}} and training accuracy is {running_acc}/{len(train)}{accuracy:{10}.{4}}')

training loss is 0.032 and training accuracy is 4089/8544     47.86
training loss is 0.028 and training accuracy is 5077/8544     59.42
training loss is 0.025 and training accuracy is 5602/8544     65.57
training loss is 0.021 and training accuracy is 6098/8544     71.37
training loss is 0.018 and training accuracy is 6514/8544     76.24
