In [1]:
!pip install torchtext
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as pt
import plotly.express as px
import nltk
import torchtext
import datasets 
import sys
from pathlib import Path
import os
sys.path.append(Path(os.path.abspath(".")).parent.parent.absolute())

print("Setup complete.")

Setup complete.


In [3]:
df = pd.read_csv("../data/airline_sentiment_analysis.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [5]:
df = df.drop(labels=["Unnamed: 0"], axis=1)

In [6]:
df.head()

Unnamed: 0,airline_sentiment,text
0,positive,@VirginAmerica plus you've added commercials t...
1,negative,@VirginAmerica it's really aggressive to blast...
2,negative,@VirginAmerica and it's a really big bad thing...
3,negative,@VirginAmerica seriously would pay $30 a fligh...
4,positive,"@VirginAmerica yes, nearly every time I fly VX..."


## Trying to understand the nature of the dataset

This can mean various points - whether the dataset vocabulary is large, what is the word frequency distribution and so on.

### Getting the vocabulary size of the dataset

In [40]:
df.count  # Number of sentences

<bound method DataFrame.count of       airline_sentiment                                               text
0              positive  @VirginAmerica plus you've added commercials t...
1              negative  @VirginAmerica it's really aggressive to blast...
2              negative  @VirginAmerica and it's a really big bad thing...
3              negative  @VirginAmerica seriously would pay $30 a fligh...
4              positive  @VirginAmerica yes, nearly every time I fly VX...
...                 ...                                                ...
11536          negative  @AmericanAir my flight was Cancelled Flightled...
11537          negative         @AmericanAir right on cue with the delays👌
11538          positive  @AmericanAir thank you we got on a different f...
11539          negative  @AmericanAir leaving over 20 minutes Late Flig...
11540          negative  @AmericanAir you have my money, you change my ...

[11541 rows x 2 columns]>

In [41]:
sum([len(sentence.split()) for sentence in df["text"]])/df.count()  # Average number of words per sentence

airline_sentiment    18.524651
text                 18.524651
dtype: float64

In [9]:
from nltk.tokenize import word_tokenize

words = [word.lower() for sentence in df["text"] for word in sentence.split(" ")]
unique_words = list(set(words))  # Number of unique words in the dataset
print(len(unique_words), len(words))

22602 215542


As we can see, there are 22602 unique words in 23082 sentences, with each sentence containing around 9 words in average. This indicates a huge variety in the vocabulary.
Thus we need to concentrate on models that works well with low probabilities and small datasets.

Thus, we will use the Skipgram Model to to generate word embeddings for the vocabulary.

In [10]:
word_freq_dict = {word: 0 for word in unique_words}
word_freq_dict_df = {"word": [], "freq": []}

for word in words:
    word_freq_dict[word] += 1

for word, freq in word_freq_dict.items():
    word_freq_dict_df["word"].append(word)
    word_freq_dict_df["freq"].append(freq)

In [11]:
pd.DataFrame(word_freq_dict_df)

Unnamed: 0,word,freq
0,,1809
1,hey...,1
2,4439,1
3,num,1
4,flight🍸#sfo,1
...,...,...
22597,"captured,",1
22598,each.,1
22599,bucks,4
22600,495,2


In [12]:
word_tokenize("we're")

['we', "'re"]

In [13]:
from nltk import WordNetLemmatizer
wnl = WordNetLemmatizer()

list(map(wnl.lemmatize, word_tokenize("I can't... do this")))

['I', 'ca', "n't", '...', 'do', 'this']

In [14]:
from torch import nn
from typing import List, Dict, Any, Optional, Union, Tuple
import torch.functional as F
import torch

class Word2VecSkipgramModel(nn.Module):
    """This model learns word embeddings from the Twitter corpus using the Skipgram technique."""

    def __init__(
        self,
        vocabulary_size: int,
        embedding_size: int,
        *args: Optional[List[Any]],
        **kwargs: Optional[Dict[str, Any]]
    ):
        super(Word2VecSkipgramModel, self).__init__(*args, **kwargs)
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(self.vocabulary_size, self.embedding_size)
        self.neural_network = nn.Linear(self.embedding_size, self.vocabulary_size)

    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
        embedding = self.embedding(input_tensor)
        embedding = self.neural_network(embedding)
        return embedding


## Prepare the data

### Split the text dataframe into train, test and validation data.

In [44]:
train_percent = 60
test_percent = 20
val_percent = 20

train_size = int(len(df) * train_percent / 100) + 1
test_size = int(len(df) * test_percent / 100)
val_size = int(len(df) * val_percent / 100)

len(df) == (train_size + test_size + val_size)


True

In [69]:
df["tokenized_text"] = df["text"].apply(lambda x: x.split())

In [66]:
train_data = df["text"].iloc[:train_size]
test_data = df["text"].iloc[train_size: train_size + test_size]
val_data = df["text"].iloc[train_size + test_size: train_size + test_size + val_size]

In [70]:
train_data_tokenized = df["tokenized_text"].iloc[:train_size]
test_data_tokenized = df["tokenized_text"].iloc[train_size: train_size + test_size]
val_data_tokenized = df["tokenized_text"].iloc[train_size + test_size: train_size + test_size + val_size]

In [71]:
df["tokenized_text"].iloc[:]

0        [@VirginAmerica, plus, you've, added, commerci...
1        [@VirginAmerica, it's, really, aggressive, to,...
2        [@VirginAmerica, and, it's, a, really, big, ba...
3        [@VirginAmerica, seriously, would, pay, $30, a...
4        [@VirginAmerica, yes,, nearly, every, time, I,...
                               ...                        
11536    [@AmericanAir, my, flight, was, Cancelled, Fli...
11537    [@AmericanAir, right, on, cue, with, the, dela...
11538    [@AmericanAir, thank, you, we, got, on, a, dif...
11539    [@AmericanAir, leaving, over, 20, minutes, Lat...
11540    [@AmericanAir, you, have, my, money,, you, cha...
Name: tokenized_text, Length: 11541, dtype: object

In [72]:
train_data_tokenized.to_csv("../data/train_data.csv", index=False)
test_data_tokenized.to_csv("../data/test_data.csv", index=False)
val_data_tokenized.to_csv("../data/val_data.csv", index=False)

### Converting our data to HuggingFace Dataset format

In [73]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files={"train": "../data/train_data.csv", "test": "../data/test_data.csv", "val": "../data/val_data.csv"})

Using custom data configuration default-b5bc1ecd6d7ee657


Downloading and preparing dataset csv/default to /home/anuran/.cache/huggingface/datasets/csv/default-b5bc1ecd6d7ee657/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

   

Extracting data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/anuran/.cache/huggingface/datasets/csv/default-b5bc1ecd6d7ee657/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [74]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokenized_text'],
        num_rows: 6925
    })
    test: Dataset({
        features: ['tokenized_text'],
        num_rows: 2308
    })
    val: Dataset({
        features: ['tokenized_text'],
        num_rows: 2308
    })
})

In [75]:
idx2wd = {idx: word for idx, word in enumerate(word_freq_dict.keys())}
wd2idx = {word: idx for idx, word in enumerate(word_freq_dict.keys())}

In [76]:
idx2wd

{0: '',
 1: 'hey...',
 2: '4439',
 3: 'num',
 4: 'flight🍸#sfo',
 5: 'liars!',
 6: 'felt,',
 7: 'upgd',
 8: 'soon?',
 9: 'play.',
 10: 'promise,',
 11: 'glitches',
 12: '+1',
 13: 'http://t.co/3zpjr7kwbk',
 14: 'pos',
 15: 'that??',
 16: 'offensive.',
 17: 'business,',
 18: 'sad.',
 19: '@ba_usa',
 20: '@phd_mama_',
 21: '3hours',
 22: '2470',
 23: 'cheers.',
 24: 'frustrating!!',
 25: 'news,',
 26: 'cared',
 27: '#coffeeneeded',
 28: '@nickcunningham1',
 29: '#isitthegarykellyway?',
 30: 'landing,',
 31: ';-)',
 32: 'subscribe',
 33: 'samartzis!',
 34: '#fargo',
 35: 'http://t.co/2boh2mh3cb',
 36: 'leather',
 37: 'yogurt',
 38: 'flightation',
 39: 'yest.',
 40: 'warm?',
 41: 'fine,',
 42: 'another???',
 43: 'completely',
 44: 'roc',
 45: 'switch,',
 46: '#americanair',
 47: 'direct..bag',
 48: '"a',
 49: 'serious.',
 50: 'understaffing',
 51: 'allow.',
 52: '504',
 53: 'xoxo',
 54: 'suit',
 55: '👌👌👌',
 56: 'before;',
 57: 'you!',
 58: 'yvr',
 59: '👎😬\ncustomer',
 60: 'message:',
 61: '

## Instantiate a Word2Vec Model

In [51]:
model = Word2VecSkipgramModel(vocabulary_size=len(train_data), embedding_size=300)