<a href="https://colab.research.google.com/github/agbleze/pytorch_for_deep_learning/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**chap 5: Text classification**

In [3]:
!pip install torchtext==0.9.1
!pip install torch==1.8.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.9.1
  Downloading torchtext-0.9.1-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 4.9 MB/s 
Collecting torch==1.8.1
  Downloading torch-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (804.1 MB)
[K     |████████████████████████████████| 804.1 MB 2.5 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import torchtext
import spacy
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data



In [2]:
device = "cuda"

In [3]:
!ls

drive  sample_data


In [4]:
tweetsDF = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/training.1600000.processed.noemoticon.csv',
            engine="python", header=None, encoding="latin-1")

In [5]:
tweetsDF.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
tweetsDF[0].value_counts()

0    800000
4    800000
Name: 0, dtype: int64

In [7]:
tweetsDF["sentiment_cat"] = tweetsDF[0].astype('category')

In [8]:
tweetsDF.head(3)

Unnamed: 0,0,1,2,3,4,5,sentiment_cat
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0


In [9]:
tweetsDF["sentiment"] = tweetsDF["sentiment_cat"].cat.codes

In [10]:
tweetsDF.to_csv("train-processed.csv", header=None, index=None)

In [11]:
tweetsDF.sample(10000).to_csv("train-processed-sample.csv", header=None, index=None)

In [12]:
LABEL = data.LabelField()

In [13]:
LABEL

<torchtext.legacy.data.field.LabelField at 0x7f3ac4109c90>

In [14]:
TWEET = data.Field('spacy', tokenizer_language='en_core_web_sm', lower=True)

fields = [('score', None), ('id', None), ('date', None), ('query', None), 
          ('name', None), ('tweet', TWEET), ('category', None), ('label', LABEL)
        ]



In [16]:
tweetsDF.shape

(1600000, 8)

**create dataset and dataloaders**

In [17]:
twitterDataset = data.dataset.TabularDataset(
    path="train-processed-sample.csv",
    format="CSV",
    fields=fields,
    skip_header=False
)

In [18]:
(train, test, valid) = twitterDataset.split(split_ratio=[0.6, 0.2, 0.2],
                                            stratified=True, strata_field='label'
                                            )
(len(train), len(test), len(valid))

(6000, 2000, 2000)

In [19]:
vocab_size = 20000
TWEET.build_vocab(train, max_size = vocab_size)
LABEL.build_vocab(train)
TWEET.vocab.freqs.most_common(10)



[('i', 2810),
 ('to', 2153),
 ('the', 1934),
 ('a', 1447),
 ('my', 1210),
 ('and', 1083),
 ('you', 917),
 ('is', 905),
 ('for', 842),
 ('in', 781)]

In [22]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_sizes=32,
    #device=device,
    sort_key = lambda x: len(x.tweet),
    #sort_within_batch = False
)

TypeError: ignored

In [27]:
valid

<torchtext.legacy.data.dataset.Dataset at 0x7f1926ac0a90>

In [23]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size = 32,
    device = device)

LSTM

In [None]:
class OurFirstLSTM(nn.Module):
  def __init__(self, hidden_size, embedding_dim, vocab_size):
    super(OurFirstLSTM, self).__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.encoder = nn.LSTM(input_size=embedding_dim, 
                           hidden_size=hidden_size, num_layers=1)
    self.predictor = nn.Linear(hidden_size, 2)

  def forward(self, seq):
    output, (hidden, _) = self.encoder(self.embedding(seq))
    preds = self.predictor(hidden.squeeze(0))
    return preds

model = OurFirstLSTM(100, 300, 20002)
model.to(device)






**Training**

In [None]:
optimizer = optim.Adam(model.parameters(), lr=2e-2)
criterion = nn.CrossEntropyLoss()

def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
  for epoch in range(1, epochs+1):

    training_oss = 0.0
    valid_loss = 0.0
    model.train()
    for batch_idx, batch in enumerate(train_iterator):
      optimizer.zero_grad()
      predict = model(batch.tweet)
      loss = criterion(predict, batch.label)
      loss.backward()
      training_loss += loss.data.item() * batch.tweet.size(0)
    training_loss /= len(train_iterator)

    model.eval()
    for batch_idx, batch in enumerate(valid_iterator):
      predict = model(batch.tweet)
      loss = criterion(predict, batch.label)
      valid_loss += loss.data.item() * batch.tweet.size(0)

    valid_loss /= len(valid_iterator)
    print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:,2f}'.format(epoch, training_loss, valid_loss))



In [26]:
!conda install pytorch torchvision cudatoolkit=10.1 -c pytorch"

/bin/bash: -c: line 0: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 1: syntax error: unexpected end of file


In [None]:
train(epochs=5, model=model, 
      optimizer=optimizer, criterion=criterion, 
      train_iterator=train_iterator,
      valid_iterator=valid_iterator
      )

Make predictions

In [None]:
from IPython.utils import process
def classify_tweet(tweet):
  categories = {0: "Negative", 1:"Positive"}
  processed = TWEET.process([TWEET.preprocess(tweet)])
  processed = processed.to(device)
  model.eval()
  return categories[model(processed).argmax().item()]

# **Data Augmentation**

In [None]:
import random

def random_deletion(words, p=0.5):
  if len(words) == 1:
    return words
  remaining = list(filter(lambda x: random.uniform(0, 1) > p, words))
  if len(remaining) == 0:
    return [random.choice(words)]
  else:
    return remaining

In [None]:
def random_swap(sentence, n=5):
  length = range(len(sentence))
  for _ in range(n):
    idx1, idx2 = random.sample(length, 2)
    sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
  return sentence

In [None]:
!pip install googletrans==3.1.0a0

In [None]:
import googletrans

translator = googletrans.Translator()

sentences = ["The cat sat on the mat"]

translations_fr = translator.translate(sentences, dest='fr')
fr_text = [t.text for t in translations_fr]
translation_en = translator.translate(fr_text, dest='en')
en_text = [t.text for t in translation_en]
print(en_text)

available_langs = list(googletrans.LANGUAGES.keys())
tr_lang = random.choice(available_langs)
print(f"Translating to {googletrans.LANGUAGES[tr_lang]}")

translations = translator.translate(sentences, dest=tr_lang)
t_text = [t.text for t in translations]
print(t_text)

translations_en_random = translator.translate(t_text, src=tr_lang, dest='en')
en_text = [t.text for t in translations_en_random]
print(en_text)

