<a href="https://colab.research.google.com/github/antsh3k/NN-learning/blob/master/3_TODO_Unbalanced_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Sentiment Classification

In [0]:
# SWITCH TO GPU

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import torch 
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

SEED = 15
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Get the data from github 

In [0]:
df = pd.read_csv("https://raw.githubusercontent.com/w-is-h/tmp/master/dataset.csv", encoding='cp1252')
x = df['SentimentText'].values
y = df['Sentiment'].values
print(y)
print(x[0])

[1 0 1 ... 0 0 1]
first think another Disney movie, might good, it's kids movie. watch it, can't help enjoy it. ages love movie. first saw movie 10 8 years later still love it! Danny Glover superb could play part better. Christopher Lloyd hilarious perfect part. Tony Danza believable Mel Clark. can't help, enjoy movie! give 10/10!


In [0]:
# Remove mails and https links
pat_1 = r"(?:\@|https?\://)\S+"
# Remove tags
pat_2 = r'#\w+ ?'
# Combine into one regex
combined_pat = r'|'.join((pat_1, pat_2))
# Remove websites
www_pat = r'www.[^ ]+'
# Remove HTML tags
html_tag = r'<[^>]+>'
def data_cleaner(text):
  cleantags = ""
  try:
    stripped = re.sub(combined_pat, '', text)
    stripped = re.sub(www_pat, '', stripped)
    cleantags = re.sub(html_tag, '', stripped)
  except Exception as e:
    print(e)
    cleantags = "None"
  return cleantags

x_original = x
x = [data_cleaner(review) for review in x]
print(x[0])

first think another Disney movie, might good, it's kids movie. watch it, can't help enjoy it. ages love movie. first saw movie 10 8 years later still love it! Danny Glover superb could play part better. Christopher Lloyd hilarious perfect part. Tony Danza believable Mel Clark. can't help, enjoy movie! give 10/10!


# SpaCy

We can use spacy to tokenize the text and further clean it.

In [0]:
import spacy
from spacy.attrs import LOWER
# Load the english model for spacy, the disable part is used to make it faster
nlp = spacy.load('en', disable=['ner', 'parser'])

tok_snts = []
for snt in x:
  tkns = [tkn.lemma_.lower() for tkn in nlp.tokenizer(snt) if not tkn.is_punct]
  tok_snts.append(tkns)
# Save back
x = tok_snts
# Print the first sentence
print(x[0])

['\ufeff1', 'think', 'another', 'disney', 'movie', 'may', 'good', '-pron-', 'have', 'kid', 'movie', 'watch', 'it', 'can', 'not', 'help', 'enjoy', 'it', 'age', 'love', 'movie', '\ufeff1', 'see', 'movie', '10', '8', 'year', 'late', 'still', 'love', 'it', 'danny', 'glover', 'superb', 'can', 'play', 'part', 'well', 'christopher', 'lloyd', 'hilarious', 'perfect', 'part', 'tony', 'danza', 'believable', 'mel', 'clark', 'can', 'not', 'help', 'enjoy', 'movie', 'give', '10/10']


# Train word2vec

In [0]:
from gensim.models import Word2Vec
w2v = Word2Vec(x, size=300, window=6, min_count=4, workers=4)
w2v.wv.most_similar("bad")

  if np.issubdtype(vec.dtype, np.int):


[('terrible', 0.7386313080787659),
 ('awful', 0.7366094589233398),
 ('horrible', 0.7013458013534546),
 ('suck', 0.6790038347244263),
 ('lousy', 0.6755453944206238),
 ('lame', 0.6420742273330688),
 ('stupid', 0.6242562532424927),
 ('damn', 0.6126651167869568),
 ('alright', 0.6123789548873901),
 ('crappy', 0.6112472414970398)]

# Convert each sentence into the average sum of the vector representations of its tokens

Save the results into a new variable x_emb

In [0]:
# x_emb - embedded sentences
x_emb = np.zeros((len(x), 300))
# Loop over sentences
for i_snt, snt in enumerate(x):
  cnt = 0
  # Loop over the words of a sentence
  for i_word, word in enumerate(snt):
    if word in w2v.wv:
      x_emb[i_snt] += w2v.wv.get_vector(word)
      cnt += 1
  if cnt > 0:
    x_emb[i_snt] = x_emb[i_snt] / cnt
# Save the originals, will be need later
x_or = x_emb
y_or = y

# Split the dataset into train/test/dev

In [0]:
# TODO: Find the indices where y_or == 1 and y_or == 1
inds_z = np.where(y_or == 0)[0] # indices where y_or == 0, use numpy or a for loop
inds_o = np.where(y_or == 1)[0]# indices where y_or == 1, use numpy or a for loop
print(inds_z)
print(inds_o)

[    1     3     6 ... 24995 24997 24998]
[    0     2     4 ... 24993 24996 24999]


In [0]:
# TODO
x_emb = x_or[np.concatenate((inds_z, inds_o[:1000]))] # Get values from x_or so that we have 12500 negative examples and 1000 positive examples
y = y_or[np.concatenate((inds_z, inds_o[:1000]))]# Get values from y_or so that we have 12500 negative examples and 1000 positive examples

# TODO:
x_one = x_or[inds_o[:1000]] # Create x_one from x_or that has 1000 positive examples 
x_zero = x_or[inds_z] # Create x_zero from x_or that has 12500 negative examples

In [0]:
print(y.shape)
print(x_emb.shape)
print("Number of positive examples in y: " + str(np.sum(y)))

(13500,)
(13500, 300)
Number of positive examples in y: 1000


In [0]:
# Get torch stuff
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import sklearn.metrics

In [0]:
from sklearn.model_selection import train_test_split
np.random.seed(SEED)
y = y.reshape(-1)
x_train, x_test, y_train, y_test = train_test_split(x_emb, y, test_size=0.2, random_state=SEED)
x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)

x_test = torch.tensor(x_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

x_one = torch.tensor(x_one, dtype=torch.float32)
x_zero = torch.tensor(x_zero, dtype=torch.float32)

#Build the network

In [0]:
device = torch.device('cuda')
class Net(nn.Module):
    def __init__(self):
      super(Net, self).__init__()
      self.fc1 = nn.Linear(300, 100)
      self.fc4 = nn.Linear(100, 2)
      
      self.d1 = nn.Dropout(0.5)
      
    def forward(self, x):
      x = self.d1(torch.relu(self.fc1(x)))
      x = torch.sigmoid(self.fc4(x))
      return x
# Create the network and get CE loss
net = Net()
#criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(weight=torch.tensor([0.1, 0.9]).to(device))
# Make a SGD optimizer with lr=0.002 and momentum=0.99
optimizer = optim.SGD(net.parameters(), lr=0.02, momentum=0.99)
# Move the net to the device
net.to(device)

Net(
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc4): Linear(in_features=100, out_features=2, bias=True)
  (d1): Dropout(p=0.5)
)

# Train

In [0]:
# Move data to the right device
x_train = x_train.to(device)
y_train = y_train.to(device)
x_test = x_test.to(device)
y_test = y_test.to(device)
net.train()
losses = []
accs = []
accs_dev = []
for epoch in range(10000): 
  optimizer.zero_grad()
  outputs = net(x_train)
  loss = criterion(outputs, y_train)
  loss.backward()
  optimizer.step()

  if epoch % 500 == 0:
      net.eval()
      acc = sklearn.metrics.accuracy_score(torch.max(outputs, 1)[1].cpu().detach().numpy(), y_train.cpu().numpy())
      
      outputs_dev = net(x_test)
      acc_dev = sklearn.metrics.accuracy_score(torch.max(outputs_dev, 1)[1].cpu().detach().numpy(), y_test.cpu().numpy())
      accs_dev.append(acc_dev)
      
      # TODO: calculate the f1_score, precision and recall
      outputs_idx = torch.max(outputs_dev, 1)[1].cpu().detach().numpy() # Get the index of max per row
      f1_dev = sklearn.metrics.f1_score(outputs_idx, y_test.cpu().numpy()) # Use f1 from sklearn
      p_dev = sklearn.metrics.precision_score(outputs_idx, y_test.cpu().numpy()) # Use precision from sklearn
      r_dev = sklearn.metrics.recall_score(outputs_idx, y_test.cpu().numpy()) # Use recall from sklearn
      
      print("Epoch: {:4} Loss: {:.5f} Acc: {:.3f} Acc Dev: {:.3f} F1 Dev: {:.3f} p Dev: {:.3f} r Dev: {:.3f}".format(epoch, loss.item(), acc, acc_dev, f1_dev, p_dev, r_dev))
      net.train()
print('Finished Training')

Epoch:    0 Loss: 0.69183 Acc: 0.756 Acc Dev: 0.927 F1 Dev: 0.048 p Dev: 0.028 r Dev: 0.161
Epoch:  500 Loss: 0.45306 Acc: 0.868 Acc Dev: 0.861 F1 Dev: 0.420 p Dev: 0.768 r Dev: 0.289
Epoch: 1000 Loss: 0.43976 Acc: 0.875 Acc Dev: 0.861 F1 Dev: 0.419 p Dev: 0.763 r Dev: 0.288
Epoch: 1500 Loss: 0.43351 Acc: 0.885 Acc Dev: 0.869 F1 Dev: 0.432 p Dev: 0.757 r Dev: 0.302
Epoch: 2000 Loss: 0.42803 Acc: 0.890 Acc Dev: 0.871 F1 Dev: 0.430 p Dev: 0.740 r Dev: 0.303
Epoch: 2500 Loss: 0.42433 Acc: 0.895 Acc Dev: 0.873 F1 Dev: 0.430 p Dev: 0.734 r Dev: 0.304
Epoch: 3000 Loss: 0.42039 Acc: 0.899 Acc Dev: 0.875 F1 Dev: 0.430 p Dev: 0.718 r Dev: 0.307
Epoch: 3500 Loss: 0.41588 Acc: 0.902 Acc Dev: 0.875 F1 Dev: 0.430 p Dev: 0.718 r Dev: 0.307
Epoch: 4000 Loss: 0.41408 Acc: 0.905 Acc Dev: 0.878 F1 Dev: 0.434 p Dev: 0.712 r Dev: 0.312
Epoch: 4500 Loss: 0.40955 Acc: 0.909 Acc Dev: 0.880 F1 Dev: 0.433 p Dev: 0.701 r Dev: 0.313
Epoch: 5000 Loss: 0.40813 Acc: 0.911 Acc Dev: 0.883 F1 Dev: 0.438 p Dev: 0.695 r

In [0]:
np.sum(torch.max(outputs_dev, 1)[1].cpu().detach().numpy()) # the model doesnt know the importance and therefore thinks everything is a 0

0

In [0]:
tmp = torch.tensor([7, 11, 12, 33, 4, 5])
# TODO: Randomly select 3 examples from the 'tmp' tensor
sample = tmp[torch.randperm(len(tmp))[0:3]] # select a random choice
print(sample)

tensor([33,  5,  7])


In [0]:
tmp = torch.tensor([7, 11, 12, 33, 4, 5])
tmp2 = torch.tensor([23, 111])
# TODO: Concat the two tmp tensor into tmp3
tmp3 = torch.cat((tmp, tmp2)) #?
print(tmp3)

tensor([  7,  11,  12,  33,   4,   5,  23, 111])


In [0]:
# Create the network and get BCE loss
net = Net()
criterion = nn.CrossEntropyLoss()
#criterion = nn.CrossEntropyLoss(weight=[0.8, 0.2])
optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.99)
net.to(device)

# Let's do the same but with BATCHES
x_train = x_train.to(device)
y_train = y_train.to(device)
x_test = x_test.to(device)
y_test = y_test.to(device)
x_zero = x_zero.to(device)
x_one = x_one.to(device)
net.train()
losses = []
accs = []
accs_dev = []

batch_size = 1000
num_batches = int(np.ceil(len(x_train)/batch_size))# Calculate the number of batches per epoch, given the batch size above, base it on the length of x_train
for epoch in range(5000): 
  for i in range(num_batches):
    x_train_batch = torch.cat((x_one[torch.randperm(len(x_one))[0:500]], x_zero[torch.randperm(len(x_zero))[0:500]])) # Randomly select 500 positve and 500 negative examples, use x_one and x_zero
    y_train_batch = torch.zeros(1000, dtype=torch.long) # Create the corresponding labels 
    y_train_batch[0:500] = 1
    y_train_batch = y_train_batch.to(device)
    
    optimizer.zero_grad()
    outputs = net(x_train_batch)
    loss = criterion(outputs, y_train_batch)
    loss.backward()
    optimizer.step()

  if epoch % 500 == 0:
      net.eval()
      outputs = net(x_train)
      acc = sklearn.metrics.accuracy_score([1 if x > 0.5 else 0 for x in torch.max(outputs, 1)[1].cpu().detach().numpy()], y_train.cpu().numpy())

      outputs_dev = net(x_test)
      f1_dev = f1_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      p_dev = precision_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      r_dev = recall_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      
      print("Epoch: {:4} Loss: {:.5f} Acc: {:.3f} Acc Dev: {:.3f} F1 Dev: {:.3f} p Dev: {:.3f} r Dev: {:.3f}".format(epoch, loss.item(), acc, acc_dev, f1_dev, p_dev, r_dev))
      net.train()
print('Finished Training')

Epoch:    0 Loss: 0.69046 Acc: 0.160 Acc Dev: 0.903 F1 Dev: 0.133 p Dev: 0.071 r Dev: 0.989
Epoch:  500 Loss: 0.40421 Acc: 0.920 Acc Dev: 0.903 F1 Dev: 0.587 p Dev: 0.432 r Dev: 0.915
Epoch: 1000 Loss: 0.39713 Acc: 0.940 Acc Dev: 0.903 F1 Dev: 0.671 p Dev: 0.526 r Dev: 0.927
Epoch: 1500 Loss: 0.37480 Acc: 0.953 Acc Dev: 0.903 F1 Dev: 0.722 p Dev: 0.592 r Dev: 0.927
Epoch: 2000 Loss: 0.36843 Acc: 0.960 Acc Dev: 0.903 F1 Dev: 0.758 p Dev: 0.641 r Dev: 0.927
Epoch: 2500 Loss: 0.37429 Acc: 0.965 Acc Dev: 0.903 F1 Dev: 0.788 p Dev: 0.686 r Dev: 0.927
Epoch: 3000 Loss: 0.36586 Acc: 0.971 Acc Dev: 0.903 F1 Dev: 0.820 p Dev: 0.735 r Dev: 0.927
Epoch: 3500 Loss: 0.37114 Acc: 0.972 Acc Dev: 0.903 F1 Dev: 0.816 p Dev: 0.729 r Dev: 0.927
Epoch: 4000 Loss: 0.36633 Acc: 0.976 Acc Dev: 0.903 F1 Dev: 0.828 p Dev: 0.749 r Dev: 0.927
Epoch: 4500 Loss: 0.36650 Acc: 0.977 Acc Dev: 0.903 F1 Dev: 0.839 p Dev: 0.766 r Dev: 0.927
Finished Training
