# Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-only google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive

/content/drive/MyDrive


In [None]:
# Show the current working directory; the CSV below is read with a relative path.
!pwd

/content/drive/MyDrive


# Imports

In [None]:
from collections import Counter
import re
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import tensorflow as tf

import nltk

from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from sklearn.feature_extraction.text import TfidfVectorizer

# Define constants

In [None]:
# Vectorisation / embedding settings shared by the cells below.
MAX_NB_WORDS = 100_000      # vocabulary cap handed to the vectorizer
MAX_SEQUENCE_LENGTH = 100   # max length of each entry (sentence), including padding
EMBEDDING_DIM = 100         # width of the word-embedding vectors (word2vec/GloVe)
# File name of the GloVe vectors matching EMBEDDING_DIM.
GLOVE_DIR = f"glove.6B.{EMBEDDING_DIM}d.txt"

# Load data

In [None]:
# Read the dataset (change the file name here to use a different CSV).
train = pd.read_csv('BALANCED.csv')
lyrics_train = train['lyrics'].tolist()   # lyrics column as a plain Python list
y = train['period'].values                # period label for every row

# Period values in the column order used by the one-hot encoding further down.
# NOTE: the name `labels` is rebound by later cells.
labels = [60, 70, 80, 90, 0, 10]

In [None]:
# Number of rows read from the CSV, before rows with missing lyrics are dropped.
print(len(lyrics_train))

25078


In [None]:
# Keep only the rows whose lyrics are real strings. The original check dropped
# float entries (presumably NaN from empty CSV cells), which the vectorizer
# cannot tokenise. Each surviving lyric keeps the label from the same row so
# `texts` and `y` stay aligned.
#
# Labels are read from `train['period']` rather than from `y`, because `y` is
# rebound at the end of this cell; reading `y` here would misalign labels if
# the cell were executed a second time.
kept_rows = [
    (lyric, period)
    for lyric, period in zip(lyrics_train, train['period'].values)
    if isinstance(lyric, str)
]
texts = [lyric for lyric, _ in kept_rows]
y_final = [period for _, period in kept_rows]

y = np.array(y_final)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/25078 [00:00<?, ?it/s]

In [None]:
# Spot-check one arbitrary example together with its period label.
print('Sample data:', texts[753], y[753])

Sample data: read news today oh boy lucky man made grade though news rather sad well laugh saw photograph blew mind car notice light changed crowd people stood stared seen face nobody really sure house lord 60.0


# Tokenize

In [None]:
# Sanity check: report any float entries still present in `texts`
# (prints nothing when the filtering above worked).
for idx, text in enumerate(texts):
    if type(text) is float:
        print(idx, text)

## TF-IDF

In [None]:
# Learn the vocabulary / IDF weights and vectorise the corpus in a single pass.
# NOTE(review): the vectorizer is fitted on all texts, i.e. before the
# train/val/test split performed later in the notebook.
vectorizer = TfidfVectorizer(max_features=MAX_NB_WORDS)
vector = vectorizer.fit_transform(texts)
print(type(vector))

<class 'scipy.sparse.csr.csr_matrix'>


# Convert the TF-IDF matrix to a dense array (fixed-width vectors, so no padding is applied)

In [None]:
# Densify the sparse TF-IDF matrix. Casting to float32 *before* densifying
# halves the memory of the dense array compared with the float64 default
# (at the (25062, 38585) shape printed below that is ~3.9 GB instead of ~7.7 GB).
data = vector.astype(np.float32).toarray()
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (25062, 38585)
Shape of label tensor: (25062,)


In [None]:
# Shuffle samples and labels with one shared, seeded permutation.
#
# Both arrays are indexed from sources this cell never rebinds (`vector`, `y`).
# Indexing `data` with itself made the cell non-idempotent: a second run
# shuffled the already-shuffled rows while the labels were re-derived from the
# unshuffled `y`, silently breaking the sample/label pairing.
SHUFFLE_SEED = 42  # fixed so the train/val/test partition is reproducible
indices = np.random.default_rng(SHUFFLE_SEED).permutation(vector.shape[0])
data = vector[indices].astype(data.dtype).toarray()  # keep the dtype `data` already has
labels = y[indices]
assert data.shape[0] == labels.shape[0], "samples and labels must stay aligned"

In [None]:
# One-hot encode the period labels. Column order: 60, 70, 80, 90, 0, 10.
NUM_CLASSES = 6
period_values = np.asarray(labels)

# Periods above 50 map to (period - 60) / 10 truncated to an int, a period of
# exactly 0 maps to column 4, and every other value falls through to column 5.
decade_slot = ((period_values - 60) / 10).astype(int)
class_ids = np.where(period_values > 50,
                     decade_slot,
                     np.where(period_values == 0, 4, 5))

one_hot_labels = np.zeros((len(period_values), NUM_CLASSES), dtype=int)
one_hot_labels[np.arange(len(period_values)), class_ids] = 1
print(one_hot_labels[0])

[1 0 0 0 0 0]


In [None]:
# Confirm that `data` is a dense NumPy array (not a scipy sparse matrix).
print(type(data))

<class 'numpy.ndarray'>


In [None]:
# Show one vectorised example with its raw period label and one-hot encoding.
print('Tokenized sentences: \n', data[2])
print('Label: \n', labels[2], one_hot_labels[2])

Tokenized sentences: 
 [0. 0. 0. ... 0. 0. 0.]
Label: 
 60.0 [1 0 0 0 0 0]


In [None]:
# Feature matrix and one-hot label matrix must have the same number of rows.
print(data.shape)
print(one_hot_labels.shape)

(1000, 4360)
(1000, 6)


In [None]:
# Converting our labels into numpy arrays
lyrics = np.array(data)
labels = np.array(one_hot_labels)

In [None]:
# Peek at the first 20 feature rows and their one-hot labels.
print(lyrics[:20])
print(labels[:20])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]]


# Partition the data (train, val, test)

In [None]:
# First cut: the leading 70% of the rows are for training, the remaining 30%
# form the hold-out set (split into validation and test afterwards).
split_frac = 0.7
split_id = int(split_frac * len(lyrics))
train_lyrics, test_lyrics = np.split(lyrics, [split_id])
train_labels, test_labels = np.split(labels, [split_id])

In [None]:
# Training labels and features must agree on the number of rows.
print(train_labels.shape)
print(train_lyrics.shape)

(700, 6)
(700, 4360)


In [None]:
# Second cut: split the hold-out rows in half, first half validation, second
# half test.
#
# The hold-out rows are re-derived from the full arrays (everything after the
# training rows) instead of from `test_lyrics` / `test_labels`. Slicing those
# names in place made the cell non-idempotent: every re-run halved the test
# set again. `one_hot_labels` holds the same values as `labels` at this point.
holdout_start = len(train_lyrics)
holdout_lyrics = lyrics[holdout_start:]
holdout_labels = one_hot_labels[holdout_start:]

split_frac = 0.5 # 50% validation, 50% test
split_id = int(split_frac * len(holdout_lyrics))
val_lyrics, test_lyrics = holdout_lyrics[:split_id], holdout_lyrics[split_id:]
val_labels, test_labels = holdout_labels[:split_id], holdout_labels[split_id:]

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Training hyper-parameters.
batch_size = 16
hidden = 64
epochs = 30
lr = 0.0001
dropout = 0.5


def _make_loader(features, targets):
    """Wrap two NumPy arrays in a TensorDataset plus a shuffling DataLoader.

    Incomplete final batches are dropped (drop_last=True), so every batch
    holds exactly `batch_size` samples.
    """
    dataset = TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size, drop_last=True)
    return dataset, loader


train_data, train_loader = _make_loader(train_lyrics, train_labels)
val_data, val_loader = _make_loader(val_lyrics, val_labels)
test_data, test_loader = _make_loader(test_lyrics, test_labels)

In [None]:
# Pick the compute device once; later cells move the model and every batch to it.
# `is_cuda` is True when PyTorch can see a GPU, in which case we train on it,
# otherwise everything stays on the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")

In [None]:
class SentimentNet(nn.Module):
    """LSTM classifier: embedding -> LSTM -> dropout -> linear layer.

    Class scores are computed from the LSTM output at the last time step.

    Args:
        weight_matrix: optional pretrained embedding matrix. When given it is
            loaded with ``nn.Embedding.from_pretrained`` and its second
            dimension overrides ``embedding_dim``.
        vocab_size: number of embedding rows; required when ``weight_matrix``
            is None.
        output_size: number of output classes.
        hidden_dim: LSTM hidden-state size.
        embedding_dim: embedding width when a trainable embedding is created.
        n_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability used between stacked LSTM layers and
            before the linear layer.

    Raises:
        ValueError: if both ``weight_matrix`` and ``vocab_size`` are None.
    """

    def __init__(self,
                 weight_matrix=None,
                 vocab_size=None,
                 output_size=1,
                 hidden_dim=512,
                 embedding_dim=400,
                 n_layers=1,
                 dropout_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # Input representation handed to the LSTM; the effective embedding
        # width comes back from init_embedding (it may differ from the argument
        # when a pretrained matrix is supplied).
        self.embedding, embedding_dim = self.init_embedding(
            vocab_size, embedding_dim, weight_matrix)
        # nn.LSTM only applies dropout *between* stacked layers, so a non-zero
        # value with a single layer has no effect and merely triggers a
        # PyTorch warning.
        lstm_dropout = dropout_prob if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=lstm_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        # Kept for callers that want probabilities (``model.softmax(logits)``);
        # it is deliberately NOT applied inside forward().
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, hidden):
        """Run one batch through the network.

        Args:
            x: integer index tensor of shape (batch, seq_len).
            hidden: ``(h_0, c_0)`` tuple as produced by ``init_hidden``.

        Returns:
            Tuple ``(out, hidden, lstm_out)`` where ``out`` holds the raw class
            scores (logits) of the last time step with shape
            (batch, output_size), ``hidden`` is the updated LSTM state and
            ``lstm_out`` is the LSTM output flattened to
            (batch * seq_len, hidden_dim).

        The scores are returned unnormalised because ``nn.CrossEntropyLoss``
        applies log-softmax itself; feeding it softmax probabilities squashes
        the loss and its gradients. ``argmax`` predictions are unaffected.
        """
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Flatten to (batch * seq_len, hidden_dim); returned to the caller as is.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # Only the last time step feeds the classifier, so select it before
        # the dropout / linear layers instead of scoring every step.
        last_step = lstm_out.view(batch_size, -1, self.hidden_dim)[:, -1]
        out = self.fc(self.dropout(last_step))
        return out, hidden, lstm_out

    def init_embedding(self, vocab_size, embedding_dim, weight_matrix):
        """Build the embedding layer.

        Returns:
            Tuple ``(embedding_layer, embedding_dim)``. Without a weight matrix
            a trainable ``nn.Embedding(vocab_size, embedding_dim)`` is created;
            otherwise the matrix is loaded via ``nn.Embedding.from_pretrained``
            and the returned width is the matrix's second dimension.

        Raises:
            ValueError: if neither a weight matrix nor a vocab size is given.
        """
        if weight_matrix is None:
            if vocab_size is None:
                raise ValueError('If no weight matrix, need a vocab size')
            return nn.Embedding(vocab_size, embedding_dim), embedding_dim
        weights = torch.as_tensor(weight_matrix, dtype=torch.float32)
        return nn.Embedding.from_pretrained(weights), weights.shape[1]

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h_0, c_0)`` LSTM states.

        Each tensor has shape (n_layers, batch_size, hidden_dim) and is created
        on the device the model's parameters live on, so the method does not
        depend on a module-level ``device`` variable.
        """
        model_device = next(self.parameters()).device
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=model_device),
                torch.zeros(shape, device=model_device))

In [None]:
# Hyper-parameters of the network. The embedding gets one more row than the
# TF-IDF matrix has columns.
model_params = dict(
    weight_matrix=None,
    vocab_size=data.shape[1] + 1,
    output_size=6,
    hidden_dim=hidden,
    n_layers=2,
    embedding_dim=EMBEDDING_DIM,
    dropout_prob=dropout,
)

# Build the model directly on the selected device.
model = SentimentNet(**model_params).to(device)

# Multi-class loss and optimiser over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train for `epochs` epochs, validate after every epoch and checkpoint the
# weights whenever the mean validation loss does not get worse.
counter = 0          # total number of training batches processed
print_every = 10
clip = 5             # max gradient norm for clipping
valid_loss_min = np.inf   # the `np.Inf` alias was removed in NumPy 2.0

model.train()

# Per-epoch history, consumed by the plotting cells.
train_losses = []
val_losses = []
train_acc = []
val_acc = []

for i in range(epochs):
    print("Epoch")
    h = model.init_hidden(batch_size)
    train_correct = 0
    val_correct = 0
    train_loss = []
    val_loss = []
    train_count = 0
    val_count = 0

    # Histogram of predicted / ground-truth classes over this epoch's training batches.
    periods = [0, 0, 0, 0, 0, 0]
    periods_gt = [0, 0, 0, 0, 0, 0]

    # The batch variables are deliberately not called `labels`, so the global
    # label array created by earlier cells is not clobbered.
    for batch_inputs, batch_labels in train_loader:
        # NOTE(review): the loader yields float TF-IDF rows; casting them to
        # int64 truncates fractional weights (presumably all in [0, 1]) to 0,
        # so the embedding mostly sees index 0 — confirm this is intended.
        batch_inputs = batch_inputs.to(torch.int64).to(device)
        # One-hot rows -> class indices; for one-hot targets this yields the
        # same CrossEntropyLoss value as passing the rows as probabilities.
        batch_targets = batch_labels.to(device).argmax(dim=1)
        counter += 1
        # Detach the carried-over hidden state so gradients do not flow back
        # into previous batches.
        h = tuple(state.detach() for state in h)

        optimizer.zero_grad()
        # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself, so the
        # model output passed to it should be unnormalised scores.
        output, h, _ = model(batch_inputs, h)
        loss = criterion(output, batch_targets)
        train_loss.append(loss.item())

        batch_preds = output.argmax(dim=1)
        train_count += batch_targets.numel()
        train_correct += (batch_preds == batch_targets).sum().item()
        for pred_label, gt_label in zip(batch_preds.tolist(), batch_targets.tolist()):
            periods[pred_label] += 1
            periods_gt[gt_label] += 1

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    # ---- validation: no gradients needed, dropout disabled ----
    val_h = model.init_hidden(batch_size)
    model.eval()
    with torch.no_grad():
        for val_inputs, val_labels_batch in val_loader:
            val_inputs = val_inputs.to(torch.int64).to(device)
            val_targets = val_labels_batch.to(device).argmax(dim=1)
            val_h = tuple(state.detach() for state in val_h)
            val_output, val_h, _ = model(val_inputs, val_h)
            loss_val = criterion(val_output, val_targets)
            val_loss.append(loss_val.item())

            val_count += val_targets.numel()
            val_correct += (val_output.argmax(dim=1) == val_targets).sum().item()
    model.train()

    # Epoch summary. A loader can be empty when a split has fewer rows than
    # one batch (drop_last=True); report NaN instead of dividing by zero.
    epoch_train_loss = np.mean(train_loss) if train_loss else float("nan")
    epoch_val_loss = np.mean(val_loss) if val_loss else float("nan")
    epoch_train_acc = train_correct / train_count if train_count else float("nan")
    epoch_val_acc = val_correct / val_count if val_count else float("nan")

    print("Epoch: {}/{}".format(i+1, epochs),
          "Loss: {:.6f}".format(epoch_train_loss),
          "Val Loss: {:.6f}".format(epoch_val_loss),
          "Acc: {:.6f}".format(epoch_train_acc),
          "Val Acc: {:.6f}".format(epoch_val_acc))
    print(train_correct)
    print(train_count)

    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), './state_dict_' + str(EMBEDDING_DIM) + '.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model.'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss

    train_losses.append(epoch_train_loss)
    val_losses.append(epoch_val_loss)
    train_acc.append(epoch_train_acc)
    val_acc.append(epoch_val_acc)
    print(periods)
    print(periods_gt)

Epoch
Epoch: 1/30 Loss: 1.703726 Val Loss: 1.613644 Acc: 0.984012 Val Acc: 1.000000
677
688
Validation loss decreased (inf --> 1.613644).  Saving model.
[677, 6, 1, 4, 0, 0]
[688, 0, 0, 0, 0, 0]
Epoch
Epoch: 2/30 Loss: 1.455494 Val Loss: 1.223298 Acc: 1.000000 Val Acc: 1.000000
688
688
Validation loss decreased (1.613644 --> 1.223298).  Saving model.
[688, 0, 0, 0, 0, 0]
[688, 0, 0, 0, 0, 0]
Epoch
Epoch: 3/30 Loss: 1.162593 Val Loss: 1.092557 Acc: 1.000000 Val Acc: 1.000000
688
688
Validation loss decreased (1.223298 --> 1.092557).  Saving model.
[688, 0, 0, 0, 0, 0]
[688, 0, 0, 0, 0, 0]
Epoch
Epoch: 4/30 Loss: 1.090637 Val Loss: 1.067114 Acc: 1.000000 Val Acc: 1.000000
688
688
Validation loss decreased (1.092557 --> 1.067114).  Saving model.
[688, 0, 0, 0, 0, 0]
[688, 0, 0, 0, 0, 0]
Epoch


KeyboardInterrupt: ignored

In [None]:
 # Loading the best model
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one.
model.load_state_dict(torch.load('./state_dict_' + str(EMBEDDING_DIM) + '.pt',
                                 map_location=device))

test_losses = []
periods = [0, 0, 0, 0, 0, 0]      # histogram of predicted classes
gt_periods = [0, 0, 0, 0, 0, 0]   # histogram of ground-truth classes
num_correct = 0
num_evaluated = 0                 # samples actually scored
h = model.init_hidden(batch_size)

model.eval()
with torch.no_grad():  # inference only: skip building the autograd graph
    for test_inputs, test_labels_batch in test_loader:
        test_inputs = test_inputs.to(torch.int64).to(device)
        # One-hot rows -> class indices (same loss value for one-hot targets).
        test_targets = test_labels_batch.to(device).argmax(dim=1)
        h = tuple(state.detach() for state in h)
        output, h, _ = model(test_inputs, h)
        test_loss = criterion(output, test_targets)
        test_losses.append(test_loss.item())

        test_preds = output.argmax(dim=1)
        num_evaluated += test_targets.numel()
        num_correct += (test_preds == test_targets).sum().item()
        for pred_label, gt_label in zip(test_preds.tolist(), test_targets.tolist()):
            periods[pred_label] += 1
            gt_periods[gt_label] += 1

print("Test loss: {:.3f}".format(np.mean(test_losses) if test_losses else float("nan")))
# Divide by the number of samples that were scored: the loader is built with
# drop_last=True, so len(test_loader.dataset) can include samples from a
# skipped partial batch and would understate the accuracy.
test_acc = num_correct / num_evaluated if num_evaluated else float("nan")
print("Test accuracy: {:.3f}".format(test_acc))
print(num_correct)
print(len(test_loader.dataset))
print(periods)
print(gt_periods)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 


plt.figure(figsize=(10,5))
plt.title("Training and Validation Loss for Embeddigns = " + str(EMBEDDING_DIM))
plt.plot(val_losses,label="val")
plt.plot(train_losses,label="train")
plt.xlabel("epochs")
plt.ylabel("Loss")
plt.legend()
plt.savefig("tfidf_loss_" + str(epochs)+"_"+ str(batch_size) + "_" + str(lr)+ "_" + str(EMBEDDING_DIM) + "_" + str(hidden) + ".jpg")
plt.show()

In [None]:
# Per-epoch training vs. validation accuracy, drawn with the explicit
# figure/axes interface and saved under a name that encodes the run's
# hyper-parameters.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Training and Validation Accuracy for Embeddings = " + str(EMBEDDING_DIM))
ax.plot(val_acc, label="val")
ax.plot(train_acc, label="train")
ax.set_xlabel("epoch")
ax.set_ylabel("Accuracy")
ax.legend()
fig.savefig("tfidf_acc_" + str(epochs)+"_"+ str(batch_size) + "_" + str(lr)+ "_" + str(EMBEDDING_DIM) + "_" + str(hidden) + ".jpg")
plt.show()