# 1. Load Data and Import Libraries

In [1674]:
#string matching
import re 

#reading files
import pandas as pd

#array processing
import numpy as np

#visualization
import matplotlib.pyplot as plt  

#for metrics
from sklearn import metrics

#for seed
import random

# to one hot encode labels
from sklearn.preprocessing import MultiLabelBinarizer

#defining tensors
import torch

#layers
from torch import nn

#layers and wrappers
from torch.nn import Sequential, Linear,  ReLU, Sigmoid, Dropout, BCELoss, Embedding, RNN, LSTM, LeakyReLU

#handling text data
from torchtext import data 
from torchtext.legacy import data

In [1675]:
# load the dataset
questions_df = pd.read_csv('train_data_merged_labels.csv')


In [1676]:
# load the test dataset
test_df = pd.read_csv('test_data.csv')

In [1677]:
#Glance at the first 5 rows

questions_df.head()

Unnamed: 0,utterances,Core Relations
0,who plays luke on star wars new hope,movie.starring.actor_movie.starring.character
1,show credits for the godfather,movie.starring.actor
2,who was the main actor in the exorcist,movie.starring.actor
3,who played dory on finding nemo,movie.starring.actor_movie.starring.character
4,who was the female lead in resident evil,actor.gender_movie.starring.actor


In [1678]:
#shape of the dataset
questions_df.shape
#print(type(questions_df['Core Relations']))

(2253, 2)

In [1679]:
#Take a glance at first 5 rows
test_df.head()

Unnamed: 0,utterances
0,star of thor
1,who is in the movie the campaign
2,list the cast of the movie the campaign
3,who was in twilight
4,who is in vulguria


In [1680]:
# No. of unique tags
len(questions_df['Core Relations'].unique())

47

# 2. Dataset Preparation

## 2.1 Filter Questions with respect to Top-20 Tags

In [1681]:
# check occurence of each tag
freq={}
for i in questions_df['Core Relations']:
  #for j in i:
  if i in freq.keys():
    freq[i] = freq[i] + 1
  else:
    freq[i] = 1

In [1682]:
# sort the dictionary in descending order
freq = dict(sorted(freq.items(), key=lambda x:x[1],reverse=True))
print(freq)

{'movie.directed_by': 314, 'none': 312, 'movie.starring.actor': 294, 'movie.rating': 186, 'movie.produced_by': 170, 'movie.initial_release_date': 141, 'movie.language': 137, 'movie.country': 135, 'movie.genre': 100, 'movie.subjects': 90, 'movie.production_companies': 84, 'movie.country_movie.language': 72, 'movie.estimated_budget': 67, 'movie.gross_revenue': 25, 'movie.starring.actor_movie.starring.character': 19, 'movie.initial_release_date_movie.starring.actor': 12, 'movie.initial_release_date_movie.rating': 12, 'actor.gender_movie.starring.actor': 8, 'movie.directed_by_movie.initial_release_date': 7, 'movie.directed_by_movie.starring.actor': 6, 'movie.initial_release_date_movie.production_companies': 6, 'movie.country_movie.genre': 6, 'movie.country_movie.genre_movie.language': 5, 'gr.amount_movie.gross_revenue': 5, 'person.date_of_birth': 4, 'movie.genre_movie.rating': 4, 'movie.starring.character': 3, 'movie.music': 3, 'movie.directed_by_movie.genre': 3, 'movie.genre_movie.languag

In [1683]:
# Top 10 most frequent tags
common_tags = list(freq.keys())[:20]
print(common_tags)


['movie.directed_by', 'none', 'movie.starring.actor', 'movie.rating', 'movie.produced_by', 'movie.initial_release_date', 'movie.language', 'movie.country', 'movie.genre', 'movie.subjects', 'movie.production_companies', 'movie.country_movie.language', 'movie.estimated_budget', 'movie.gross_revenue', 'movie.starring.actor_movie.starring.character', 'movie.initial_release_date_movie.starring.actor', 'movie.initial_release_date_movie.rating', 'actor.gender_movie.starring.actor', 'movie.directed_by_movie.initial_release_date', 'movie.directed_by_movie.starring.actor']


In [1684]:
#finding utterances associated with common tags
x=[]
y=[]

for i in range(len(questions_df['Core Relations'])):  
  temp=[]
  if questions_df['Core Relations'][i] in common_tags:
    temp.append(questions_df['Core Relations'][i])
    x.append(questions_df['utterances'][i])
    y.append(temp)
    

In [1685]:
# number of utterances left
len(x)

2191

In [1686]:
#first 5 tags
y[:5]

[['movie.starring.actor_movie.starring.character'],
 ['movie.starring.actor'],
 ['movie.starring.actor'],
 ['movie.starring.actor_movie.starring.character'],
 ['actor.gender_movie.starring.actor']]

In [1687]:
#combining the labels by space
y = [ ",".join([str(j) for j in i ]) for i in y]

In [1688]:
#labels after converting to string
y[:5]

['movie.starring.actor_movie.starring.character',
 'movie.starring.actor',
 'movie.starring.actor',
 'movie.starring.actor_movie.starring.character',
 'actor.gender_movie.starring.actor']

In [1689]:
#save to dataframe
dframe = pd.DataFrame({'utterances':x,'tags':y})

In [1690]:
#first 5 rows
dframe.head()

Unnamed: 0,utterances,tags
0,who plays luke on star wars new hope,movie.starring.actor_movie.starring.character
1,show credits for the godfather,movie.starring.actor
2,who was the main actor in the exorcist,movie.starring.actor
3,who played dory on finding nemo,movie.starring.actor_movie.starring.character
4,who was the female lead in resident evil,actor.gender_movie.starring.actor


In [1691]:
#save to csv
dframe.to_csv('stack.csv',index=False)

# 3. Text Preprocessing

## 3.1 Text Cleaning

In [1692]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def cleaner(text):

  text = BeautifulSoup(text).get_text()
  
  # fetch alphabetic characters
  text = re.sub("[^a-zA-Z]", " ", text)

  # convert text to lower case
  text = text.lower()
  
  # split text into tokens to remove whitespaces
  tokens = text.split()
  tokens = [w for w in tokens if not w in stop_words]
  #tokens = ' '.join(no_stopword_text)
  
  return tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1693]:
#define field object for utterances
max_len = 100
TEXT = data.Field(tokenize=cleaner, batch_first=True, fix_length=max_len)

In [1694]:
#define field object for label
LABEL = data.LabelField(batch_first=True)

In [1695]:
#define a list of tuple with field objects
fields = [('utterances',TEXT),('tags', LABEL)]

In [1696]:
#reading the dataset
training_data = data.TabularDataset(path = 'stack.csv', format = 'csv', fields = fields, skip_header = True)

In [1697]:
print(training_data)

<torchtext.legacy.data.dataset.TabularDataset object at 0x7feae8f7b810>


In [1698]:
#print preprocessed text
print(vars(training_data.examples[0]))

{'utterances': ['plays', 'luke', 'star', 'wars', 'new', 'hope'], 'tags': 'movie.starring.actor_movie.starring.character'}


In [1699]:
train_data, valid_data = training_data.split(split_ratio=0.8, random_state = random.seed(32))

## 3.2 Text Representation

In [1700]:
#preparing the vocabulary for the text
TEXT.build_vocab(train_data, min_freq=3)

In [1701]:
#No. of unique words
len(TEXT.vocab)

343

In [1702]:
#word index
#list(TEXT.vocab.stoi.items())

In [1703]:
def fetch_text(examples):

  text=[]
  for example in examples:
    utterance = vars(example)['utterances']
    text.append(utterance)
    
  return text

In [1704]:
train_text = fetch_text(train_data)
valid_text = fetch_text(valid_data)
print(train_text)

[['release', 'date', 'vow'], ['find', 'romance', 'movies'], ['genre', 'movie'], ['find', 'movies', 'hugh', 'jackman'], ['want', 'see', 'infomation', 'movie', 'dirty', 'dancing'], ['goto', 'lionsgate'], ['played', 'esme', 'twilight'], ['dark', 'night', 'budget'], ['release', 'date', 'watch'], ['home', 'movie', 'english'], ['producer', 'amazing', 'spiderman'], ['search', 'warner', 'brothers', 'movies'], ['many', 'producers', 'movie', 'hitch'], ['find', 'rating', 'movie', 'step'], ['open', 'page', 'ferrell'], ['search', 'r', 'rated', 'movies'], ['played', 'dory', 'finding', 'nemo'], ['find', 'director', 'et'], ['audience', 'watch', 'black', 'swan'], ['see', 'cast', 'crew', 'movie', 'july'], ['producer', 'hitch'], ['want', 'watch', 'movies', 'germany'], ['show', 'movies', 'produced', 'stephen', 'spielberg'], ['canadian', 'movies'], ['please', 'search', 'movies', 'god'], ['please', 'tell', 'karate', 'kid', 'released'], ['finding', 'nemo'], ['made', 'field', 'dreams'], ['barfi'], ['age', 'bl

In [1705]:
def convert2seq(text):
  
  #padding
  text = TEXT.pad(text)
  
  #converting to numbers
  text = TEXT.numericalize(text)
  
  return text

In [1706]:
X_train = convert2seq(train_text)
X_valid = convert2seq(valid_text)

In [1707]:
X_train[0]

tensor([44, 35,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [1708]:
X_train.shape, X_valid.shape

(torch.Size([1753, 100]), torch.Size([438, 100]))

In [1709]:
def fetch_tags(data):
  tags=[]
  for example in data.examples:
    tags.append(vars(example)['tags'])
  return tags

In [1710]:
train_tags = fetch_tags(train_data)
valid_tags = fetch_tags(valid_data)

In [1711]:
train_tags[:5]

['movie.initial_release_date',
 'movie.genre',
 'movie.genre',
 'movie.starring.actor',
 'none']

In [1712]:
#preparing the output labels 
train_tags_list=[i.split(",") for i in train_tags]
valid_tags_list=[i.split(",") for i in valid_tags]

In [1713]:
mlb= MultiLabelBinarizer()
mlb.fit(train_tags_list)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [1714]:
mlb.classes_

array(['actor.gender_movie.starring.actor', 'movie.country',
       'movie.country_movie.language', 'movie.directed_by',
       'movie.directed_by_movie.initial_release_date',
       'movie.directed_by_movie.starring.actor', 'movie.estimated_budget',
       'movie.genre', 'movie.gross_revenue', 'movie.initial_release_date',
       'movie.initial_release_date_movie.rating',
       'movie.initial_release_date_movie.starring.actor',
       'movie.language', 'movie.produced_by',
       'movie.production_companies', 'movie.rating',
       'movie.starring.actor',
       'movie.starring.actor_movie.starring.character', 'movie.subjects',
       'none'], dtype=object)

In [1715]:
y_train  = mlb.transform(train_tags_list)
y_valid  = mlb.transform(valid_tags_list)

In [1716]:
y_train.shape, y_valid.shape

((1753, 20), (438, 20))

In [1717]:
type(y_train)

numpy.ndarray

In [1718]:
y_train = torch.FloatTensor(y_train)
y_valid = torch.FloatTensor(y_valid)

In [1719]:
type(y_train)

torch.Tensor

# 4. Model Building

 ## 4.1 Model Architecture

In [1720]:
# define embedding layer
emb = Embedding(num_embeddings=len(TEXT.vocab), embedding_dim=50)

In [1721]:
X_train[:1].shape

torch.Size([1, 100])

In [1722]:
# check sample input
sample_embedding = emb(X_train[:1])

In [1723]:
sample_embedding.shape

torch.Size([1, 100, 50])

In [1724]:
#define a rnn
rnn = RNN(input_size=50, hidden_size=256, batch_first=True, nonlinearity='relu')

In [1725]:
#pass the input to rnn
hidden_states,last_hidden_state = rnn(sample_embedding)

In [1726]:
#Hidden state of every timestep (Batch, seq_len, no. of hidden neurons)
hidden_states.shape

torch.Size([1, 100, 256])

In [1727]:
#output shape of last hidden timestep
last_hidden_state.shape

torch.Size([1, 1, 256])

In [1728]:
#reshaping the hidden states
reshaped = hidden_states.reshape(hidden_states.size(0),-1)
reshaped.shape

torch.Size([1, 25600])

In [1729]:
# Define Model Architecture

# Input
# Embedding(embedding_dim=50)
# RNN(128)
# Linear(128, 'relu')
# Linear(10, 'sigmoid')

class Net(nn.Module):
    
    #define all the layers used in model
    def __init__(self):
        
        #Constructor
        super(Net, self).__init__()   
        
        self.rnn_layer = nn.Sequential(
            
            #embedding layer [batch_size,vocab_size]
            Embedding(num_embeddings=len(TEXT.vocab), embedding_dim=50),
        
            #rnn layer [batch_size,100,128]
            RNN(input_size=50, hidden_size=256, nonlinearity='relu',batch_first=True)
          
            )

        self.dense_layer = nn.Sequential(
            
            #[batch_size,100*128]
            Linear(25600, 256),
            LeakyReLU(),
            #ReLU(),

            #[batch_size,128]
            Linear(256,20),
            
            #[batch_size,10]
            Sigmoid()

        )

    def forward(self, x):
        
        #rnn layer
        hidden_states, last_hidden_state = self.rnn_layer(x)

        #reshaping
        hidden_states = hidden_states.reshape(hidden_states.size(0),-1)

        #dense layer
        outputs=self.dense_layer(hidden_states)
        
        return outputs

In [1730]:
#define the model
model = Net()

In [1731]:
#model layers
model

Net(
  (rnn_layer): Sequential(
    (0): Embedding(343, 50)
    (1): RNN(50, 256, batch_first=True)
  )
  (dense_layer): Sequential(
    (0): Linear(in_features=25600, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=256, out_features=20, bias=True)
    (3): Sigmoid()
  )
)

In [1732]:
#pass an text to the model to understand the output
#deactivates autograd
with torch.no_grad():
  pred = model(X_train[:1])
  print(pred)

tensor([[0.5272, 0.4884, 0.5029, 0.4714, 0.5037, 0.4978, 0.5087, 0.4964, 0.4896,
         0.5119, 0.4920, 0.4986, 0.4958, 0.4955, 0.4716, 0.4545, 0.5071, 0.4891,
         0.5105, 0.4738]])


In [1733]:
#define optimizer and loss
optimizer = torch.optim.Adam(model.parameters())
criterion = BCELoss()

# checking if GPU is available
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

## 4.2 Model Training

In [1734]:
# define training function
def train(X,y,batch_size):

  #activate training phase
  model.train()
  
  #initialization
  epoch_loss= 0
  no_of_batches = 0

  #randomly create indices
  indices= torch.randperm(len(X))
  
  #loading in batches
  for i in range(0,len(indices),batch_size):
    
    #indices for a batch
    ind = indices[i:i+batch_size]
  
    #batch  
    batch_x=X[ind]
    batch_y=y[ind]
    
    #push to cuda
    if torch.cuda.is_available():
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

    #clear gradients
    optimizer.zero_grad()
          
    #forward pass
    outputs = model(batch_x)

    #converting to a 1 dimensional tensor
    outputs = outputs.squeeze()

    #calculate loss and accuracy
    loss = criterion(outputs, batch_y)
    
    #Backward pass
    loss.backward()
    
    #Update weights
    optimizer.step()

    #Keep track of the loss and accuracy of a epoch
    epoch_loss = epoch_loss + loss.item()

    #No. of batches
    no_of_batches = no_of_batches+1

  return epoch_loss/no_of_batches

In [1735]:
# define evaluation function
def evaluate(X,y,batch_size):

  #deactivate training phase
  model.eval()

  #initialization
  epoch_loss = 0
  no_of_batches = 0

  #randomly create indices
  indices= torch.randperm(len(X))

  #deactivates autograd
  with torch.no_grad():
    
    #loading in batches
    for i in range(0,len(indices),batch_size):
      
      #indices for a batch
      ind = indices[i:i+batch_size]
  
      #batch  
      batch_x= X[ind]
      batch_y= y[ind]

      #push to cuda
      if torch.cuda.is_available():
          batch_x, batch_y = batch_x.cuda(), batch_y.cuda()
        
      #Forward pass
      outputs = model(batch_x)

      #converting the output to 1 Dimensional tensor
      outputs = outputs.squeeze()

      # Calculate loss and accuracy
      loss = criterion(outputs, batch_y)
      
      #keep track of loss and accuracy of an epoch
      epoch_loss = epoch_loss + loss.item()

      #no. of batches
      no_of_batches = no_of_batches + 1

    return epoch_loss/no_of_batches

In [1736]:
# define prediction function
def predict(X,batch_size):
  
  #deactivate training phase
  model.eval()

  # initialization 
  predictions = []

  # create indices
  indices = torch.arange(len(X))

  #deactivates autograd
  with torch.no_grad():
      
      for i in range(0, len(X), batch_size):
        
        #indices for a batch
        ind = indices[i:i+batch_size]

        # batch
        batch_x = X[ind]

        #push to cuda
        if torch.cuda.is_available():
            batch_x = batch_x.cuda()

        #Forward pass
        outputs = model(batch_x)

        #converting the output to 1 Dimensional tensor
        outputs = outputs.squeeze()

        # convert to numpy array
        prediction = outputs.data.cpu().numpy()
        predictions.append(prediction)
    
  # convert to single numpy array
  predictions = np.concatenate(predictions, axis=0)
    
  return predictions

In [1737]:
N_EPOCHS = 300
batch_size = 32

# intialization
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss   = train(X_train, y_train, batch_size)
    
    #evaluate the model
    valid_loss   = evaluate(X_valid, y_valid, batch_size)

    print('\nEpoch :',epoch,
          'Training loss:',round(train_loss,4),
          '\tValidation loss:',round(valid_loss,4))

    #save the best model
    if best_valid_loss >= valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt') 
        print("\n----------------------------------------------------Saved best model------------------------------------------------------------------")   


Epoch : 0 Training loss: 0.19 	Validation loss: 0.1483

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 1 Training loss: 0.1356 	Validation loss: 0.1165

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 2 Training loss: 0.0926 	Validation loss: 0.0977

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 3 Training loss: 0.0736 	Validation loss: 0.0838

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 4 Training loss: 0.0877 	Validation loss: 0.0808

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 5 Training loss: 0.0476

# 5. Model Evaluation

## 5.1 Check Performance

In [1738]:
#load weights of best model
path='saved_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [1739]:
#predict probabilities
y_pred_prob = predict(X_valid, batch_size)

In [1740]:
y_pred_prob[0]

array([6.8877488e-03, 4.3003224e-02, 7.7300688e-04, 1.2996644e-02,
       7.7309897e-03, 1.3209004e-03, 2.6316705e-04, 2.1307522e-02,
       1.8413264e-02, 1.1947645e-01, 2.6186709e-03, 5.3105699e-03,
       7.3832716e-03, 2.5450474e-02, 1.3240005e-03, 2.1078223e-02,
       3.5433483e-01, 5.0122094e-01, 3.7236453e-03, 1.2049272e-02],
      dtype=float32)

In [1741]:
#actual tags
y_true = y_valid.cpu().numpy()

In [1742]:
#define candidate threshold values
threshold  = np.arange(0,0.5,0.01)
print(threshold)

[0.   0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1  0.11 0.12 0.13
 0.14 0.15 0.16 0.17 0.18 0.19 0.2  0.21 0.22 0.23 0.24 0.25 0.26 0.27
 0.28 0.29 0.3  0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4  0.41
 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49]


In [1743]:
# convert probabilities into classes or tags based on a threshold value
def classify(y_pred_prob, thresh):
  
  y_pred = []

  for i in y_pred_prob:
    temp=[]
      
    for j in i:
      if j>=thresh:
        temp.append(1)
      else:
        temp.append(0)
    
    y_pred.append(temp)

  return np.array(y_pred)

In [1744]:
score=[]

for thresh in threshold:
    
    #classes for each threshold
    y_pred = classify(y_pred_prob, thresh) 

    #convert to 1d array
    y_pred_1d    =  y_pred.ravel()
    y_true_1d    =  y_true.ravel()
 
    score.append(metrics.f1_score(y_true_1d, y_pred_1d))

In [1745]:
# find the optimal threshold
opt = threshold[score.index(max(score))]
print(opt)

0.22


In [1746]:
#predictions for optimal threshold
y_pred = classify(y_pred_prob, opt)

In [1747]:
#converting to 1D
y_pred_1d = y_pred.ravel()

#Classification report
print(metrics.classification_report(y_true_1d, y_pred_1d))

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      8322
         1.0       0.73      0.79      0.76       438

    accuracy                           0.97      8760
   macro avg       0.86      0.89      0.87      8760
weighted avg       0.98      0.97      0.98      8760



In [1748]:
#convert back to tags
y_pred_label = mlb.inverse_transform(np.array(y_pred))
y_true_label = mlb.inverse_transform(np.array(y_true))

# get all validation text
utterances = [" ".join(i) for i in valid_text]

# create a dataframe to show the data and prediction side by side
df = pd.DataFrame({'Utterances':utterances,'Actual Tags':y_true_label,'Predicted Tags':y_pred_label})

# print first five rows
df.head()

Unnamed: 0,Utterances,Actual Tags,Predicted Tags
0,played oranges,"(movie.starring.actor,)","(movie.starring.actor, movie.starring.actor_movie.starring.character)"
1,display films arts,"(movie.production_companies,)","(none,)"
2,russell crowe movies,"(movie.starring.actor,)","(movie.starring.actor,)"
3,director avatar,"(movie.directed_by,)","(movie.directed_by,)"
4,show r rated movies,"(movie.rating,)","(movie.rating,)"


## 5.2 Show Inference

In [1749]:
#raw text
text = "Actor in movie thor"

In [1750]:
#cleaning text
tokens = cleaner(text)
tokens[:5]

['actor', 'movie', 'thor']

In [1751]:
#first argument to the model is no. of samples
tokens = np.array(tokens).reshape(-1,len(tokens))
tokens.shape

(1, 3)

In [1752]:
#converting text to integer sequences
seq = convert2seq(tokens)
seq

tensor([[63,  3,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1]])

In [1753]:
#predictions
with torch.no_grad():
  if torch.cuda.is_available():
    seq = seq.cuda()
  pred_prob= model(seq)
  print(pred_prob)

tensor([[9.3083e-03, 9.7914e-03, 1.2259e-02, 9.1355e-03, 2.6653e-03, 7.1012e-04,
         1.8626e-03, 1.3562e-02, 1.0926e-02, 2.6565e-02, 1.6477e-03, 1.2186e-02,
         7.3288e-03, 3.1756e-02, 1.2675e-03, 1.5612e-03, 8.0397e-01, 3.1568e-02,
         1.4817e-02, 9.3121e-02]], device='cuda:0')


In [1754]:
#classify
pred = classify(pred_prob,opt)
pred

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [1755]:
tags  = mlb.inverse_transform(pred)[0]
tags

('movie.starring.actor',)

In [1756]:
def predict_tags(text):
  
  tokens = cleaner(text)
  
  tokens = np.array(tokens).reshape(-1,len(tokens))
  
  seq = convert2seq(tokens)
  
  with torch.no_grad():
    if torch.cuda.is_available():
      seq = seq.cuda()

  pred_prob= model(seq)
  pred = classify(pred_prob,opt)
  
  tags  = mlb.inverse_transform(pred)[0]
  
  return tags

In [None]:
import csv
output = []
outputDict = {}
pd_input = pd.read_csv('test_data.csv')
print(len(pd_input))
for i in pd_input['utterances']:
  
  text=i
  tags = predict_tags(text)
  newTags = ''.join(tags)
  output.append(newTags)

csv_file =  open("outputFileRNN.csv", 'w')
csv_writer = csv.writer(csv_file, delimiter=",")
for i in range(len(output)) :
  print(output[i])
  csv_writer.writerow([i, output[i]])


# 6. Model Building for LSTM

In [1758]:
sample_embedding.shape

torch.Size([1, 100, 50])

In [1759]:
#define an LSTM
lstm_layer = LSTM(input_size=50, hidden_size=128, batch_first=True)

In [1760]:
#pass the input to LSTM
hidden_states, (last_hidden_state,last_cell_state) = lstm_layer(sample_embedding)

In [1761]:
#Hidden state of every timestep (Batch, seq_len, no. of hidden neurons)
hidden_states.shape

torch.Size([1, 100, 128])

In [1762]:
#output shape of last hidden timestep
last_hidden_state.shape

torch.Size([1, 1, 128])

In [1763]:
#output shape of last cell state
last_cell_state.shape

torch.Size([1, 1, 128])

In [1764]:
#reshaping the hidden states
reshaped = hidden_states.reshape(hidden_states.size(0),-1)
reshaped.shape

torch.Size([1, 12800])

In [1765]:
# Define Model Architecture

# Input
# Embedding(embedding_dim=100)
# LSTM(128)
# Linear(128, 'relu')
# Linear(10, 'sigmoid')

class Net(nn.Module):
    
    #Constructor
    def __init__(self):

        #Constructor
        super(Net, self).__init__()   
  
        #rnn block
        self.lstm_layer = Sequential(
            
            #embedding layer
            Embedding(num_embeddings=len(TEXT.vocab), embedding_dim=50),
        
            #lstm layer
            LSTM(input_size=50, hidden_size=128, batch_first=True)
          
            )

        #dense block
        self.dense_layer = Sequential(
            
            Linear(12800,128),
            LeakyReLU(),
            #ReLU(),

            Linear(128,20),
            
            Sigmoid()

        )
    
    #forward pass
    def forward(self, x):
        
        #rnn layer
        hidden_states, (last_hidden_state,last_cell_state) = self.lstm_layer(x)

        #flattening
        hidden_states = hidden_states.reshape(hidden_states.size(0),-1)
        
        #dense layer
        outputs=self.dense_layer(hidden_states)
        
        return outputs

In [1766]:
#define the model
model = Net()

In [1767]:
#layers of the model
model

Net(
  (lstm_layer): Sequential(
    (0): Embedding(343, 50)
    (1): LSTM(50, 128, batch_first=True)
  )
  (dense_layer): Sequential(
    (0): Linear(in_features=12800, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=20, bias=True)
    (3): Sigmoid()
  )
)

In [1768]:
#pass an text to the model to understand the output

#deactivates autograd
with torch.no_grad():
  pred = model(X_train[:1])
  print(pred)

tensor([[0.4809, 0.5019, 0.5120, 0.4835, 0.5182, 0.5298, 0.4877, 0.4745, 0.4796,
         0.5127, 0.4854, 0.4647, 0.5099, 0.4949, 0.5131, 0.4916, 0.4890, 0.5320,
         0.5086, 0.5067]])


In [1769]:
#define optimizer and loss
optimizer = torch.optim.Adamax(model.parameters())
criterion = BCELoss()

# checking if GPU is available
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

In [1770]:
N_EPOCHS = 300
batch_size = 32

# intialization
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss   = train(X_train, y_train, batch_size)
    
    #evaluate the model
    valid_loss   = evaluate(X_valid, y_valid, batch_size)

    print('\nEpoch :',epoch,
          'Training loss:',round(train_loss,4),
          '\tValidation loss:',round(valid_loss,4))

    #save the best model 
    if best_valid_loss >= valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt') 
        print("\n----------------------------------------------------Saved best model------------------------------------------------------------------")   




Epoch : 0 Training loss: 0.1926 	Validation loss: 0.1746

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 1 Training loss: 0.1693 	Validation loss: 0.1623

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 2 Training loss: 0.1506 	Validation loss: 0.1457

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 3 Training loss: 0.1328 	Validation loss: 0.1314

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 4 Training loss: 0.1148 	Validation loss: 0.1182

----------------------------------------------------Saved best model------------------------------------------------------------------

Epoch : 5 Training loss: 0.09

# 7. Model Evaluation for LSTM

In [1771]:
#load weights of best model
path='saved_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [1772]:
#predict probabilities
y_pred_prob = predict(X_valid, batch_size)

In [1773]:
y_pred_prob[0]

array([1.6386135e-03, 4.6954736e-01, 3.3682425e-02, 4.8063551e-03,
       4.0270074e-04, 3.1126809e-05, 3.4494396e-06, 1.2976742e-02,
       8.3559287e-05, 1.0617897e-02, 1.4325685e-04, 6.1897421e-03,
       9.3468424e-04, 6.7847632e-03, 2.9506878e-04, 9.3810930e-04,
       4.2290720e-01, 1.4824462e-01, 2.7378689e-02, 2.7846325e-02],
      dtype=float32)

In [1774]:
score=[]

for thresh in threshold:
    
    #classes for each threshold
    y_pred = classify(y_pred_prob, thresh) 

    #convert to 1d array
    y_pred_1d    =  y_pred.ravel()
    y_true_1d    =  y_true.ravel()
 
    score.append(metrics.f1_score(y_true_1d, y_pred_1d))

In [1775]:
# find the optimal threshold
opt = threshold[score.index(max(score))]
print(opt)  

0.48


In [1776]:
#predictions for optimal threshold
y_pred = classify(y_pred_prob, opt)

In [1777]:
#converting to 1D
y_pred_1d = y_pred.ravel()

#Classification report
print(metrics.classification_report(y_true_1d, y_pred_1d))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      8322
         1.0       0.81      0.75      0.78       438

    accuracy                           0.98      8760
   macro avg       0.90      0.87      0.88      8760
weighted avg       0.98      0.98      0.98      8760



In [1778]:
y_pred_label = mlb.inverse_transform(np.array(y_pred))

In [1779]:
df = pd.DataFrame({'utterances':utterances,'actual':y_true_label,'predictions':y_pred_label})

In [1780]:
df.head()

Unnamed: 0,utterances,actual,predictions
0,played oranges,"(movie.starring.actor,)",()
1,display films arts,"(movie.production_companies,)",()
2,russell crowe movies,"(movie.starring.actor,)",()
3,director avatar,"(movie.directed_by,)","(movie.directed_by,)"
4,show r rated movies,"(movie.rating,)","(movie.rating,)"


In [1781]:
import csv
output = []
outputDict = {}
pd_input = pd.read_csv('test_data.csv')
for i in pd_input['utterances']:
  tags = predict_tags(i)
  newTags = ''.join(tags)
  output.append(newTags)

csv_file =  open("outputFileLSTM.csv", 'w')
csv_writer = csv.writer(csv_file, delimiter=",")
for i in range(len(output)) :
  print(output[i])
  print(len(output[i]))
  csv_writer.writerow([i, output[i]])


none
4
none
4
movie.starring.actor
20
none
4
none
4
movie.starring.actor
20

0
movie.starring.actor
20
movie.starring.actor
20
none
4
none
4

0
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
none
4
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
none
4
movie.genre
11
movie.country_movie.language
28
none
4
none
4

0
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
movie.starring.actor
20
none
4
movie.starring.actor
20
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.gross_revenue
19
movie.estimated_budget
22
movie.directed_by
17
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
movie.estimated_budget
22
mov