## Generate Vocabulary

In [1]:
import pickle

class Vocabulary:
    """Token <-> index vocabulary with reserved special tokens.

    Indices 0-4 are reserved for PAD, SOS, EOS, OOV and SPACE; tokens added
    through `add_token` receive consecutive indices after them.  Once
    `truncate_vocabulary` has been called, `to_token`, `to_index` and
    `len_frequent_vocab` operate on the reduced ("frequent") vocabulary
    instead of the full one.
    """
    PAD_token = 0   # Used for padding short sentences
    SOS_token = 1   # Start-of-sentence token
    EOS_token = 2   # End-of-sentence token
    OOV_Token = 3   # Out Of Vocabulary token
    SPACE = 4 # Vocabulary token for SPACE

    # Reserved index -> marker string, shared by the full and the frequent vocabulary.
    _SPECIAL_TOKENS = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS",
                       OOV_Token: "OOV", SPACE: "SPACE"}
    # Reverse lookup (marker string -> reserved index) used by `to_index`.
    _SPECIAL_INDICES = {name: index for index, name in _SPECIAL_TOKENS.items()}

    def __init__(self, name):
        """Create an empty vocabulary that contains only the reserved tokens.

        Args:
            name: Free-form label stored on the instance and in saved data.
        """
        self.name = name
        self.token2index = {}
        self.token2count = {}
        self.index2token = dict(self._SPECIAL_TOKENS)
        self._num_constants = len(self.index2token)

        self.num_tokens = len(self.index2token)
        self.num_sentences = 0
        self.longest_sentence = 0
        # Set by `truncate_vocabulary`; None means "use the full vocabulary".
        self.frequent = None

    def add_token(self, token):
        """Register one occurrence of `token`, assigning a new index on first sight."""
        if token not in self.token2index:
            # First entry of the token into the vocabulary
            self.token2index[token] = self.num_tokens
            self.token2count[token] = 1
            self.index2token[self.num_tokens] = token
            self.num_tokens += 1
        else:
            # Token exists; increase its count
            self.token2count[token] += 1

    def add_token_list(self, token_list):
        """Add every token of one sentence and update the sentence statistics.

        Args:
            token_list: Iterable of tokens forming a single sentence.
        """
        sentence_len = 0
        for token in token_list:
            sentence_len += 1
            self.add_token(token)
        self.longest_sentence = max(self.longest_sentence, sentence_len)
        # Count the number of sentences
        self.num_sentences += 1

    def to_token(self, index):
        """Return the token stored at `index` in the active vocabulary, or None."""
        if self.frequent:
            return self.frequent['index2token'].get(index, None)
        return self.index2token.get(index, None)

    def to_index(self, word):
        """Return the index of `word` in the active vocabulary.

        Lookup order: the active token mapping (frequent one if truncated,
        otherwise the full one), then the reserved marker strings
        ("PAD", "SOS", "EOS", "OOV", "SPACE"), and finally `OOV_Token`.

        The reserved strings are resolved so that the markers returned by
        `to_token(SOS_token)` / `to_token(EOS_token)` map back to their own
        index instead of collapsing to OOV.
        """
        mapping = self.frequent['token2index'] if self.frequent else self.token2index
        index = mapping.get(word)
        if index is not None:
            return index
        return self._SPECIAL_INDICES.get(word, self.OOV_Token)

    def token_count(self, token):
        """Return how many times `token` was added, or None if it was never seen."""
        return self.token2count.get(token, None)

    def truncate_vocabulary(self, frequency=5):
        """Build the reduced vocabulary of tokens seen more than `frequency` times.

        Kept tokens are re-indexed consecutively, most frequent first, starting
        right after the reserved indices.  The full vocabulary is left untouched.

        Args:
            frequency: Exclusive lower bound on a token's count for it to be kept.
        """
        # Stable sort: tokens with equal counts keep their insertion order.
        by_count = sorted(self.token2count.items(), key=lambda item: item[1], reverse=True)
        token2index = {}
        index2token = {}
        next_index = self._num_constants
        for token, count in by_count:
            if count <= frequency:
                # Sorted in descending order, so no later token can qualify.
                break
            token2index[token] = next_index
            index2token[next_index] = token
            next_index += 1

        index2token.update(self._SPECIAL_TOKENS)

        self.frequent = {'token2index': token2index, 'index2token': index2token}

    def len_frequent_vocab(self):
        """Return the size of the reduced vocabulary (reserved tokens included), or None if not truncated."""
        return len(self.frequent['index2token']) if self.frequent else None

    def __len__(self):
        """Return the size of the full vocabulary, reserved tokens included."""
        return len(self.index2token)

    def save_data(self, file_path):
        """Pickle the vocabulary state (as a plain dict) to `file_path`."""
        data = {'frequent': self.frequent,
                'name': self.name,
                'token2index': self.token2index,
                'token2count': self.token2count,
                'index2token': self.index2token,
                'num_constants': self._num_constants,
                'num_tokens': self.num_tokens,
                'num_sentences': self.num_sentences,
                'longest_sentence': self.longest_sentence
        }
        # Context manager guarantees the handle is closed even if pickling fails.
        with open(file_path, 'wb') as file:
            pickle.dump(data, file)

    def load_data(self, file_path):
        """Restore the state written by `save_data` from `file_path`.

        SECURITY: unpickling can execute arbitrary code; only load files that
        were produced by a trusted `save_data` call.

        Raises:
            KeyError: If the pickled dict lacks one of the expected keys.
        """
        with open(file_path, 'rb') as file:
            data = pickle.load(file)
        self.frequent = data['frequent']
        self.name = data['name']
        self.token2index = data['token2index']
        self.token2count = data['token2count']
        self.index2token = data['index2token']
        self._num_constants = data['num_constants']

        self.num_tokens = data['num_tokens']
        self.num_sentences = data['num_sentences']
        self.longest_sentence = data['longest_sentence']


## Setup


In [2]:
import ast
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras





# Connect Drive

In [3]:

# Mount Google Drive at /content/gdrive (requires the google.colab package,
# i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# Change this path to your google drive (project directory) 
%cd /content/gdrive/MyDrive/nlp

/content/gdrive/MyDrive/nlp


# Load Data

In [5]:
# Training split; the path is relative to the current working directory.
file_path = 'train.csv'

df  = pd.read_csv(file_path)
# Preview the first two rows.
df.head(2)

Unnamed: 0.1,Unnamed: 0,sourceText,targetText,sourceLineText,targetLineText,lineNums_Text,sourceTokens,targetTokens,sourceLineTokens,targetLineTokens
0,0,#include<stdio.h>\n\nint check_prime(int num)\...,#include<stdio.h>\n\nint check_prime(int num)\...,if ( num % i != = 0 ) \n,if ( num % i == 0 ) \n,10,"[['#include', '<stdio.h>'], [], ['int', 'check...","[['#include', '<stdio.h>'], [], ['int', 'check...","['if', '(', 'num', '%', 'i', '!=', '=', '0', ')']","['if', '(', 'num', '%', 'i', '==', '0', ')']"
1,1,"#include <stdio.h>\nstruct point{\n int x,y...","#include <stdio.h>\nstruct point{\n int x,y...",\n,} \n,39,"[['#include', ' <stdio.h>'], ['struct', 'point...","[['#include', ' <stdio.h>'], ['struct', 'point...",[],['}']


In [6]:
# Validation split; note that `file_path` is rebound here.
file_path = 'valid.csv'

df1  = pd.read_csv(file_path)
# Preview the first two rows.
df1.head(2)

Unnamed: 0.1,Unnamed: 0,sourceText,targetText,sourceLineText,targetLineText,lineNums_Text,sourceTokens,targetTokens,sourceLineTokens,targetLineTokens
0,14643,#include <stdio.h>\n#include <stdlib.h>\nint f...,#include <stdio.h>\n#include <stdlib.h>\nint f...,if ( ( factorial ( x ) >= n1 ) && ( facdtorial...,if ( ( factorial ( x ) >= n1 ) && ( factorial ...,18,"[['#include', ' <stdio.h>'], ['#include', ' <s...","[['#include', ' <stdio.h>'], ['#include', ' <s...","['if', '(', '(', 'factorial', '(', 'x', ')', '...","['if', '(', '(', 'factorial', '(', 'x', ')', '..."
1,14644,#include<stdio.h>\n#include<stdlib.h>\nint mai...,#include<stdio.h>\n#include<stdlib.h>\nint mai...,"scanf ( ""%d"" , & a [ i ] ) ; \n","scanf ( ""%d"" , & a [ 0 ] ) ; \n",9,"[['#include', '<stdio.h>'], ['#include', '<std...","[['#include', '<stdio.h>'], ['#include', '<std...","['scanf', '(', '""%d""', ',', '&', 'a', '[', 'i'...","['scanf', '(', '""%d""', ',', '&', 'a', '[', '0'..."


In [7]:
# Source / target line-token columns of the training frame.
sourceLineTokens, targetLineTokens = df['sourceLineTokens'], df['targetLineTokens']

print(f'len(sourceLineTokens): {len(sourceLineTokens)} \nlen(targetLineTokens): {len(targetLineTokens)}')

len(sourceLineTokens): 14643 
len(targetLineTokens): 14643


In [8]:
# Source / target line-token columns of the validation frame.
sourceLineTokensVal, targetLineTokensVal = df1['sourceLineTokens'], df1['targetLineTokens']

print(f'len(sourceLineTokensVal): {len(sourceLineTokensVal)} \nlen(targetLineTokensVal): {len(targetLineTokensVal)}')

len(sourceLineTokensVal): 2584 
len(targetLineTokensVal): 2584


# Data Exploration

In [9]:
# Each cell is read from the CSV as a string holding a Python list literal,
# not as an actual list (see the printed type).
print(type(sourceLineTokens[0]))
sourceLineTokens[0] 

<class 'str'>


"['if', '(', 'num', '%', 'i', '!=', '=', '0', ')']"

In [10]:
# Convert each list-literal string to a real list.
# Entries that are no longer strings are passed through unchanged, so the cell
# can be re-run safely after `sourceLineTokens` has already been converted
# (ast.literal_eval raises on a list argument).
sourceLineTokens = [
    ast.literal_eval(src) if isinstance(src, str) else src
    for src in sourceLineTokens
]
type(sourceLineTokens[0])

list

In [11]:
# Convert each list-literal string to a real list.
# Entries that are no longer strings are passed through unchanged, so the cell
# can be re-run safely after `sourceLineTokensVal` has already been converted
# (ast.literal_eval raises on a list argument).
sourceLineTokensVal = [
    ast.literal_eval(src) if isinstance(src, str) else src
    for src in sourceLineTokensVal
]
type(sourceLineTokensVal[0])

list

In [12]:
# First training source line, now as a list of tokens.
sourceLineTokens[0]

['if', '(', 'num', '%', 'i', '!=', '=', '0', ')']

In [13]:
# Convert each list-literal string to a real list.
# Entries that are no longer strings are passed through unchanged, so the cell
# can be re-run safely after `targetLineTokens` has already been converted
# (ast.literal_eval raises on a list argument).
targetLineTokens = [
    ast.literal_eval(src) if isinstance(src, str) else src
    for src in targetLineTokens
]
type(targetLineTokens[0])

list

In [14]:
# Convert each list-literal string to a real list.
# Entries that are no longer strings are passed through unchanged, so the cell
# can be re-run safely after `targetLineTokensVal` has already been converted
# (ast.literal_eval raises on a list argument).
targetLineTokensVal = [
    ast.literal_eval(src) if isinstance(src, str) else src
    for src in targetLineTokensVal
]
type(targetLineTokensVal[0])

list

In [15]:
# First training target line, as a list of tokens.
targetLineTokens[0]

['if', '(', 'num', '%', 'i', '==', '0', ')']

# Preprocessing

In [16]:
# Fresh vocabulary for the source (encoder-side) tokens; holds only the reserved tokens for now.
input_vocabulary = Vocabulary('input_vocabulary') 

In [17]:
# Feed every training source line into the vocabulary.
# NOTE: add_token_list accumulates counts, so re-running this cell without
# re-creating `input_vocabulary` first counts every sentence and token twice.
for token_list in sourceLineTokens:
  input_vocabulary.add_token_list(token_list)


In [18]:
# Number of token lists added so far (one per training row).
input_vocabulary.num_sentences

14643

In [23]:
# Length, in tokens, of the longest source line seen (SOS/EOS markers not included).
input_vocabulary.longest_sentence

154

In [19]:
# Size of the full vocabulary, including the reserved tokens.
input_vocabulary.num_tokens

4440

In [None]:
# Raw token -> occurrence-count mapping (displays the whole dict).
input_vocabulary.token2count

# Exploring the built vocabulary to select a frequency threshold





In [None]:
# Token counts of the input vocabulary, ordered from most to least frequent.
sorted_vocabulary = dict(
    sorted(
        input_vocabulary.token2count.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
sorted_vocabulary

In [24]:
# Keep only the tokens that occur more than 30 times and report how many remain.
reduced_sorted_vocab = {
    token: count
    for token, count in sorted_vocabulary.items()
    if count > 30
}
len(reduced_sorted_vocab)

238

In [None]:
# Tokens (with counts) that pass the > 30 occurrences filter.
reduced_sorted_vocab

# Truncating the size of the vocabulary using frequency

In [26]:
# Reduce the vocabulary size instead of normalizing tokens: keep only tokens
# that occur more than 30 times. From here on to_index/to_token use the
# reduced ("frequent") vocabulary.
input_vocabulary.truncate_vocabulary(30)

In [27]:

# Size of the reduced input vocabulary: kept tokens plus the 5 reserved tokens.
input_vocabulary.len_frequent_vocab()

243

In [None]:
# Reduced vocabulary: dict with the 'token2index' and 'index2token' mappings.
input_vocabulary.frequent

# Target vocabulary construction

In [29]:
# Fresh vocabulary for the target (decoder-side) tokens; holds only the reserved tokens for now.
target_vocabulary = Vocabulary('target_vocabulary')

In [30]:
# Feed every training target line into the vocabulary.
# NOTE: add_token_list accumulates counts, so re-running this cell without
# re-creating `target_vocabulary` first counts every sentence and token twice.
for token_list in targetLineTokens:
  target_vocabulary.add_token_list(token_list)


In [31]:
# Size of the full target vocabulary (reserved tokens included), before truncation.
len(target_vocabulary)

3852

In [32]:
# Reduce the vocabulary size: keep only tokens that occur more than 30 times
# (same threshold as for the input vocabulary).
target_vocabulary.truncate_vocabulary(30)

In [33]:

# Size of the reduced target vocabulary: kept tokens plus the 5 reserved tokens.
target_vocabulary.len_frequent_vocab()

238

# Encodings

In [34]:
# Reduced vocabulary sizes: input first, then target, one per line.
print(
    input_vocabulary.len_frequent_vocab(),
    target_vocabulary.len_frequent_vocab(),
    sep="\n",
)

243
238


In [35]:
# Constants, used for model training and validation as well.
# The token counts are the widths of the one-hot vectors and come from the
# reduced vocabularies (reserved tokens included).
num_encoder_tokens = input_vocabulary.len_frequent_vocab() #243
num_decoder_tokens = target_vocabulary.len_frequent_vocab() #238
# Fixed sequence window, in tokens. Sequences longer than this are cut when the
# tensors are populated (the longest source line reported above is 154 tokens).
max_encoder_seq_length =  100 
max_decoder_seq_length =  100


In [36]:
# Save both vocabularies (full and reduced mappings) as pickle files in the
# current working directory so they can be reloaded for validation.
input_vocabulary.save_data('input_vocabulary.pickle')
target_vocabulary.save_data('target_vocabulary.pickle')

In [36]:
# Summary of the dataset size and the encoding dimensions.
for label, value in (
    ("Number of samples:", len(sourceLineTokens)),
    ("Number of unique input tokens:", num_encoder_tokens),
    ("Number of unique output tokens:", num_decoder_tokens),
    ("Max sequence length for inputs:", max_encoder_seq_length),
    ("Max sequence length for outputs:", max_decoder_seq_length),
):
    print(label, value)

Number of samples: 14643
Number of unique input tokens: 243
Number of unique output tokens: 238
Max sequence length for inputs: 100
Max sequence length for outputs: 100


In [37]:
# Initializing lists that will hold the marker-wrapped token sequences
# (train split first, validation split second).
input_texts = []
target_texts = []

input_texts_val = []
target_texts_val = []



In [38]:

def wrap_with_markers(token_lists, vocabulary):
    """Return a new list in which every token list is framed by SOS ... EOS.

    The marker strings are taken from `vocabulary` so that each side (source /
    target) uses the markers of its own vocabulary.
    """
    sos = vocabulary.to_token(Vocabulary.SOS_token)
    eos = vocabulary.to_token(Vocabulary.EOS_token)
    return [[sos] + tokens + [eos] for tokens in token_lists]


# Build the lists by assignment rather than by appending to lists created in
# another cell, so re-running this cell does not duplicate the data.
input_texts = wrap_with_markers(sourceLineTokens, input_vocabulary)
target_texts = wrap_with_markers(targetLineTokens, target_vocabulary)

input_texts_val = wrap_with_markers(sourceLineTokensVal, input_vocabulary)
target_texts_val = wrap_with_markers(targetLineTokensVal, target_vocabulary)

# Source and target rows are paired one-to-one.
assert len(input_texts) == len(target_texts)
assert len(input_texts_val) == len(target_texts_val)
    

In [39]:
# Number of sequences per list: train input/target, then validation input/target.
for texts in (input_texts, target_texts, input_texts_val, target_texts_val):
    print(len(texts))

14643
14643
2584
2584


In [40]:

# Zero-initialised float32 tensors of shape (samples, timesteps, vocabulary size)
# for the encoder input, the decoder input and the decoder target.
train_rows, val_rows = len(input_texts), len(input_texts_val)
encoder_dims = (max_encoder_seq_length, num_encoder_tokens)
decoder_dims = (max_decoder_seq_length, num_decoder_tokens)

# Training split
encoder_input_data = np.zeros((train_rows, *encoder_dims), dtype="float32")
decoder_input_data = np.zeros((train_rows, *decoder_dims), dtype="float32")
decoder_target_data = np.zeros((train_rows, *decoder_dims), dtype="float32")

# Validation split
encoder_input_data_val = np.zeros((val_rows, *encoder_dims), dtype="float32")
decoder_input_data_val = np.zeros((val_rows, *decoder_dims), dtype="float32")
decoder_target_data_val = np.zeros((val_rows, *decoder_dims), dtype="float32")

In [41]:
# Populate Train Data
# Every timestep of every sequence receives exactly one 1.0: the token's index
# inside the sequence, PAD after its end.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # Encoder input: tokens cut to the fixed window, then PAD.
    encoder_tokens = input_text[:max_encoder_seq_length]
    for t, token in enumerate(encoder_tokens):
        encoder_input_data[i, t, input_vocabulary.to_index(token)] = 1.0
    encoder_input_data[i, len(encoder_tokens):, input_vocabulary.PAD_token] = 1.0

    # Decoder input: tokens cut to the fixed window, then PAD.
    decoder_tokens = target_text[:max_decoder_seq_length]
    for t, token in enumerate(decoder_tokens):
        decoder_input_data[i, t, target_vocabulary.to_index(token)] = 1.0
    decoder_input_data[i, len(decoder_tokens):, target_vocabulary.PAD_token] = 1.0

    # Decoder target: the decoder input shifted left by one timestep, i.e. it
    # does not include the start token. Slicing up to max + 1 also fills the
    # last timestep when the sequence fills or exceeds the window (it was
    # previously left all-zero in that case).
    shifted_tokens = target_text[1:max_decoder_seq_length + 1]
    for t, token in enumerate(shifted_tokens):
        decoder_target_data[i, t, target_vocabulary.to_index(token)] = 1.0
    decoder_target_data[i, len(shifted_tokens):, target_vocabulary.PAD_token] = 1.0


In [42]:
# Populate Val Data
# Every timestep of every sequence receives exactly one 1.0: the token's index
# inside the sequence, PAD after its end.
for i, (input_text, target_text) in enumerate(zip(input_texts_val, target_texts_val)):
    # Encoder input: tokens cut to the fixed window, then PAD.
    encoder_tokens = input_text[:max_encoder_seq_length]
    for t, token in enumerate(encoder_tokens):
        encoder_input_data_val[i, t, input_vocabulary.to_index(token)] = 1.0
    encoder_input_data_val[i, len(encoder_tokens):, input_vocabulary.PAD_token] = 1.0

    # Decoder input: tokens cut to the fixed window, then PAD.
    decoder_tokens = target_text[:max_decoder_seq_length]
    for t, token in enumerate(decoder_tokens):
        decoder_input_data_val[i, t, target_vocabulary.to_index(token)] = 1.0
    decoder_input_data_val[i, len(decoder_tokens):, target_vocabulary.PAD_token] = 1.0

    # Decoder target: the decoder input shifted left by one timestep, i.e. it
    # does not include the start token. Slicing up to max + 1 also fills the
    # last timestep when the sequence fills or exceeds the window (it was
    # previously left all-zero in that case).
    shifted_tokens = target_text[1:max_decoder_seq_length + 1]
    for t, token in enumerate(shifted_tokens):
        decoder_target_data_val[i, t, target_vocabulary.to_index(token)] = 1.0
    decoder_target_data_val[i, len(shifted_tokens):, target_vocabulary.PAD_token] = 1.0


## Build the model


In [43]:
# Training hyperparameters.
batch_size = 64  # Batch size for training.
epochs = 40  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space (units of both LSTM layers).

In [44]:
# Define an input sequence and process it.
# The time dimension is left unspecified (None); each timestep is a vector of
# width `num_encoder_tokens`.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the target vocabulary at every decoder timestep.
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


## Train the model


In [None]:
# Compile with categorical cross-entropy (the targets are one-hot vectors) and
# train on the training tensors, evaluating on the validation tensors.
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
# `history` is kept for the loss plots at the end of the notebook.
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val)
)


In [62]:
# Save the trained model under the relative path "model".
# NOTE(review): the recorded cell output refers to a different directory name,
# so it looks stale relative to this code; confirm before relying on it.
model.save("model")



INFO:tensorflow:Assets written to: latest_model(bothside added 50epoch)/assets


INFO:tensorflow:Assets written to: latest_model(bothside added 50epoch)/assets


In [None]:
# Print the model summary.
model.summary()

In [47]:
import matplotlib.pyplot as plt


In [None]:
# Training loss recorded by model.fit.
plt.plot(history.history['loss'])

In [None]:
# Validation loss recorded by model.fit.
plt.plot(history.history['val_loss'])