In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# let's get an idea of word frequency
from collections import Counter

# tool for text
import spacy
import re

from pandas.testing import assert_frame_equal

# Widen pandas' text output so long cell sources are not cut off too early.
pd.set_option('display.width', 180)
pd.set_option('display.max_colwidth', 120)

# Root folder of the input data; the train/ notebooks and train_orders.csv
# are read relative to it.
data_dir = Path('../input/AI4Code')

In [2]:
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame of cells.

    Args:
        path: ``pathlib.Path`` of the notebook file; its stem is stored in
            the ``id`` column of every row.

    Returns:
        DataFrame whose index is named ``cell_id``, with ``cell_type`` read
        as a categorical, ``source`` read as ``str`` and the added ``id``
        column.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_dtypes)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

In [3]:
# Paths of all training notebooks. glob() yields files in arbitrary order, so
# sort them: any positional subset taken from this table is then reproducible.
paths_train = sorted((data_dir / 'train').glob('*.json'))

# Notebook id = file name without its extension. Path.stem does not depend on
# the OS path separator and is the same id that read_notebook() stores.
nb_ids = [path.stem for path in paths_train]

# One row per notebook: where it lives and what it is called.
training_dict = {'path': paths_train, 'nb_id': nb_ids}
training_paths_df = pd.DataFrame.from_dict(training_dict)

In [4]:
# Correct cell order per notebook: a Series indexed by notebook id whose
# values are lists of cell ids (the space-separated string is split).
# read_csv(squeeze=True) was deprecated in pandas 1.4 and removed in 2.0, so
# the single remaining column is squeezed into a Series explicitly instead.
df_orders = (
    pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
    .squeeze('columns')
    .str.split()
)

In [5]:
def get_ranks(base, derived):
    """Return, for every item of ``derived``, its position in ``base``.

    A first-occurrence lookup table is built once, so the cost is linear in
    ``len(base) + len(derived)`` instead of one ``base.index`` scan per item.

    Args:
        base: sequence defining the reference order.
        derived: iterable of items to look up in ``base``.

    Returns:
        List of integer positions, aligned with ``derived``.

    Raises:
        ValueError: if an item of ``derived`` does not occur in ``base``.
    """
    try:
        first_position = {}
        for position, item in enumerate(base):
            # setdefault keeps the first occurrence, like list.index does.
            first_position.setdefault(item, position)
        return [first_position[item] for item in derived]
    except KeyError as err:
        # Keep the exception type callers of list.index would have seen.
        raise ValueError(f"{err.args[0]!r} is not in list") from None
    except TypeError:
        # Unhashable items cannot go through the dict; use the plain scan.
        return [base.index(item) for item in derived]

In [6]:
class PythonDataset(Dataset):
    """Dataset yielding one notebook per item as a DataFrame of ranked cells.

    Args:
        df: DataFrame with a ``path`` and an ``nb_id`` column, one row per
            notebook.
        df_orders: mapping (e.g. a Series) from notebook id to the list of
            cell ids in their correct order.
    """

    def __init__(self, df, df_orders):
        self.df = df
        self.df_orders = df_orders

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Read notebook ``idx`` and return its cells with rank columns.

        Returns:
            DataFrame with columns ``rank`` (position of the cell in the
            correct order), ``cell_type``, ``source`` and ``ranked_cleaned``
            (1-based running count for code cells in file order, 0 for all
            other cells).
        """
        # Read from the frames given to the constructor, not from module
        # globals, so the dataset really serves the subset it was built with.
        row = self.df.iloc[idx]
        # Retrieve a single file and convert it to a DataFrame.
        disorganized_df = read_notebook(row['path'])
        cell_order = self.df_orders[row['nb_id']]

        # Rank each cell of this notebook by its position in the true order.
        cell_ranks = get_ranks(cell_order, list(disorganized_df.index))
        disorganized_df.insert(0, 'rank', cell_ranks)

        # Select first, then copy, so the new column below is written to an
        # independent frame.
        organized_df = disorganized_df[['rank', 'cell_type', 'source']].copy()
        organized_df['ranked_cleaned'] = np.where(
            organized_df['cell_type'] == 'code',
            organized_df.groupby(['cell_type']).cumcount().to_numpy() + 1,
            0,
        )
        return organized_df

## Split Data

In [7]:
# Number of notebooks used for this experiment; raise it to use more data.
N_NOTEBOOKS = 1000
whole_train = training_paths_df.iloc[:N_NOTEBOOKS, :].copy()

In [8]:
# Lazily loads and ranks one notebook of the selected subset per item.
whole_train_ds = PythonDataset(df=whole_train, df_orders=df_orders)

In [9]:
# Small English spaCy pipeline with NER and the dependency parser disabled;
# clean_text() below only reads each token's lemma.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'parser'],
)
def clean_text(df, nlp, column):
    """Lemmatise ``df[column]`` and drop rows that end up without tokens.

    Args:
        df: DataFrame holding the raw text in ``column``.
        nlp: callable that turns a string into an iterable of tokens exposing
            a ``lemma_`` attribute (e.g. a spaCy pipeline).
        column: name of the text column to clean.

    Returns:
        New DataFrame with the original row labels moved into a column by
        ``reset_index`` and a ``cleaned`` column of space-joined lemmas. Rows
        whose text yields no lemma longer than one character are removed. The
        ``cleaned`` column is present even when no row survives.
    """
    rows = []
    for idx in range(len(df)):
        row = df.iloc[idx].copy()

        # Replace HTML line breaks, then every run of characters that is not
        # a letter or an apostrophe, with a single space; lowercase the rest.
        text = re.sub("[^A-Za-z']+", ' ', row[column].replace('<br />', ' ')).lower()
        # Let the pipeline tokenise; keep lemmas longer than one character.
        lemmas = [token.lemma_ for token in nlp(text)]
        lemmas = [lemma for lemma in lemmas if len(lemma) > 1]

        if lemmas:
            row['cleaned'] = ' '.join(lemmas)
        elif 'cleaned' not in row.index:
            # Always create the key so the column exists even if no row of
            # this frame produces a token (previously a KeyError below).
            row['cleaned'] = np.nan

        rows.append(row)

    data = pd.DataFrame(rows).reset_index()
    if 'cleaned' not in data.columns:
        # Only reachable for an empty input frame.
        data['cleaned'] = pd.Series(dtype=object)
    return data.dropna(subset=['cleaned'])

In [10]:
def find_max_length(data):
    """Return the largest whitespace-separated token count in ``data['cleaned']``.

    Returns 0 when the frame has no rows.
    """
    token_counts = [len(text.split()) for text in data['cleaned']]
    return max(token_counts, default=0)

In [11]:
def preprocess_whole_df(df, df_ds):
    """Clean every notebook served by ``df_ds`` and stack the results.

    Args:
        df: frame whose length gives the number of notebooks to process.
        df_ds: indexable dataset; ``df_ds[i]`` returns the cell DataFrame of
            notebook ``i`` with a ``source`` column.

    Returns:
        One DataFrame with the cleaned cells of all notebooks, in order, or
        ``None`` when ``df`` is empty.
    """
    # Collect the per-notebook frames and concatenate once. The earlier
    # loop-and-concat version appended the first notebook twice and copied
    # the growing frame on every iteration.
    cleaned_frames = [
        clean_text(df_ds[i], nlp, 'source')
        for i in tqdm(range(len(df)))
    ]
    if not cleaned_frames:
        return None
    return pd.concat(cleaned_frames)

In [12]:
def make_word_dict(cleaned_df, min_freq=5):
    """Build a word -> index vocabulary from the ``cleaned`` column.

    Words occurring strictly more than ``min_freq`` times get consecutive
    indices starting at 1, most frequent first. All other words map to 0,
    the shared "unknown" index.

    Args:
        cleaned_df: DataFrame with a ``cleaned`` column of space-joined tokens.
        min_freq: frequency a word must exceed to get its own index.

    Returns:
        ``(dict_length, word_dict)`` where ``dict_length`` is the number of
        distinct indices in use (highest index + 1; 1 for an empty corpus).
    """
    word_freq = Counter(
        token
        for sentence in cleaned_df['cleaned']
        for token in sentence.split(' ')
    )

    word_dict = {}
    next_index = 1  # index 0 is reserved for unknown words
    for word, freq in word_freq.most_common():
        if freq > min_freq:
            word_dict[word] = next_index
            next_index += 1
        else:
            word_dict[word] = 0

    # next_index always equals max(index) + 1, and unlike max() it is also
    # defined when the vocabulary is empty.
    return next_index, word_dict

In [13]:
# Clean all selected notebooks once, then derive the padding length and the
# vocabulary from the cleaned text.
final_train = preprocess_whole_df(whole_train, whole_train_ds)
max_length = find_max_length(final_train)
dict_length, word_dict = make_word_dict(final_train)

In [14]:
class PythonDataset2(Dataset):
    """Dataset turning cleaned cells into padded index tensors.

    Args:
        df: DataFrame with ``cleaned`` (space-joined tokens),
            ``ranked_cleaned`` and ``rank`` columns.
        word_dict: mapping from word to integer index; 0 is used for both
            padding and unknown words.
        max_length: fixed length of every returned index sequence.
    """

    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.max_len = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x, x2, y)`` for row ``idx``.

        ``x`` is an int64 tensor of length ``max_length`` (front-padded with
        0), ``x2`` the row's ``ranked_cleaned`` and ``y`` its ``rank``, both
        as float32 scalars.
        """
        row = self.df.iloc[idx]
        # Use the instance's own length (not a module-level global) and keep
        # only the first max_len tokens so over-long texts cannot produce a
        # negative padding size.
        tokens = row['cleaned'].split(' ')[:self.max_len]

        # Words missing from the vocabulary share index 0 with rare words.
        token_ids = [self.word_dict.get(token, 0) for token in tokens]

        # Front-pad directly in an integer buffer; nn.Embedding needs int64.
        padded = np.zeros(self.max_len, dtype=np.int64)
        if token_ids:
            padded[-len(token_ids):] = token_ids

        x = torch.from_numpy(padded)
        x2 = torch.tensor(float(row['ranked_cleaned']), dtype=torch.float32)
        y = torch.tensor(float(row['rank']), dtype=torch.float32)
        return x, x2, y

In [15]:
# Positional 80/20 split: the first N rows train, the remaining rows validate.
N = int(len(final_train) * 0.8)
train, val = final_train.iloc[:N], final_train.iloc[N:]

In [16]:
# Show (rows, columns) of both splits as a sanity check.
(train.shape, val.shape)

In [17]:
# Tensor datasets for both splits; they share the vocabulary and pad length.
train_ds = PythonDataset2(train, word_dict, max_length)
val_ds = PythonDataset2(val, word_dict, max_length)

# Shuffle only the training batches; keep validation order fixed.
train_dl = DataLoader(train_ds, batch_size=10, shuffle=True)
# The validation loader must wrap val_ds. It previously wrapped train_ds, so
# the reported "validation" loss was measured on training data.
val_dl = DataLoader(val_ds, batch_size=10, shuffle=False)

In [18]:
import torch.nn.functional as F


In [19]:
# Embedding + linear regressor. Despite its name the class contains no
# recurrent layer: the token embeddings are flattened and fed to nn.Linear.
class RNN(nn.Module):
    """Regress a value from a padded token sequence plus one scalar feature.

    Args:
        dict_length: number of rows of the embedding table.
        max_length: length of every input token sequence.
        emb_size: embedding dimension per token.
        hidden_size: unused; kept so existing call sites keep working.
        output_size: number of outputs of the linear layer.
    """

    def __init__(self, dict_length, max_length, emb_size, hidden_size, output_size):
        super().__init__()
        # Index 0 is the padding index, so its embedding stays zero.
        self.emb = nn.Embedding(dict_length, emb_size, padding_idx=0)
        # Input = all token embeddings side by side, plus the scalar feature.
        self.linear2 = nn.Linear((max_length * emb_size) + 1, output_size)

    def forward(self, x, x2):
        """Compute predictions.

        Args:
            x: integer tensor of token indices, shape (batch, max_length).
            x2: float tensor with one scalar feature per sample, shape (batch,).

        Returns:
            Tensor of shape (batch,) when ``output_size`` is 1, otherwise
            (batch, output_size).
        """
        x = self.emb(x)
        # (batch, max_length, emb_size) -> (batch, max_length * emb_size)
        x = x.flatten(start_dim=1)

        x2 = x2.unsqueeze(1)
        X = torch.cat((x, x2), 1)
        out = self.linear2(X)
        # Squeeze only the output dimension. A bare squeeze() also dropped the
        # batch dimension for a batch of one, which no longer matched y.
        return out.squeeze(-1)

In [20]:
# Baseline model with 10-dimensional token embeddings and a single output.
model = RNN(
    dict_length=dict_length,
    max_length=max_length,
    emb_size=10,
    hidden_size=5,
    output_size=1,
)

In [21]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    """Run the model once over ``dataloader`` and return the mean batch loss.

    Args:
        model: module called as ``model(x, x2)``.
        dataloader: iterable of ``(x, x2, y)`` batches.
        optimizer: optimizer stepped after every batch when ``backwards`` is
            true; not used otherwise.
        lossFun: callable ``lossFun(y_pred, y)`` returning a scalar tensor.
        backwards: true for a training pass (train mode, gradient updates),
            false for an evaluation pass (eval mode, no gradients).
        print_loss: when true, print the average loss before returning it.

    Returns:
        Average of the per-batch losses as a float; ``nan`` when the
        dataloader yields no batches.
    """
    if backwards:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    num_batches = 0
    # Track gradients only when they are needed; an evaluation pass then
    # avoids building the autograd graph.
    with torch.set_grad_enabled(bool(backwards)):
        for x, x2, y in dataloader:
            y_pred = model(x, x2)
            loss = lossFun(y_pred, y)
            total_loss += loss.item()
            num_batches += 1

            if backwards:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    # Guard against an empty loader instead of dividing by zero.
    avg_loss = total_loss / num_batches if num_batches else float('nan')

    if print_loss:
        print(avg_loss)

    return avg_loss

In [22]:
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [23]:
# Training setup: mean-squared-error objective optimised with Adam.
num_epochs = 5
lossFun = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)

    # One updating sweep over the training batches ...
    train_loss = one_pass(
        model, train_dl, optimizer, lossFun,
        backwards=True, print_loss=False,
    )
    # ... followed by one sweep over the validation batches without updates.
    val_loss = one_pass(
        model, val_dl, optimizer, lossFun,
        backwards=False, print_loss=False,
    )

    print('Train Loss-%.4f Validation Loss-%.4f ' % (train_loss, val_loss))

In [24]:
# Embedding + linear regressor with dropout on the embedded tokens. Despite
# its name the class contains no recurrent layer.
class RNN_Dropout(nn.Module):
    """Regress a value from a padded token sequence plus one scalar feature.

    Args:
        dict_length: number of rows of the embedding table.
        max_length: length of every input token sequence.
        emb_size: embedding dimension per token.
        hidden_size: unused; kept so existing call sites keep working.
        output_size: number of outputs of the linear layer.
    """

    def __init__(self, dict_length, max_length, emb_size, hidden_size, output_size):
        super().__init__()
        # Index 0 is the padding index, so its embedding stays zero.
        self.emb = nn.Embedding(dict_length, emb_size, padding_idx=0)
        # Input = all token embeddings side by side, plus the scalar feature.
        self.linear2 = nn.Linear((max_length * emb_size) + 1, output_size)
        self.dropout = nn.Dropout(p=.25)

    def forward(self, x, x2):
        """Compute predictions.

        Args:
            x: integer tensor of token indices, shape (batch, max_length).
            x2: float tensor with one scalar feature per sample, shape (batch,).

        Returns:
            Tensor of shape (batch,) when ``output_size`` is 1, otherwise
            (batch, output_size).
        """
        x = self.emb(x)
        # (batch, max_length, emb_size) -> (batch, max_length * emb_size)
        x = x.flatten(start_dim=1)
        # Regularise the embedded features, not the prediction. Dropout after
        # the linear layer zeroed or rescaled the regression output itself
        # while training.
        x = self.dropout(x)

        x2 = x2.unsqueeze(1)
        X = torch.cat((x, x2), 1)
        out = self.linear2(X)
        # Squeeze only the output dimension so a batch of one stays 1-D.
        return out.squeeze(-1)

In [25]:
# Same layer sizes as the baseline, using the dropout variant.
model = RNN_Dropout(
    dict_length=dict_length,
    max_length=max_length,
    emb_size=10,
    hidden_size=5,
    output_size=1,
)

In [26]:
# Fresh loss and optimiser for the dropout model.
num_epochs = 5
lossFun = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)

    # Updating sweep over the training batches.
    train_loss = one_pass(
        model, train_dl, optimizer, lossFun,
        backwards=True, print_loss=False,
    )
    # Sweep over the validation batches without parameter updates.
    val_loss = one_pass(
        model, val_dl, optimizer, lossFun,
        backwards=False, print_loss=False,
    )

    print('Train Loss-%.4f Validation Loss-%.4f ' % (train_loss, val_loss))

In [27]:
# Baseline architecture again, with 20-dimensional token embeddings.
model = RNN(
    dict_length=dict_length,
    max_length=max_length,
    emb_size=20,
    hidden_size=5,
    output_size=1,
)

In [28]:
# Fresh loss and optimiser for the wider-embedding model.
num_epochs = 5
lossFun = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)

    # Updating sweep over the training batches.
    train_loss = one_pass(
        model, train_dl, optimizer, lossFun,
        backwards=True, print_loss=False,
    )
    # Sweep over the validation batches without parameter updates.
    val_loss = one_pass(
        model, val_dl, optimizer, lossFun,
        backwards=False, print_loss=False,
    )

    print('Train Loss-%.4f Validation Loss-%.4f ' % (train_loss, val_loss))