In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# let's get an idea of word frequency
from collections import Counter

# tool for text
import spacy
import re

from pandas.testing import assert_frame_equal

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('../input/AI4Code')

In [2]:
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

In [3]:
# list of all the paths for all the training data
paths_train = list((data_dir / 'train').glob('*.json'))

# a list of all the notebook ids
nb_ids = [str(path).split('/')[-1].split('.')[0] for path in paths_train]

# create a df of the path and the notebook_id
training_dict = {'path': paths_train, 'nb_id': nb_ids}
training_paths_df = pd.DataFrame.from_dict(training_dict)

In [4]:
# getting the correct order of the cells
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()

In [5]:
idx = 0
row = training_paths_df.iloc[idx]
# retriveing a singel file and converting it to a dataframe
disorganized_df = read_notebook(row['path'])
cell_order = df_orders[row['nb_id']]

In [6]:
def get_ranks(base, derived):
    return [base.index(d) for d in derived]

In [7]:
# rank each of the cell in that specific notebook
cell_ranks = get_ranks(cell_order, list(disorganized_df.index))
# insert the ranks back into the dataframe
disorganized_df.insert(0, 'rank', cell_ranks)

organized_df = disorganized_df.copy()[['rank', 'cell_type', 'source']]

In [8]:
organized_df = disorganized_df.copy()[['rank', 'cell_type', 'source']]
organized_df['rank_cleaned'] = [0] * len(organized_df)

In [9]:
organized_df

Unnamed: 0_level_0,rank,cell_type,source,rank_cleaned
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8395ab7c,0,code,import numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\nimport uuid\nimport...,0
ebc844d6,1,code,df_train = pd.read_csv('../input/tensorflow-great-barrier-reef/train.csv')\ndf_train,0
49251f17,2,code,"def bbox_inv_iou(boxA, boxB):\n """"""Copied from: https://gist.github.com/meyerjo/dd3533edc97c81258898f60d8978eddc\...",0
3a6623e3,3,code,test_sequence_id = np.unique(df_train.sequence)[2]\nprint(test_sequence_id)\ntest_sequence_df = df_train[df_train.se...,0
24e09d1a,4,code,"seq_df_with_cots_ids, stats = find_unique_cots(\n test_sequence_df,\n dist_func=lambda boxA, boxB: bbox_center...",0
93e1713d,5,code,"best_idx, best_row, most_cots = None, None, 0\nfor idx, row in seq_df_with_cots_ids.iterrows():\n raw_annots = as...",0
d17c2682,6,code,"def load_image(video_id, video_frame, image_dir):\n img_path = f'{image_dir}/video_{video_id}/{video_frame}.jpg'\...",0
9fa9f6ca,7,code,"from tqdm.auto import tqdm\nimport subprocess\n\ndef make_video(df, video_name, image_dir):\n # partly borrowed f...",0
714e15e7,8,code,"from IPython.display import Video, display\nVideo('test_video.mp4')",0
4e5f080f,10,code,additional_columns_by_seqid = []\n\nfor sequence_id in np.unique(df_train.sequence):\n sequence_df = df_train[df_...,0


In [10]:
organized_df['ranked_cleaned'] = np.where(
organized_df['cell_type'] == 'code',
organized_df.groupby(['cell_type']).cumcount().to_numpy() + 1,
0,)

In [11]:
organized_df

Unnamed: 0_level_0,rank,cell_type,source,rank_cleaned,ranked_cleaned
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8395ab7c,0,code,import numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\nimport uuid\nimport...,0,1
ebc844d6,1,code,df_train = pd.read_csv('../input/tensorflow-great-barrier-reef/train.csv')\ndf_train,0,2
49251f17,2,code,"def bbox_inv_iou(boxA, boxB):\n """"""Copied from: https://gist.github.com/meyerjo/dd3533edc97c81258898f60d8978eddc\...",0,3
3a6623e3,3,code,test_sequence_id = np.unique(df_train.sequence)[2]\nprint(test_sequence_id)\ntest_sequence_df = df_train[df_train.se...,0,4
24e09d1a,4,code,"seq_df_with_cots_ids, stats = find_unique_cots(\n test_sequence_df,\n dist_func=lambda boxA, boxB: bbox_center...",0,5
93e1713d,5,code,"best_idx, best_row, most_cots = None, None, 0\nfor idx, row in seq_df_with_cots_ids.iterrows():\n raw_annots = as...",0,6
d17c2682,6,code,"def load_image(video_id, video_frame, image_dir):\n img_path = f'{image_dir}/video_{video_id}/{video_frame}.jpg'\...",0,7
9fa9f6ca,7,code,"from tqdm.auto import tqdm\nimport subprocess\n\ndef make_video(df, video_name, image_dir):\n # partly borrowed f...",0,8
714e15e7,8,code,"from IPython.display import Video, display\nVideo('test_video.mp4')",0,9
4e5f080f,10,code,additional_columns_by_seqid = []\n\nfor sequence_id in np.unique(df_train.sequence):\n sequence_df = df_train[df_...,0,10


In [12]:
class PythonDataset(Dataset):
    def __init__(self, df, df_orders):
        self.df = df
        self.df_orders = df_orders

    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = training_paths_df.iloc[idx]
        # retriveing a singel file and converting it to a dataframe
        disorganized_df = read_notebook(row['path'])
        cell_order = df_orders[row['nb_id']]
        
        # rank each of the cell in that specific notebook
        cell_ranks = get_ranks(cell_order, list(disorganized_df.index))
        # insert the ranks back into the dataframe
        disorganized_df.insert(0, 'rank', cell_ranks)

        organized_df = disorganized_df.copy()[['rank', 'cell_type', 'source']]
        organized_df['ranked_cleaned'] = np.where(
                                            organized_df['cell_type'] == 'code',
                                            organized_df.groupby(['cell_type']).cumcount().to_numpy() + 1,
                                            0,)
        return organized_df

## Split Data

In [13]:
whole_train = training_paths_df.iloc[:50, :].copy()

In [14]:
whole_train_ds = PythonDataset(whole_train, df_orders)

In [15]:
nlp= spacy.load('en_core_web_sm', disable = ['ner', 'parser'])
def clean_text(df, nlp, column):
    rows = []
    for idx in range(len(df)):
        row = df.iloc[idx].copy()

        # first we remove numeric characters and lowercase everything
        cleaned_review = re.sub("[^A-Za-z']+", ' ', row[column].replace('<br />', ' ')).lower()
        # we let spaCy tokenize the text for us
        tokenized_review = nlp(cleaned_review)
        cleaned_tokenized = [token.lemma_ for token in tokenized_review]
        cleaned_tokenized = [token for token in cleaned_tokenized if len(token)>1]

        if len(cleaned_tokenized) >= 1:
            row['cleaned'] = ' '.join(cleaned_tokenized)

        rows.append(row)
    data = pd.DataFrame(rows)
    data = data.reset_index()
    idx_nans = np.where(data['cleaned'].isna())[0]
    data = data.drop(idx_nans)
    return data

In [16]:
def find_max_length(data):
    max_length = 0
    for i in range(len(data)):
        row = data.iloc[i]['cleaned']
        length = len(row.split())

        if length > max_length:
            max_length = length
    return max_length

In [17]:
def preprocess_whole_df(df, df_ds):
    final_df = None

    for i in tqdm(range(len(df))):
        temp_df = df_ds[i]
        cleaned_df = clean_text(temp_df, nlp, 'source')

        if i == 0:
            final_df = cleaned_df
        final_df = pd.concat([final_df, cleaned_df])

    return final_df

In [18]:
def make_word_dict(cleaned_df):
    sentences = [string.split(' ') for string in list(cleaned_df['cleaned'])]
    word_freq = Counter([token for string in sentences for token in string]).most_common()
    word_freq_dict = dict(word_freq)
    
    min_freq = 5
    word_dict = {}

    # sending all the unknowns to 0
    i = 1
    for word in word_freq_dict:
        if word_freq_dict[word] > min_freq:
            word_dict[word] = i
            i += 1
        else:
            word_dict[word] = 0
    
    # dictionary length        
    dict_length = max(word_dict.values()) + 1
    
    return dict_length, word_dict

In [19]:
final_train = preprocess_whole_df(whole_train, whole_train_ds)
dict_length, word_dict = make_word_dict(final_train)
max_length = find_max_length(final_train)

100%|██████████| 50/50 [00:12<00:00,  4.09it/s]


In [20]:
class PythonDataset2(Dataset):
    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.max_len = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
#         print(row)
        content = row['cleaned'].split(' ')
        
        # find the idx that is asscoiated with the particular word
        content_idxs = [self.word_dict[word] for word in content]
        # front pad the sentence 
        cleaned_content_arr = np.array(content_idxs)
        zeros_arr = np.zeros(max_length - len(content))
        padded_arr = np.concatenate([zeros_arr, cleaned_content_arr])
        
        x = torch.LongTensor(padded_arr) 
        x2 = torch.tensor(row['ranked_cleaned']).float()
        y = torch.tensor(row['rank']).float()
        
        # embedding likes long tensors
        return x, x2, y

In [21]:
N = int(final_train.shape[0] * 0.8)
train = final_train.iloc[:N, :]
val = final_train.iloc[N:, :]

In [22]:
train.shape, val.shape

((1840, 6), (460, 6))

In [23]:
train_ds = PythonDataset2(train, word_dict, max_length)
val_ds = PythonDataset2(val, word_dict, max_length)

train_dl = DataLoader(train_ds, batch_size=10, shuffle=True)
val_dl = DataLoader(train_ds, batch_size=10, shuffle=False)

In [24]:
import torch.nn.functional as F


In [25]:
# Vanilla RNN using nn.RNN
class RNN(nn.Module):
    def __init__(self, dict_length, max_length, emb_size, hidden_size, output_size):
        super(RNN, self).__init__()
        # embed the words
        self.emb = nn.Embedding(dict_length, emb_size, padding_idx=0)
        # pass through an LSTM
        # RNN doesn't care about length of sequence
        # RNN does care about the size of the word embedding
        # hidden size dictates dimension of output of RN
#         self.linear = nn.Linear(emb_size, hidden_size)   
        
        self.linear2 = nn.Linear((max_length*emb_size)+1, output_size)

    def forward(self, x, x2):
        
        # RNN layer outputs a tuple, the output and the final hidden state
        # taking the final hidden state as output
#         print('-------', x.size(), max_length)
        x = self.emb(x)
        x = x.view(x.shape[0],x.shape[1]*x.shape[-1])

        x2 = x2.unsqueeze(1)
#         print(x.size(), x2.size())
        X = torch.cat((x,x2), 1)
#         print('-------', X.size)
#         out = self.linear(X)
#         out = F.relu(out)
        out = self.linear2(X)
#         X = F.relu(X)
        return out.squeeze()

In [26]:
model = RNN(dict_length, max_length, 7,5, 1)

In [27]:
def one_pass(model, dataloader, optimizer,lossFun, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for x, x2, y in tqdm(dataloader):
        
        y_pred = model(x, x2)
#         y_pred = y_pred.detach().numpy()
#         y = y.detach().numpy()
        loss = lossFun(y_pred, y)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

In [28]:
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [29]:
lossFun = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 1

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)

    train_loss = one_pass(model, train_dl, optimizer, lossFun, backwards=True, print_loss=False)

    val_loss = one_pass(model, val_dl, optimizer, lossFun, backwards=False, print_loss=False)
    print(train_loss, val_loss)
    print('Train Loss-%.4f Validation Loss-%.4f '%
          (train_loss, val_loss))
#     print('Val Loss-%.4f Val Accuracy-%.4f Val Precision-%.4f Val Recall-%.4f' %
#           (val_loss, val_acc, val_precision, val_recall))

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:  0



  0%|          | 0/184 [00:00<?, ?it/s][A
  6%|▌         | 11/184 [00:00<00:01, 109.37it/s][A
 20%|██        | 37/184 [00:00<00:00, 194.71it/s][A
 34%|███▍      | 63/184 [00:00<00:00, 223.12it/s][A
 48%|████▊     | 89/184 [00:00<00:00, 234.81it/s][A
 62%|██████▎   | 115/184 [00:00<00:00, 243.40it/s][A
 76%|███████▌  | 140/184 [00:00<00:00, 241.38it/s][A
100%|██████████| 184/184 [00:00<00:00, 232.98it/s]

  0%|          | 0/184 [00:00<?, ?it/s][A
 16%|█▌        | 29/184 [00:00<00:00, 284.20it/s][A
 32%|███▏      | 59/184 [00:00<00:00, 292.16it/s][A
 50%|█████     | 92/184 [00:00<00:00, 306.00it/s][A
 68%|██████▊   | 125/184 [00:00<00:00, 313.47it/s][A
100%|██████████| 184/184 [00:00<00:00, 310.83it/s]
100%|██████████| 1/1 [00:01<00:00,  1.39s/it]

1478.8955981835074 760.0424811710482
Train Loss-1478.8956 Validation Loss-760.0425 





In [30]:
paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()

Test NBs: 100%|██████████| 4/4 [00:00<00:00, 118.13it/s]


In [31]:
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

In [32]:
test = clean_text(test_df, nlp, 'source')
test_dict_length, test_word_dict = make_word_dict(test)
test_max_length = find_max_length(test)

In [33]:
class PythonDataset3(Dataset):
    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.max_len = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
#         print(row)
        content = row['cleaned'].split(' ')
        
        # find the idx that is asscoiated with the particular word
        content_idxs = [self.word_dict[word] for word in content]
        # front pad the sentence 
        cleaned_content_arr = np.array(content_idxs)
        zeros_arr = np.zeros(max_length - len(content))
        padded_arr = np.concatenate([zeros_arr, cleaned_content_arr])
        
        x = torch.LongTensor(padded_arr) 
        x2 = torch.tensor(row['rank']).float()
        
        # embedding likes long tensors
        return x, x2

In [34]:
test_ds = PythonDataset3(test, test_word_dict, test_max_length)

test_dl = DataLoader(test_ds, batch_size=10, shuffle=True)

In [35]:

model.eval()

pred_lst = list()
for x, x2 in tqdm(test_dl):

    y_pred = model(x, x2)
    y_pred = y_pred.detach().numpy()
    pred_lst.append(y_pred)

100%|██████████| 9/9 [00:00<00:00, 312.76it/s]


In [36]:
test_df['pred_val'] = np.concatenate( pred_lst, axis=0)


In [37]:
y_test = test_df.loc[test_df["cell_type"] == "markdown", "pred_val"] 

In [38]:
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

Unnamed: 0,id,cell_order
0,0009d135ece78d,ddfd239c f9893819 c6cd22db ba55e576 1372ae9b 39e937ec 90ed07ab e25aa9bd 7f388a41 0a226b6a 2843a25a 8cb8d28a 06dbf8cf
1,0010483c12ba9b,54c7cab3 fe66203e 7844d5f8 5ce8863c 4a0777c4 4703bb6d 4a32c095 865ad516 02a0be6d 7f270e34
2,0010a919d60e4f,8679f842 aafc3d23 4ae17669 80e077ec 8ce62db4 b190ebb4 bac960d3 f9e38e5a ed415c3c ea06b4d0 322850af 50bc28b3 c069ed33...
3,0028856e09c5b7,012c9d02 d22526d1 3ae7ece3 eb293dfc


In [39]:
sub_df.to_csv("submission.csv", index=False)