# Setup #

In [4]:
import json
from pathlib import Path


import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

from sklearn.metrics import accuracy_score

# Widen pandas' text output so long cell sources stay readable.
pd.set_option('display.width', 180)
pd.set_option('display.max_colwidth', 120)

# Directory that holds the competition files read throughout the notebook.
data_dir = Path('../input/AI4Code')

In [5]:
# Collect every notebook JSON file under train/ and show how many were found.
paths_train = [*(data_dir / 'train').glob('*.json')]
len(paths_train)

# Load Data #

The notebooks are stored as individual JSON files. They've been cleaned of the usual metadata present in Jupyter notebooks, leaving only the `cell_type` and `source`. The [Data](https://www.kaggle.com/competitions/AI4Code/data) page on the competition website has the full documentation of this dataset.

We'll load the notebooks here and join them into a dataframe for easier processing. The full set of training data takes quite a while to load, so we'll just use a subset for this demonstration.

In [6]:
# Number of training notebooks to load; only this many paths are read below.
NUM_TRAIN = 1000


def read_notebook(path):
    """Load one notebook JSON file as a DataFrame indexed by ``cell_id``.

    ``cell_type`` is read as a categorical and ``source`` as a string.  An
    ``id`` column holding the file stem (the notebook id) is added so that
    several notebooks can later be concatenated and told apart.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_dtypes)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')


# Read the first NUM_TRAIN notebooks found under train/.
paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]

# Stack them into one frame indexed by (id, cell_id).  Sorting is done on the
# notebook id only (sort_remaining=False), not additionally on cell_id.
df = pd.concat(notebooks_train)
df = df.set_index('id', append=True)
df = df.swaplevel()
df = df.sort_index(level='id', sort_remaining=False)

df

# Ordering the Cells #

In the `train_orders.csv` file we have, for notebooks in the training set, the correct ordering of cells in terms of the cell ids.

In [7]:
# True cell order per notebook, as a Series of lists of cell ids indexed by
# notebook id.
# ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column after reading works on both old and
# new pandas versions.
df_orders = (
    pd.read_csv(
        data_dir / 'train_orders.csv',
        index_col='id',
    )
    .squeeze('columns')
    .str.split()  # Split the string representation of cell_ids into a list
)

df_orders

The correct numeric position of a cell we will call the **rank** of the cell. We can find the ranks of the cells within a notebook by referencing the true ordering of cell ids as given in `train_orders.csv`.

In [8]:
def get_dataset(df, neg_ratio=2):
    """Build labelled (markdown cell, candidate cell) pairs for every notebook.

    For each markdown cell one positive pair (label 1) is created with the
    cell that directly follows it in the true order taken from ``df_orders``.
    The successor rank is clamped to the notebook's last rank, so a markdown
    cell that is last in its notebook is paired with itself.  In addition,
    ``neg_ratio`` negative pairs (label 0) are drawn per markdown cell by
    sampling, with replacement, from the notebook's non-markdown cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Cells indexed by ``(id, cell_id)`` with a ``cell_type`` column.
    neg_ratio : int, default 2
        Number of negative pairs sampled per markdown cell.

    Returns
    -------
    pandas.DataFrame
        Columns ``md_id``, ``cell_id`` and ``label``; the index is
        ``(notebook id, row number)``.  Duplicate ``(md_id, cell_id)`` pairs
        are dropped keeping the first occurrence, so a sampled negative that
        coincides with the positive pair of the same notebook stays labelled 1.
    """
    pairs_by_nb = {}
    for nb_id in tqdm(df.index.unique('id')):
        nb = df.loc[nb_id, :]
        cell_order = df_orders.loc[nb_id]

        # rank = position of a cell in the true order (first occurrence wins,
        # matching list.index); a dict avoids a linear scan per cell.
        rank_of = {}
        for rank, cell_id in enumerate(cell_order):
            rank_of.setdefault(cell_id, rank)

        cell_ids = nb.index.to_numpy()
        ranks = np.array([rank_of[cell_id] for cell_id in cell_ids])
        is_markdown = (nb['cell_type'] == 'markdown').to_numpy()
        md_ids = cell_ids[is_markdown]
        code_ids = cell_ids[~is_markdown]

        # Positive pairs: look the successor up BY RANK in the true order.
        # (Indexing the loaded rows by position would pick an unrelated cell,
        # because the rows are not stored in rank order.)
        next_ranks = np.minimum(ranks[is_markdown] + 1, ranks.max())
        nb_pos = pd.DataFrame({
            'md_id': md_ids,
            'cell_id': [cell_order[r] for r in next_ranks],
            'label': np.ones(len(md_ids), dtype='int64'),
        })
        frames = [nb_pos]

        # Negative pairs: sample explicitly from the non-markdown cells.
        # Skipped when there is nothing to sample (randint(0, 0) would raise).
        n_neg = len(md_ids) * neg_ratio
        if n_neg and len(code_ids):
            picks = np.random.randint(0, len(code_ids), n_neg)
            frames.append(pd.DataFrame({
                'md_id': np.repeat(md_ids, neg_ratio),
                'cell_id': code_ids[picks],
                'label': np.zeros(n_neg, dtype='int64'),
            }))

        pairs_by_nb[nb_id] = pd.concat(frames).reset_index(drop=True)

    df_pairs = pd.concat(pairs_by_nb)
    df_pairs = df_pairs.drop_duplicates(subset=['md_id', 'cell_id'], keep='first')

    return df_pairs

In [9]:
# Ancestor table: notebooks sharing an ``ancestor_id`` have a common origin.
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors

from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Split, keeping notebooks with a common origin (ancestor_id) together
ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']
# The splitter yields positional indices; translate them back to notebook ids.
ids_train, ids_valid = (
    ids[positions] for positions in next(splitter.split(ids, groups=ancestors))
)

df_train, df_valid = df.loc[ids_train, :], df.loc[ids_valid, :]

# Pair tables for the two partitions (training first, then validation).
df_pairs_train = get_dataset(df_train)
df_pairs_valid = get_dataset(df_valid)

Sorting a notebook by the cell ranks is another way to order the notebook.

In [10]:
# Lookup table from cell_id to the cell's source text.
df_proc = df.reset_index()[['cell_id', 'source']].set_index('cell_id')
df_proc

In [11]:
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch

MAX_LEN = 128  # NOTE: reassigned to 200 in a later cell before it is first used

class MarkdownModel(nn.Module):
    """DistilBERT encoder followed by a linear head that emits one score.

    The head reads the encoder's hidden state at sequence position 0 and
    maps its 768 features to a single unnormalised value per sequence.
    """

    def __init__(self):
        super().__init__()
        self.distill_bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.top = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Score a batch of token ids ``ids`` with attention mask ``mask``."""
        hidden_states = self.distill_bert(ids, mask)[0]
        first_position = hidden_states[:, 0, :]
        scores = self.top(first_position)
        # Drop the size-1 output dimension of the linear head.
        return scores.squeeze(1)

In [12]:
MAX_LEN = 200  # value passed as ``max_len`` to the datasets built below

In [13]:
# Display the training pair table.
df_pairs_train

In [14]:
# Show one row after reset_index(), i.e. the column layout the dataset class reads.
df_pairs_train.reset_index().iloc[100]

In [15]:
from torch.utils.data import DataLoader, Dataset

class MarkdownDataset(Dataset):
    """Tokenised (markdown cell, candidate cell) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with ``md_id``, ``cell_id`` and ``label`` columns.  After
        ``reset_index()`` it must expose a ``level_0`` column, which is
        returned as the notebook id of each item.
    df_source : pandas.DataFrame
        Source text indexed by cell id; the first column is used as the text.
    max_len : int
        Length every encoded pair is padded / truncated to.

    Each item is ``([nb, md_id, cell_id], input_ids, attention_mask, label)``.
    """

    def __init__(self, df, df_source, max_len):
        super().__init__()
        self.df = df.reset_index()
        self.df_source = df_source
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def _source_text(self, cell_id):
        """Return the source text stored for ``cell_id``."""
        # ``.loc[[...]]`` always yields a frame, so a duplicated id cannot
        # change the result type; the first match is used.
        return str(self.df_source.loc[[cell_id]].iloc[0, 0])

    def __getitem__(self, index):
        row = self.df.iloc[index]
        nb = row['level_0']
        md_idx = row['md_id']
        code_idx = row['cell_id']
        label = row['label']

        # Encode the two texts as a sentence pair so that a separator token
        # sits between them.  Plain string concatenation glued the last
        # markdown word to the first token of the candidate cell.
        encoded = self.tokenizer(
            self._source_text(md_idx),
            self._source_text(code_idx),
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        ids = torch.LongTensor(encoded['input_ids'])
        mask = torch.LongTensor(encoded['attention_mask'])

        return [nb, md_idx, code_idx], ids, mask, label

    def __len__(self):
        return self.df.shape[0]
    
# Wrap the training and validation pair tables; both read text from df_proc.
train_ds = MarkdownDataset(df=df_pairs_train, df_source=df_proc, max_len=MAX_LEN)
val_ds = MarkdownDataset(df=df_pairs_valid, df_source=df_proc, max_len=MAX_LEN)

# Inspect one encoded validation item.
val_ds[100]

In [16]:
BS = 16  # batch size
NW = 8   # DataLoader worker processes

# Only the training loader shuffles.  Both loaders drop the final partial
# batch, so every batch has exactly BS items.
train_dl = DataLoader(
    train_ds,
    shuffle=True,
    batch_size=BS,
    num_workers=NW,
    drop_last=True,
    pin_memory=False,
)
valid_dl = DataLoader(
    val_ds,
    shuffle=False,
    batch_size=BS,
    num_workers=NW,
    drop_last=True,
    pin_memory=False,
)

In [17]:
def iterDataLoader(dataloader, model, lr, wd, train=False):
    """Run one pass over ``dataloader`` and report mean loss and accuracy.

    Parameters
    ----------
    dataloader : iterable
        Yields ``(pair_idx, ids, mask, y)`` batches.
    model : torch.nn.Module
        Called as ``model(ids, mask)``; must return one logit per item.
        Batches are moved to the device of the model's parameters.
    lr : float
        Learning rate of the Adam optimizer (used only when ``train``).
    wd : float
        Weight decay of the Adam optimizer (used only when ``train``).
    train : bool, default False
        When true, parameters are updated after every batch; otherwise the
        pass runs with gradient tracking disabled.

    Returns
    -------
    loss : float
        Mean of the per-batch binary cross-entropy-with-logits losses.
    accu : float
        Accuracy of the predictions ``logit > 0`` (i.e. probability > 0.5).
    pairs_info : numpy.ndarray
        The per-batch ``pair_idx`` values concatenated along axis 0; this
        requires every batch to have the same size.
    """
    device = next(model.parameters()).device
    # A fresh optimizer is built for each training pass (its state is not
    # carried over between calls); none is needed for evaluation.
    optimizer = (
        torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
        if train else None
    )
    ys = []
    ys_pred = []
    losses = []
    pair_info = []
    with torch.set_grad_enabled(train):
        for pair_idx, idx, mask, y in tqdm(dataloader):
            y_pred = model(idx.to(device), mask.to(device)).float()
            y = y.to(device).float()
            loss = F.binary_cross_entropy_with_logits(y_pred, y)

            # option to update parameters
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            losses.append(loss.item())
            ys.append(y.detach().cpu().numpy())
            ys_pred.append(y_pred.detach().cpu().numpy())
            pair_info.append(pair_idx)

    loss = np.mean(losses)
    # y_pred holds logits, so the decision boundary p = 0.5 is logit = 0.
    predicted = (np.concatenate(ys_pred) > 0).astype(int)
    accu = accuracy_score(np.concatenate(ys), predicted)
    pairs_info = np.concatenate(pair_info)
    return loss, accu, pairs_info

In [18]:
# training a model
def train_epocs(model, train_dl, valid_dl, n_epoch=5, lr=0.01, wd=0.0):
    """Train for ``n_epoch`` epochs, validating after each one.

    Every epoch runs ``iterDataLoader`` once on ``train_dl`` with updates
    enabled and once on ``valid_dl`` without, then prints a summary line.

    Returns the per-epoch histories as
    ``(train_losses, valid_losses, train_accues, valid_accues,
    train_pairs, valid_pairs)``.
    """
    train_losses, valid_losses = [], []
    train_accues, valid_accues = [], []
    train_pairs, valid_pairs = [], []
    report = "----- Epoch %.0f -----\ntrain loss %.3f and valid loss %.3f, train accuracy %.3f and valid accuracy %.3f"

    for epoch in range(1, n_epoch + 1):
        # Training pass.
        model.train()
        train_loss, train_accu, train_pair = iterDataLoader(train_dl, model, lr, wd, train=True)
        train_losses.append(train_loss)
        train_accues.append(train_accu)
        train_pairs.extend(train_pair)

        # Validation pass.
        model.eval()
        valid_loss, valid_accu, valid_pair = iterDataLoader(valid_dl, model, lr, wd, train=False)
        valid_losses.append(valid_loss)
        valid_accues.append(valid_accu)
        valid_pairs.extend(valid_pair)

        print(report % (epoch, train_loss, valid_loss, train_accu, valid_accu))

    return train_losses, valid_losses, train_accues, valid_accues, train_pairs, valid_pairs

In [19]:
# Instantiate the model and move it to the GPU.
model = MarkdownModel().cuda()

In [20]:
import torch.optim as optim

# Train with the default epoch count and optimizer settings of train_epocs.
(
    train_losses,
    valid_losses,
    train_accues,
    valid_accues,
    train_pairs,
    valid_pairs,
) = train_epocs(model, train_dl, valid_dl)

In [21]:
# Persist only the learned weights (state dict), not the full module object.
torch.save(model.state_dict(), 'mymodel.pt')

# Inference #

In [22]:
# To reload the saved weights in a fresh session instead of retraining:
# model = MarkdownModel()
# model.load_state_dict(torch.load('mymodel.pt'))
# model.eval()

In [23]:
# Load the single notebook at position 10000 of the train/ listing.
# NOTE: this rebinds ``paths_train``.
paths_train = list((data_dir / 'train').glob('*.json'))[10000:10001]
infer_df = [read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')]

# Same (id, cell_id) index layout as the training frame.
infer_nb = pd.concat(infer_df)
infer_nb = infer_nb.set_index('id', append=True).swaplevel()
infer_nb = infer_nb.sort_index(level='id', sort_remaining=False)
infer_nb

In [24]:
# NOTE(review): not referenced by the cells visible here — presumably the id of
# the notebook loaded for inference; confirm or remove.
nb_id = 'daf98d865b60d6'

In [25]:
# Build every (markdown cell, code cell) combination of the inference notebook
# with a cross join on a constant helper key.
nb_infer1 = infer_nb.reset_index()
nb_infer2 = nb_infer1.copy()
nb_infer1['key'] = 0
nb_infer2['key'] = 0

nb_infer = nb_infer1.merge(nb_infer2, on='key', how='outer')
# ``_x`` columns come from the left frame (markdown side), ``_y`` from the
# right frame (code side).  ``assign`` returns a new frame, so the placeholder
# label is not written into a filtered slice (no SettingWithCopyWarning).
nb_infer = nb_infer[
    (nb_infer['cell_type_x'] == 'markdown') & (nb_infer['cell_type_y'] == 'code')
].assign(label=999)  # 999 = placeholder; the true label is unknown at inference

In [26]:
# Display the candidate (markdown, code) pair table.
nb_infer

In [27]:
# Keep only the identifier columns and give them the names that
# MarkdownDataset reads (level_0 / md_id / cell_id / label).
nb_infer_idx = nb_infer[['id_x', 'cell_id_x', 'cell_id_y', 'label']].rename(
    columns={'id_x': 'level_0', 'cell_id_x': 'md_id', 'cell_id_y': 'cell_id'}
)
nb_infer_idx

In [28]:
# cell_id -> source lookup for the inference notebook.
nb_infer_proc = nb_infer1[['cell_id', 'source']].set_index('cell_id')
nb_infer_proc

In [29]:
# Dataset over all candidate pairs of the inference notebook.
infer_ds = MarkdownDataset(df=nb_infer_idx, df_source=nb_infer_proc, max_len=MAX_LEN)
# Inspect one encoded item.
infer_ds[100]

In [30]:
# Every candidate pair must be scored exactly once, so the final partial batch
# is kept (drop_last=False).  Shuffling serves no purpose at inference time.
infer_dl = DataLoader(infer_ds, batch_size=32, shuffle=False, num_workers=NW, pin_memory=False, drop_last=False)

In [31]:
# model = model.cpu()

In [32]:
# Score every candidate pair; collect the logits and the pair identifiers.
y_preds = []
pair_info = []

model.eval()
# No gradients are needed for scoring; disabling them avoids building the
# autograd graph and keeps GPU memory use down.
with torch.no_grad():
    for pair_idx, idx, mask, _ in tqdm(infer_dl):
        y_pred = model(idx.cuda(), mask.cuda()).float()
        y_preds.append(y_pred.cpu().numpy())
        pair_info.append(pair_idx)

In [33]:
# One row per scored pair: (notebook id, markdown id, code id).
pair_info_df = pd.DataFrame(np.concatenate(pair_info, axis=1)).T
# Turn the logits into probabilities.  ``torch.sigmoid`` replaces the deprecated
# ``F.sigmoid``, and ``.numpy()`` stores a plain float array instead of relying
# on how pandas coerces a torch.Tensor.
pair_info_df['prob'] = torch.sigmoid(torch.from_numpy(np.concatenate(y_preds))).numpy()
pair_info_df.columns = ['nb_id', 'md_id', 'code_id', 'prob']

In [34]:
# For every markdown cell keep the candidate(s) with the highest probability.
# The string alias 'max' is used because passing the builtin ``max`` to
# ``transform`` is deprecated in recent pandas.
idx = pair_info_df.groupby(['md_id'])['prob'].transform('max') == pair_info_df['prob']
pair_info_df[idx]