# Set up

In [1]:
import glob
import json
import os
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import AutoModel, AutoTokenizer
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold
from tensorflow.keras.utils import plot_model
from tqdm.notebook import tqdm
from tqdm import tqdm
from scipy import spatial
from collections import OrderedDict
from numpy import genfromtxt

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import copy 
nltk.download('stopwords')

In [2]:
# Notebook-wide configuration. BASE_MODEL is the checkpoint directory handed to
# both the tokenizer and the model backbone further down.
DATA_PATH = "../input/AI4Code"
BASE_MODEL = '../input/huggingface-bert-variants/distilbert-base-multilingual-cased/distilbert-base-multilingual-cased'
N_SPLITS = 5  # number of GroupKFold folds
SEQ_LEN = 128  # token length used for tokenizer padding/truncation and the model inputs
RANDOM_STATE = 42  # seed passed to sklearn's shuffle
LIMIT = 1000  # NOTE(review): not referenced anywhere in the visible notebook

# Try to attach to a TPU. Any exception raised along the way (not only
# "no TPU present") drops to the fallback branch below.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    STRATEGY = tf.distribute.experimental.TPUStrategy(TPU)
    # Global batch size: 128 examples per replica.
    BATCH_SIZE = 128 * STRATEGY.num_replicas_in_sync
except Exception:
    # Fallback: TensorFlow's current default strategy with a smaller batch.
    TPU = None
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 32

print("TensorFlow", tf.__version__)

# The TPU model name printed here is a hard-coded string, not a detected value.
if TPU is not None:
    print("Using TPU v3-8")
else:
    print("Using GPU/CPU")

print("Batch size:", BATCH_SIZE)

# Helper functions 

In [3]:
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame tagged with its notebook id.

    Parameters
    ----------
    path : str or os.PathLike
        Path to a JSON file whose parsed content ``pd.DataFrame`` accepts.

    Returns
    -------
    pandas.DataFrame
        The parsed JSON, plus an ``id`` column holding the file's base name
        without its extension on every row.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding when reading it.
    with open(path, encoding="utf-8") as file:
        df = pd.DataFrame(json.load(file))
    df["id"] = os.path.splitext(os.path.basename(path))[0]
    return df

def clean_source(sources):
    """Normalise cell sources: lowercase, strip punctuation, drop stopwords, lemmatise.

    Parameters
    ----------
    sources : indexable sequence of str
        Raw cell sources. The input is deep-copied and never modified.

    Returns
    -------
    Same container type as ``sources``
        A copy in which every entry is replaced by its cleaned,
        space-joined tokens.
    """
    cleaned = copy.deepcopy(sources)

    # Built once instead of once per word: ``stopwords.words()`` returns a
    # list, so calling it inside the inner loop rebuilt that list and scanned
    # it linearly for every token. With no argument it covers every language
    # in the corpus, exactly as before.
    stop_words = set(stopwords.words())
    # NOTE(review): ``lemmatize`` needs the WordNet corpus; the visible setup
    # cell only downloads 'stopwords' - confirm WordNet is available.
    lemmatizer = WordNetLemmatizer()
    # Raw string: "\w" / "\s" in a plain literal are invalid escape sequences.
    non_word = re.compile(r"[^\w\s]")

    for i, source in enumerate(tqdm(cleaned)):
        tokens = non_word.sub(" ", source.lower()).split()
        kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        cleaned[i] = " ".join(kept)

    return cleaned

def expand_order(row):
    """Explode one ``(notebook id, "cell cell cell ...")`` record into rows.

    ``row[1]`` is split on single spaces; each resulting cell id becomes one
    row carrying the notebook id from ``row[0]`` and its zero-based position
    in the split list as ``rank``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``cell_id`` and ``rank``.
    """
    ordered_cells = row[1].split(" ")
    n_cells = len(ordered_cells)
    notebook_id = row[0]

    columns = {
        "id": [notebook_id] * n_cells,
        "cell_id": ordered_cells,
        "rank": range(n_cells),
    }
    return pd.DataFrame(columns)

In [4]:
def get_input_ids(df, notebook_id, maxlen, tokenizer):
    """Collect ids, token ids and cleaned sources of one notebook, split by cell type.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``cell_type``, ``cell_id`` and ``clean_source``.
    notebook_id
        Value of ``id`` selecting the notebook.
    maxlen : int
        Passed to the tokenizer as ``max_length`` (with ``padding='max_length'``
        and ``truncation=True``).
    tokenizer
        Object exposing ``batch_encode_plus``; its result is indexed with
        ``'input_ids'``.

    Returns
    -------
    tuple
        ``(code_ids, code_input_ids, code_sources,
        markdown_ids, markdown_input_ids, markdown_sources)``.
    """
    notebook_cells = df.loc[df['id'] == notebook_id]

    def _ids_tokens_sources(cell_type):
        # Filter to one cell type, then tokenize its cleaned sources as a batch.
        subset = notebook_cells[notebook_cells['cell_type'] == cell_type]
        cell_ids = subset['cell_id'].values.tolist()
        sources = subset['clean_source'].values.tolist()
        encoded = tokenizer.batch_encode_plus(
            sources,
            add_special_tokens=True,
            return_token_type_ids=True,
            truncation=True,
            padding='max_length',
            max_length=maxlen,
        )
        return cell_ids, encoded['input_ids'], sources

    # Code cells are encoded first, markdown cells second.
    code_part = _ids_tokens_sources('code')
    markdown_part = _ids_tokens_sources('markdown')
    return code_part + markdown_part


def jaccard_similarity(list1, list2):
    """Jaccard index of two iterables, treated as sets.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Duplicates are ignored.

    Returns
    -------
    float
        ``|A & B| / |A | B|``. Two empty inputs have an empty union; ``0.0``
        is returned in that case instead of dividing by zero.
    """
    set1, set2 = set(list1), set(list2)
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0.0
    return len(set1 & set2) / union_size

def max_similarity(nb_id, code_cell_ids, code_input_ids, mkdn_cell_ids, mkdn_input_ids):
    """Pair every markdown cell with the code cell whose token ids overlap most.

    Overlap is measured with ``jaccard_similarity`` on the token-id lists.
    ``nb_id`` is accepted but not used by the body.

    Returns
    -------
    list of [markdown_cell_id, code_cell_id]
        One entry per markdown cell, in input order. On ties the earliest code
        cell wins. If no code cell scores above 0 (or there are none), the
        code id slot holds the placeholder ``0``.
    """
    matched = []

    for m_pos, markdown_id in enumerate(mkdn_cell_ids):
        markdown_tokens = mkdn_input_ids[m_pos]
        # Only a strictly higher score replaces the running best.
        top_score, top_code_id = 0, 0

        for c_pos, code_id in enumerate(code_cell_ids):
            score = jaccard_similarity(markdown_tokens, code_input_ids[c_pos])
            if score > top_score:
                top_score, top_code_id = score, code_id

        matched.append([markdown_id, top_code_id])

    return matched

In [5]:
def get_closest_code_rank(m_rank, code_ranks):
    """Return the code-cell rank that a markdown cell at ``m_rank`` is anchored to.

    The anchor is the largest code rank strictly below ``m_rank``. When no
    code rank precedes ``m_rank``, the smallest code rank is returned instead.
    An empty ``code_ranks`` yields ``0``.

    Parameters
    ----------
    m_rank : int
        Rank of the markdown cell.
    code_ranks : sequence of int
        Ranks of the notebook's code cells, in any order. Not modified.

    Returns
    -------
    int
    """
    if len(code_ranks) == 0:
        return 0

    # Single pass; no copy or sort of the caller's list is needed for this.
    preceding = [c_rank for c_rank in code_ranks if c_rank < m_rank]
    if preceding:
        return max(preceding)
    return min(code_ranks)

def determine_labels(df, notebook_ids):
    """Compute a regression target per cell and store it in ``df['labels']``.

    For each notebook in ``notebook_ids``:

    * a code cell gets ``(cum_count + 1) / (max_code_cum_count + 2)``;
    * a markdown cell is anchored to a code rank via ``get_closest_code_rank``
      and gets that code cell's label plus
      ``|m_rank - anchor_rank| * min_code_label / (max_markdown_cum_count + 2)``;
      a markdown cell with rank 0 gets only that offset term.

    Labels are written to the exact rows they were computed from, so the
    result does not depend on how the rows of ``df`` are ordered. Rows that
    belong to no listed notebook, or whose ``cell_type`` is neither ``'code'``
    nor ``'markdown'``, receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``cell_type``, ``rank`` and ``cum_count``.
        Modified in place.
    notebook_ids : iterable
        Notebook ids to label.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a listed notebook has no code cells.
    """
    is_code = (df['cell_type'] == 'code').to_numpy()
    is_markdown = (df['cell_type'] == 'markdown').to_numpy()
    ranks = df['rank'].to_numpy()
    cum_counts = df['cum_count'].to_numpy()

    # One slot per row of df, addressed by position so the frame's index
    # labels and row order are irrelevant.
    labels = np.full(len(df), np.nan, dtype=float)

    for nb_id in notebook_ids:
        in_notebook = (df['id'] == nb_id).to_numpy()
        code_pos = np.flatnonzero(in_notebook & is_code)
        markdown_pos = np.flatnonzero(in_notebook & is_markdown)

        if code_pos.size == 0:
            raise ValueError(f"notebook {nb_id!r} has no code cells; cannot derive labels")

        # calculate code labels
        code_ranks = ranks[code_pos].tolist()
        code_count = cum_counts[code_pos].tolist()
        max_code_count = max(code_count)
        code_labels = [(count + 1) / (max_code_count + 2) for count in code_count]
        code_rank_labels = dict(zip(code_ranks, code_labels))
        min_code_label = min(code_labels)
        labels[code_pos] = code_labels

        if markdown_pos.size == 0:
            # A notebook without markdown cells has nothing more to label.
            continue

        # calculate markdown labels
        max_markdown_count = max(cum_counts[markdown_pos].tolist())
        step = min_code_label / (max_markdown_count + 2)

        markdown_labels = []
        for m_rank in ranks[markdown_pos].tolist():
            closest_c_rank = get_closest_code_rank(m_rank, code_ranks)
            base = abs(m_rank - closest_c_rank) * step

            if m_rank == 0:
                markdown_labels.append(base)
            else:
                markdown_labels.append(code_rank_labels[closest_c_rank] + base)

        labels[markdown_pos] = markdown_labels

    df['labels'] = labels

In [6]:
def get_inputs(df, notebook_ids, pairings, maxlen):
    """Build the per-pair model inputs for every (markdown, code) pairing.

    Cell lookups are restricted to the rows of the notebook being processed,
    so a ``cell_id`` that also occurs in another notebook cannot be picked up
    by mistake (and each lookup scans one notebook instead of the whole frame).

    Relies on the module-level ``tokenizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``cell_id``, ``cum_count``, ``labels``,
        ``ancestor_id`` and ``clean_source``.
    notebook_ids : iterable
        Notebook ids to process, in output order.
    pairings : mapping
        ``notebook id -> list of [markdown_cell_id, code_cell_id]``.
    maxlen : int
        ``max_length`` for tokenizer padding/truncation.

    Returns
    -------
    tuple of lists
        ``(input_ids, attention_mask, token_type_ids, markdown_count,
        code_count, code_label, base_label, groups)``, one entry per pair.
        ``base_label`` is the ``labels`` value of the first row of the
        notebook whose ``cum_count`` is 0; ``groups`` is the code cell's
        ``ancestor_id``.

    Raises
    ------
    IndexError
        If a cell id from ``pairings`` is not present in its notebook.
    """
    markdown_count, code_count, code_label, base_label = [], [], [], []
    groups = []
    input_ids, attention_mask, token_type_ids = [], [], []

    def _first(frame, column):
        # First value of ``column``; raises IndexError when ``frame`` is empty.
        return frame[column].values.tolist()[0]

    for nb_id in tqdm(notebook_ids):
        pairs = pairings[nb_id]
        if not pairs:
            continue

        nb_cells = df.loc[df['id'] == nb_id]
        # Identical for every pair of the notebook, so looked up once.
        nb_base_label = _first(nb_cells.loc[nb_cells['cum_count'] == 0], 'labels')

        for pair in pairs:
            markdown_id, code_id = pair[0], pair[1]
            markdown_row = nb_cells.loc[nb_cells['cell_id'] == markdown_id]
            code_row = nb_cells.loc[nb_cells['cell_id'] == code_id]

            # dataframe inputs
            m_count = _first(markdown_row, 'cum_count')
            c_count = _first(code_row, 'cum_count')
            c_label = _first(code_row, 'labels')
            group = _first(code_row, 'ancestor_id')

            markdown_count.append(m_count)
            code_count.append(c_count)
            code_label.append(c_label)
            groups.append(group)
            base_label.append(nb_base_label)

            # bert inputs: markdown and code go in as an explicit text pair
            # rather than as one two-element list, which not every tokenizer
            # implementation interprets as a pair.
            encoding = tokenizer.encode_plus(
                _first(markdown_row, 'clean_source'),
                text_pair=_first(code_row, 'clean_source'),
                add_special_tokens=True,
                return_token_type_ids=True,
                truncation=True,
                padding='max_length',
                max_length=maxlen,
            )

            input_ids.append(encoding['input_ids'])
            attention_mask.append(encoding['attention_mask'])
            token_type_ids.append(encoding['token_type_ids'])

    return (input_ids, attention_mask, token_type_ids, markdown_count, code_count, code_label, base_label, groups)

def get_labels(df, notebook_ids, pairings):
    """Collect the target label of the markdown cell of every pairing.

    The markdown cell is looked up among the rows of its own notebook only,
    so a ``cell_id`` that also occurs in another notebook cannot supply the
    wrong label.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``cell_id`` and ``labels``.
    notebook_ids : iterable
        Notebook ids to process, in output order.
    pairings : mapping
        ``notebook id -> list of [markdown_cell_id, code_cell_id]``.

    Returns
    -------
    list
        One label per pair, ordered by notebook and then by pair.

    Raises
    ------
    IndexError
        If a markdown cell id is not present in its notebook.
    """
    labels = []

    for nb_id in notebook_ids:
        pairs = pairings[nb_id]
        nb_cells = df.loc[df['id'] == nb_id]
        for pair in pairs:
            markdown_id = pair[0]
            matches = nb_cells.loc[nb_cells['cell_id'] == markdown_id, 'labels'].values.tolist()
            labels.append(matches[0])

    return labels

In [7]:
def get_dataset(input_ids, attention_mask,  
                markdown_count, code_count, code_label, base_label, 
                labels = None, ordered = False, repeated = False):
    """Wrap the per-example arrays in a batched, prefetched ``tf.data.Dataset``.

    The features are exposed as a dict keyed ``input_ids``, ``attention_mask``,
    ``markdown_count``, ``code_count``, ``code_label`` and ``base_label``.
    When ``labels`` is given, each element is a ``(features, label)`` pair;
    otherwise it is the feature dict alone.

    Parameters
    ----------
    labels : array-like, optional
        Targets aligned with the features.
    ordered : bool
        When False (default) the dataset is shuffled with a 1024-element buffer.
    repeated : bool
        When True the dataset is repeated indefinitely (applied before shuffling).

    Returns
    -------
    tf.data.Dataset
        Batched with the module-level ``BATCH_SIZE``.
    """
    features = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "markdown_count": markdown_count,
        "code_count": code_count,
        "code_label": code_label,
        "base_label": base_label,
    }
    tensors = features if labels is None else (features, labels)
    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(1024)

    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [8]:
def get_model(): 
    """Build and compile the two-branch regression model.

    One branch runs the pretrained DistilBERT backbone loaded from
    ``BASE_MODEL`` on ``input_ids`` / ``attention_mask`` (each of length
    ``SEQ_LEN``); the other is a dense layer over the four scalar inputs
    ``markdown_count``, ``code_count``, ``code_label`` and ``base_label``.
    Both 32-unit branch outputs are concatenated and mapped to a single
    linear output. The input names match the feature-dict keys produced by
    ``get_dataset``.

    Returns
    -------
    tf.keras.Model
        Compiled with Adam (learning rate 5e-5) and mean squared error loss.
    """
    backbone = transformers.TFDistilBertModel.from_pretrained(BASE_MODEL)
    
    # Token-level inputs for the backbone.
    input_ids = tf.keras.layers.Input(
        shape=(SEQ_LEN,),
        dtype=tf.int32,
        name="input_ids",
    )
    attention_mask = tf.keras.layers.Input(
        shape=(SEQ_LEN,),
        dtype=tf.int32,
        name="attention_mask",
    )
    
    # Scalar side features, one value per example each.
    markdown_count = tf.keras.layers.Input(shape=(1, ), name="markdown_count")
    code_count = tf.keras.layers.Input(shape=(1, ), name="code_count")
    code_label = tf.keras.layers.Input(shape=(1, ), name="code_label")
    base_label = tf.keras.layers.Input(shape=(1, ), name="base_label")
    
    
    concat = tf.keras.layers.Concatenate()([markdown_count, code_count, code_label, base_label])
    
    x = backbone(
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        },
    )
    
    # x[0][:, 0, :] takes the first backbone output at index 0 along axis 1 -
    # presumably the hidden state of the leading [CLS] token; TODO confirm.
    x = tf.keras.layers.Dense(32, activation="linear", dtype="float32")(x[0][:, 0, :])
    
    # Projection of the four scalar features (no activation is passed).
    y = tf.keras.layers.Dense(32)(concat)
    
    xy = tf.keras.layers.Concatenate()([x, y])
    
    # Single unbounded regression output.
    outputs = tf.keras.layers.Dense(1, activation='linear', dtype='float32')(xy)

    model = tf.keras.Model(
        inputs=[input_ids, attention_mask, markdown_count, code_count, code_label, base_label],
        outputs=outputs,
    )
    
    model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.MeanSquaredError()
    )
    
    return model

# Create datasets

In [9]:
# Load the pre-cleaned cell table and discard every row with a missing value.
df_trial = pd.read_csv("../input/clean-data/cleaned_data_.csv").dropna()
df_trial.head()

In [10]:
notebook_ids = pd.unique(df_trial['id'])

tokenizer = transformers.AutoTokenizer.from_pretrained(BASE_MODEL)

# Per-notebook tokenization results. The lists are aligned with
# ``notebook_ids`` by position; the two source dicts are keyed by notebook id.
code_cell_ids, code_input_ids, code_sources = [], [], {}
mkdn_cell_ids, mkdn_input_ids, mkdn_sources = [], [], {}

for nb_id in tqdm(notebook_ids):
    (nb_code_ids, nb_code_tokens, nb_code_src,
     nb_mkdn_ids, nb_mkdn_tokens, nb_mkdn_src) = get_input_ids(df_trial, nb_id, SEQ_LEN, tokenizer)

    code_cell_ids.append(nb_code_ids)
    code_input_ids.append(nb_code_tokens)
    code_sources[nb_id] = nb_code_src

    mkdn_cell_ids.append(nb_mkdn_ids)
    mkdn_input_ids.append(nb_mkdn_tokens)
    mkdn_sources[nb_id] = nb_mkdn_src

In [11]:
# Map each notebook id to its list of [markdown_cell_id, code_cell_id] pairs,
# as chosen by max_similarity from the token ids gathered above.
pairings = {}
for i, nb_id in enumerate(notebook_ids):
    pairings[nb_id] = max_similarity(
        nb_id,
        code_cell_ids[i], code_input_ids[i],
        mkdn_cell_ids[i], mkdn_input_ids[i],
    )

In [12]:
# Zero-based running index of every row within its (notebook id, cell type)
# group, following the current row order of df_trial.
df_trial['cum_count'] = df_trial.groupby(['id', 'cell_type']).cumcount()

In [13]:
# Adds the 'labels' column to df_trial in place; the function returns nothing.
determine_labels(df_trial, notebook_ids)

In [14]:
# One entry per (markdown, code) pairing: tokenizer outputs, the numeric side
# features, and the group key later used for GroupKFold. Note that the
# tokenizer's token_type_ids are bound to the name ``token_ids`` here.
input_ids, attention_mask, token_ids, markdown_count, code_count, code_label, base_label, groups = get_inputs(
    df_trial, notebook_ids, pairings, SEQ_LEN)

In [15]:
# Regression targets: the 'labels' value of each pairing's markdown cell,
# iterated in the same notebook/pair order as get_inputs above.
labels = get_labels(df_trial, notebook_ids, pairings)

In [16]:
# Convert every per-example Python list to a NumPy array so that fold
# membership can later be selected with integer index arrays.
(input_ids_np, attention_mask_np, token_ids_np,
 markdown_count_np, code_count_np, code_label_np,
 base_label_np, labels_np, groups_np) = (
    np.asarray(column)
    for column in (
        input_ids, attention_mask, token_ids,
        markdown_count, code_count, code_label,
        base_label, labels, groups,
    )
)

In [17]:
# Apply one common seeded permutation to all arrays so they stay row-aligned.
(
    input_ids_np, attention_mask_np, token_ids_np,
    markdown_count_np, code_count_np, code_label_np,
    base_label_np, labels_np, groups_np,
) = shuffle(
    input_ids_np, attention_mask_np, token_ids_np,
    markdown_count_np, code_count_np, code_label_np,
    base_label_np, labels_np, groups_np,
    random_state=RANDOM_STATE,
)

# Model training

In [18]:
# Build the model; when a TPU is attached, re-initialise it first and create
# the model inside the distribution strategy's scope.
if TPU is None:
    model = get_model()
else:
    tf.tpu.experimental.initialize_tpu_system(TPU)

    with STRATEGY.scope():
        model = get_model()

model.summary()

In [19]:
kfold = GroupKFold(n_splits=N_SPLITS)

# The fold indices must be computed from the same (shuffled) arrays they are
# applied to. Splitting on the unshuffled ``groups`` list while indexing the
# shuffled ``*_np`` arrays would put rows of one group on both sides of the
# split.
for i, (train_index, val_index) in enumerate(
        kfold.split(input_ids_np, labels_np, groups=groups_np)):
    print(i)
    train_dataset = get_dataset(input_ids_np[train_index],
                                attention_mask_np[train_index],
                                markdown_count_np[train_index],
                                code_count_np[train_index],
                                code_label_np[train_index],
                                base_label_np[train_index],
                                labels_np[train_index],
                                repeated=False)

    val_dataset = get_dataset(input_ids_np[val_index],
                              attention_mask_np[val_index],
                              markdown_count_np[val_index],
                              code_count_np[val_index],
                              code_label_np[val_index],
                              base_label_np[val_index],
                              labels_np[val_index],
                              ordered=True)

    print('model training')

    history = model.fit(train_dataset, validation_data=val_dataset, epochs=10, verbose=1)
    # Only the first fold is trained; the remaining splits are skipped.
    break

In [23]:
# Persist the trained weights next to the notebook (plain literal: the
# original f-string had no placeholders).
model.save_weights("model_weights.h5")