# SET UP

In [1]:
import glob
import json
import os
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import AutoModel, AutoTokenizer
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold
from tensorflow.keras.utils import plot_model
from tqdm.notebook import tqdm
from tqdm import tqdm
from scipy import spatial
from collections import OrderedDict
from numpy import genfromtxt

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import copy 
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [2]:
DATA_PATH = "../input/AI4Code"
BASE_MODEL = '../input/huggingface-bert-variants/distilbert-base-multilingual-cased/distilbert-base-multilingual-cased'
N_SPLITS = 5
SEQ_LEN = 128
RANDOM_STATE = 42
LIMIT = 1000

try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    STRATEGY = tf.distribute.experimental.TPUStrategy(TPU)
    BATCH_SIZE = 128 * STRATEGY.num_replicas_in_sync
except Exception:
    TPU = None
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 32
    #LIMIT = 10_000

print("TensorFlow", tf.__version__)

if TPU is not None:
    print("Using TPU v3-8")
else:
    print("Using GPU/CPU")

print("Batch size:", BATCH_SIZE)

TensorFlow 2.6.4
Using GPU/CPU
Batch size: 32


# HELPER FUNCTIONS

In [3]:
def read_notebook(path):
    with open(path) as file:
        df = pd.DataFrame(json.load(file))
    df["id"] = os.path.splitext(os.path.basename(path))[0]
    return df

def clean_source(sources):
    clean_source = copy.deepcopy(sources)
    
    for i, source in enumerate(tqdm(clean_source)):
        source = source.lower()
        source = re.sub("[^\w\s]", " ", source)
        source = source.split()
        
        source_nostop = []
        for word in source:
            if word not in stopwords.words():
                source_nostop.append(word)
                
        lemmatizer = WordNetLemmatizer()
        source_lem = []
        for word in source_nostop:
            source_lem.append(lemmatizer.lemmatize(word))
        
        source = " ".join(source_lem)
        clean_source[i] = source
    
    return clean_source

def expand_order(row):
    cell_ids = row[1].split(" ")
    df = pd.DataFrame(
        {
            "id": [row[0] for _ in range(len(cell_ids))],
            "cell_id": cell_ids,
            "rank": range(len(cell_ids)),
        }
    )
    
    return df

In [4]:
def get_input_ids(df, notebook_id, maxlen, tokenizer):
    # get lists of code_ids, input_ids (from tokenizer), source - all separated based on code/markdown
    df_temp = df.loc[df['id'] == notebook_id]
    
    code = df_temp[df_temp['cell_type'] == 'code']
    code_ids = code['cell_id'].values.tolist() 
    code_sources = code['clean_source'].values.tolist()  
    
    code_encoded = tokenizer.batch_encode_plus(code_sources, add_special_tokens=True, 
                                               return_token_type_ids=True, truncation = True,
                                               padding='max_length', max_length=maxlen)
    
    markdown = df_temp[df_temp['cell_type'] == 'markdown']
    markdown_ids = markdown['cell_id'].values.tolist()   
    markdown_sources = markdown['clean_source'].values.tolist()   
    
    markdown_encoded = tokenizer.batch_encode_plus(markdown_sources, add_special_tokens=True, 
                                               return_token_type_ids=True, truncation = True,
                                               padding='max_length', max_length=maxlen)
    
    return (code_ids, code_encoded['input_ids'], code_sources, 
            markdown_ids, markdown_encoded['input_ids'], markdown_sources)


def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

def max_similarity(nb_id, code_cell_ids, code_input_ids, mkdn_cell_ids, mkdn_input_ids):
    # get markdown-code pair with the highest jaccard similarity (based on input_ids)
    pairings = []
    
    for m_counter in range(len(mkdn_cell_ids)):
        max_similarity = 0 
        best_c_id = 0
        current_m_id = mkdn_cell_ids[m_counter]
        current_m_ii = mkdn_input_ids[m_counter]
        
        for c_counter in range(len(code_cell_ids)):
            current_c_id = code_cell_ids[c_counter]
            current_c_ii = code_input_ids[c_counter]
            
            temp_similarity = jaccard_similarity(current_m_ii, current_c_ii)
            if temp_similarity > max_similarity:
                max_similarity = temp_similarity 
                best_c_id = current_c_id
            
        pairings.append([current_m_id, best_c_id])
    
    return pairings

In [5]:
def get_closest_code_rank(m_rank, code_ranks):  
    min_dis = 0 
    closest_rank = 0 
    
    code_ranks_sorted = copy.deepcopy(code_ranks)
    code_ranks_sorted.sort()

    for i, c_rank in enumerate(code_ranks_sorted):
        distance = abs(m_rank - c_rank)
        prev_distance = abs(m_rank - code_ranks[i-1])
        if i == 0:
            min_dis = distance
            closest_rank = c_rank
        else:
            if (distance < min_dis) and (c_rank < m_rank):
                min_dis = distance
                closest_rank = c_rank
    
    return closest_rank

def determine_labels(df, notebook_ids):
    labels = []
    for nb_id in notebook_ids:
        df_temp = df.loc[df['id'] == nb_id]
           
        # get cumcounts for code and markdown
        code_count = df_temp.loc[df['cell_type'] == 'code']['cum_count'].values.tolist()
        markdown_count = df_temp.loc[df['cell_type'] == 'markdown']['cum_count'].values.tolist()
        
        # get max cumcounts for code and markdown 
        max_code_count = max(code_count)
        max_markdown_count = max(markdown_count)
         
        # calculate code labels 
        code_labels = [(count+1)/(max_code_count+2) for count in code_count]
        
        # get min code labels 
        min_code_label = min(code_labels)
        
        # calculate markdown labels 
        markdown_labels = []
        for m_count in markdown_count:
            markdown_labels.append(-1)
                
        labels += code_labels 
        labels += markdown_labels 
        
    df['labels'] = labels 

In [6]:
def get_inputs(df, notebook_ids, pairings, maxlen):   
    markdown_count = []
    code_count = []
    code_label = []
    base_label = []
    
    
    input_ids = []
    attention_mask = []
    token_type_ids = []
    
    for nb_id in tqdm(notebook_ids):
        pairs = pairings[nb_id]
        for pair in pairs:
            
            markdown_id = pair[0]
            code_id = pair[1]
            
            m_count = df.loc[df['cell_id'] == markdown_id]['cum_count'].values.tolist()
            c_count = df.loc[df['cell_id'] == code_id]['cum_count'].values.tolist()
            c_label = df.loc[df['cell_id'] == code_id]['labels'].values.tolist()
            b_label = df.loc[(df['id'] == nb_id) & (df['cum_count'] == 0)]['labels'].values.tolist()
            
            markdown_count.append(m_count[0])
            code_count.append(c_count[0])
            code_label.append(c_label[0])
            base_label.append(b_label[0])
            
            # bert inputs
            
            m_source = df.loc[df['cell_id'] == markdown_id]['clean_source'].values.tolist()
            c_source = df.loc[df['cell_id'] == code_id]['clean_source'].values.tolist()
            
            encoding = tokenizer.encode_plus([m_source[0], c_source[0]], add_special_tokens=True, 
                                               return_token_type_ids=True, truncation = True,
                                               padding='max_length', max_length=maxlen)
            
            input_ids.append(encoding['input_ids'])
            attention_mask.append(encoding['attention_mask'])
            token_type_ids.append(encoding['token_type_ids'])
    
    return (input_ids, attention_mask, token_type_ids, markdown_count, code_count, code_label, base_label)

def get_labels(df, notebook_ids, pairings):
    labels = []
    
    for nb_id in notebook_ids:
        pairs = pairings[nb_id]
        for pair in pairs:
            markdown_id = pair[0]
            
            temp_m_out = df.loc[df['cell_id'] == markdown_id]['labels'].values.tolist()
            
            labels.append(temp_m_out[0])
    
    return labels

In [7]:
def get_dataset(input_ids, attention_mask,  
                markdown_count, code_count, code_label, base_label, 
                labels = None, ordered = False, repeated = False):
    
    if labels is not None:
        dataset = tf.data.Dataset.from_tensor_slices(
            ({"input_ids": input_ids, "attention_mask": attention_mask, 
              "markdown_count": markdown_count, "code_count": code_count, 
             "code_label": code_label, "base_label": base_label}, labels)
        )
    else:
        dataset = tf.data.Dataset.from_tensor_slices(
            {"input_ids": input_ids, "attention_mask": attention_mask, 
              "markdown_count": markdown_count, "code_count": code_count, 
             "code_label": code_label, "base_label": base_label}
        )
        
    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(1024)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [8]:
def get_model(): 
    backbone = transformers.TFDistilBertModel.from_pretrained(BASE_MODEL)
    
    input_ids = tf.keras.layers.Input(
        shape=(SEQ_LEN,),
        dtype=tf.int32,
        name="input_ids",
    )
    attention_mask = tf.keras.layers.Input(
        shape=(SEQ_LEN,),
        dtype=tf.int32,
        name="attention_mask",
    )
    
    markdown_count = tf.keras.layers.Input(shape=(1, ), name="markdown_count")
    code_count = tf.keras.layers.Input(shape=(1, ), name="code_count")
    code_label = tf.keras.layers.Input(shape=(1, ), name="code_label")
    base_label = tf.keras.layers.Input(shape=(1, ), name="base_label")
    
    
    concat = tf.keras.layers.Concatenate()([markdown_count, code_count, code_label, base_label])
    
    x = backbone(
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        },
    )
    
    x = tf.keras.layers.Dense(32, activation="linear", dtype="float32")(x[0][:, 0, :])
    
    y = tf.keras.layers.Dense(32)(concat)
    
    xy = tf.keras.layers.Concatenate()([x, y])
    
    outputs = tf.keras.layers.Dense(1, activation='linear', dtype='float32')(xy)

    model = tf.keras.Model(
        inputs=[input_ids, attention_mask, markdown_count, code_count, code_label, base_label],
        outputs=outputs,
    )
    
    model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.MeanSquaredError()
    )
    
    return model

# CREATE DATASETS

In [9]:
paths = glob.glob(os.path.join(DATA_PATH, "train", "*.json"))
paths = paths[1000:1500]

source_df = pd.concat([read_notebook(x) for x in tqdm(paths, total=len(paths))])
source_df = source_df.rename_axis('cell_id').reset_index()

order_df_ = pd.read_csv(os.path.join(DATA_PATH, "train_orders.csv"), index_col="id")
order_df = pd.concat(
    [expand_order(row) for row in tqdm(order_df_.itertuples(), total=len(order_df_))]
)


ancestors_df = pd.read_csv(
    os.path.join(DATA_PATH, "train_ancestors.csv"),
    usecols=["id", "ancestor_id"],
    index_col="id",
)

df_test = source_df.merge(order_df, on=["id", "cell_id"]).merge(ancestors_df, on='id')
df_test = df_test.dropna()
df_test.head()

100%|██████████| 500/500 [00:03<00:00, 132.10it/s]
100%|██████████| 139256/139256 [01:06<00:00, 2107.31it/s]


Unnamed: 0,cell_id,cell_type,source,id,rank,ancestor_id
0,257649ff,code,# This Python 3 environment comes with many he...,70d14942600e8f,0,9172097
1,6941928d,code,import matplotlib.pyplot as plt\nimport seabor...,70d14942600e8f,1,9172097
2,e36ba491,code,df1 = pd.read_csv('../input/customer-segmentat...,70d14942600e8f,2,9172097
3,dfa2cc30,code,df1.head(),70d14942600e8f,3,9172097
4,ace48285,code,df1.isnull().sum(),70d14942600e8f,4,9172097


In [11]:
cleaned_source = clean_source(df_test['source'])
df_test['clean_source'] = cleaned_source
df_test.head()

100%|██████████| 24098/24098 [1:10:14<00:00,  5.72it/s]  


Unnamed: 0,cell_id,cell_type,source,id,rank,ancestor_id,clean_source
0,257649ff,code,# This Python 3 environment comes with many he...,70d14942600e8f,0,9172097,python 3 environment come many helpful analyti...
1,6941928d,code,import matplotlib.pyplot as plt\nimport seabor...,70d14942600e8f,1,9172097,import matplotlib pyplot plt import seaborn sn
2,e36ba491,code,df1 = pd.read_csv('../input/customer-segmentat...,70d14942600e8f,2,9172097,df1 pd read_csv input customer segmentation tu...
3,dfa2cc30,code,df1.head(),70d14942600e8f,3,9172097,df1 head
4,ace48285,code,df1.isnull().sum(),70d14942600e8f,4,9172097,df1 isnull sum


In [12]:
notebook_ids = pd.unique(df_test['id'])

tokenizer = transformers.AutoTokenizer.from_pretrained(BASE_MODEL)

code_cell_ids = []
code_input_ids = []
code_sources = {}

mkdn_cell_ids = []
mkdn_input_ids = []
mkdn_sources = {}


for nb_id in tqdm(notebook_ids):
    tci, tce, tcs, tmi, tme, tms = get_input_ids(df_test, nb_id, SEQ_LEN, tokenizer)
    code_cell_ids.append(tci) 
    code_input_ids.append(tce) 
    code_sources[nb_id] = tcs
    
    mkdn_cell_ids.append(tmi)
    mkdn_input_ids.append(tme) 
    mkdn_sources[nb_id] = tms

100%|██████████| 500/500 [00:05<00:00, 91.35it/s] 


In [13]:
pairings = {}
for i in range(len(notebook_ids)):
    pair = max_similarity(notebook_ids[i], 
                                 code_cell_ids[i], code_input_ids[i], 
                                 mkdn_cell_ids[i], mkdn_input_ids[i])
    
    pairings[notebook_ids[i]] = pair

In [14]:
df_test['cum_count'] = df_test.groupby(['id', 'cell_type']).cumcount()
determine_labels(df_test, notebook_ids)
df_test.head(10)

Unnamed: 0,cell_id,cell_type,source,id,rank,ancestor_id,clean_source,cum_count,labels
0,257649ff,code,# This Python 3 environment comes with many he...,70d14942600e8f,0,9172097,python 3 environment come many helpful analyti...,0,0.013889
1,6941928d,code,import matplotlib.pyplot as plt\nimport seabor...,70d14942600e8f,1,9172097,import matplotlib pyplot plt import seaborn sn,1,0.027778
2,e36ba491,code,df1 = pd.read_csv('../input/customer-segmentat...,70d14942600e8f,2,9172097,df1 pd read_csv input customer segmentation tu...,2,0.041667
3,dfa2cc30,code,df1.head(),70d14942600e8f,3,9172097,df1 head,3,0.055556
4,ace48285,code,df1.isnull().sum(),70d14942600e8f,4,9172097,df1 isnull sum,4,0.069444
5,0e2b7be6,code,df1.describe(),70d14942600e8f,5,9172097,df1 describe,5,0.083333
6,0e2b307f,code,sns.countplot(df1['Gender']),70d14942600e8f,9,9172097,sn countplot df1 gender,6,0.097222
7,79f1bafc,code,df1['Age'].value_counts().head(),70d14942600e8f,12,9172097,df1 age value_counts head,7,0.111111
8,de3cb9ab,code,"ageplot = sns.distplot(df1['Age'],bins=5)",70d14942600e8f,13,9172097,ageplot sn distplot df1 age bin 5,8,0.125
9,9c442477,code,"sns.boxplot(x='Gender',y='Age',data=df1)",70d14942600e8f,14,9172097,sn boxplot x gender age data df1,9,0.138889


In [15]:
input_ids, attention_mask, token_ids, markdown_count, code_count, code_label, base_label = get_inputs(
    df_test, notebook_ids, pairings, SEQ_LEN)

labels = get_labels(df_test, notebook_ids, pairings)

100%|██████████| 500/500 [02:02<00:00,  4.09it/s]


In [16]:
input_ids_np = np.asarray(input_ids)
attention_mask_np = np.asarray(attention_mask)
token_ids_np = np.asarray(token_ids)
markdown_count_np = np.asarray(markdown_count)
code_count_np = np.asarray(code_count)
code_label_np = np.asarray(code_label)
base_label_np = np.asarray(base_label)
labels_np = np.asarray(labels)

# INFERENCE

In [17]:
reloaded_model = get_model()
reloaded_model.summary()
reloaded_model.load_weights("../input/distilmodelweights/model_weights.h5")

2022-09-30 03:17:18.728604: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-09-30 03:17:18.759146: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at ../input/huggingface-bert-variants/distilbert-base-multilingual-cased/distilbert-base-multilingual-cased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expe

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB TFBaseModelOutput(la 134734080   attention_mask[0][0]             
                                                                 input_ids[0][0]                  
__________________________________________________________________________________________________
markdown_count (InputLayer)     [(None, 1)]          0                                        

In [18]:
test_dataset = get_dataset(input_ids_np, attention_mask_np,markdown_count_np,
                           code_count_np,code_label_np, base_label_np, ordered=True)

In [19]:
predictions = reloaded_model.predict(test_dataset)

2022-09-30 03:17:39.056384: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [21]:
df_test.loc[df_test["cell_type"] == "markdown", "labels"] = predictions
df_test.tail(15)

Unnamed: 0,cell_id,cell_type,source,id,rank,ancestor_id,clean_source,cum_count,labels
24083,d3bca72e,markdown,**Accuracy of linear and ridge are more or les...,09b99549aeb019,37,d6be9b28,accuracy linear ridge le coefficient value sim...,13,0.665109
24084,00a5d650,markdown,"**""displacement"",""horsepower"",""weight"" columns...",09b99549aeb019,16,d6be9b28,displacement horsepower weight column right sk...,14,0.478611
24085,220a0d52,markdown,**Many of the coefficients have become 0 so we...,09b99549aeb019,33,d6be9b28,many coefficient become 0 drop dimension model...,15,0.591865
24086,7d7952e5,markdown,**Lasso making many coeficients to zero. As me...,09b99549aeb019,54,d6be9b28,lasso making many coeficients zero mentioned u...,16,0.583408
24087,1a603411,markdown,# Separate independent and dependent variables,09b99549aeb019,12,d6be9b28,separate independent dependent variable,17,0.453316
24088,0422cfa8,markdown,**Lasso is getting 86% accuracy with nearly ha...,09b99549aeb019,56,d6be9b28,lasso getting 86 accuracy nearly half number d...,18,0.679201
24089,c1f0fd75,markdown,# Importing Neccessary Packages,09b99549aeb019,0,d6be9b28,importing neccessary package,19,0.288517
24090,d1daf17b,markdown,**Our dimensions are increased to 29**,09b99549aeb019,44,d6be9b28,dimension increased 29,20,0.851936
24091,059e88eb,markdown,**To build machine learning model we need to h...,09b99549aeb019,5,d6be9b28,build machine learning model need number every...,21,0.211954
24092,b9f0b3ee,markdown,**origin 1 is more compared to others**\n\n,09b99549aeb019,18,d6be9b28,origin 1 compared others,22,0.629013


In [22]:
from bisect import bisect


def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        gt_ = gt[0].split()
        pred_ = pred[0].split()
        ranks = [gt_.index(x) for x in pred_]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt_)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

In [23]:
sub = df_test.sort_values("labels").groupby("id", as_index=False)["cell_id"].apply(lambda x: " ".join(x))
sub.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub.sort_values(by=['id'])
display(sub)

Unnamed: 0,id,cell_order
0,0023fe53ace4bb,0b04591a 5214fe1e 46f9d46f 4287afca 9fe41d6a d...
1,004d12fbcc887d,6f2590cf 65212cde 7d8fc80f 66cea36b 242a777f e...
2,009d4604b7096d,56f8f26c 91eb0252 2cca726f 08168e1f cb9000a8 3...
3,00d66816755434,c3a96e79 1782e8b5 4f1ee562 d5a7980f f616a608 1...
4,01c82aed3efb00,0c09a46f cf8d6467 8737e252 8ad1504e fe01cf67 a...
...,...,...
495,fcdf6606e4d656,e944a6d9 4c5062b4 e2086e24 53efb8f0 6b2798ac c...
496,fdfc12ab30ec8c,874ffedf 82c132ec 3ab8dbc3 71d58846 04e5e770 8...
497,fe75b783710d1a,4d60b707 6d17cb5f 3a02aeb1 2d646d78 af2febab d...
498,feff9fad536320,02dfc1e8 27170685 57f31360 192b2420 5dee46eb 3...


In [24]:
valid_df = order_df_.loc[notebook_ids]
valid_df = valid_df.sort_values(by=['id'])
valid_df.head()

Unnamed: 0_level_0,cell_order
id,Unnamed: 1_level_1
0023fe53ace4bb,4142961d 46f9d46f 37365702 0b04591a 5214fe1e 4...
004d12fbcc887d,6f2590cf 5c32217f 65212cde 8740e320 7d8fc80f 3...
009d4604b7096d,e01b01fb 56f8f26c e5c8b773 91eb0252 2cca726f 0...
00d66816755434,f220c26c 1cab8f13 c3a96e79 f212c08b 1782e8b5 4...
01c82aed3efb00,0c09a46f cf8d6467 8737e252 8ad1504e fe01cf67 a...


In [25]:
y_valid = valid_df.loc[notebook_ids].groupby("id")["cell_order"].apply(list)
y_valid.head(10)

id
0023fe53ace4bb    [4142961d 46f9d46f 37365702 0b04591a 5214fe1e ...
004d12fbcc887d    [6f2590cf 5c32217f 65212cde 8740e320 7d8fc80f ...
009d4604b7096d    [e01b01fb 56f8f26c e5c8b773 91eb0252 2cca726f ...
00d66816755434    [f220c26c 1cab8f13 c3a96e79 f212c08b 1782e8b5 ...
01c82aed3efb00    [0c09a46f cf8d6467 8737e252 8ad1504e fe01cf67 ...
0339a0c6830962    [e87de264 f0cafd2c 25d303e5 645f9286 46afe862 ...
03d079ad92e600    [2696c961 92ddc002 562a1e42 b1ca2487 81a14628 ...
04164cc6d74a0c    [c8d28060 4b44e382 6396186e 5f9b6431 98ce2146 ...
04399d96fd2629    [b78a0392 0b0b39a6 feac010b 36ab8e53 9748e779 ...
0460990b496a60    [33d01c16 0b692dae df48eb7d c28ed07c 002bc9a9 ...
Name: cell_order, dtype: object

In [26]:
preds_df = sub.set_index("id")
y_preds = preds_df.loc[notebook_ids].groupby("id")["cell_order"].apply(list)
y_preds.head(10)

id
0023fe53ace4bb    [0b04591a 5214fe1e 46f9d46f 4287afca 9fe41d6a ...
004d12fbcc887d    [6f2590cf 65212cde 7d8fc80f 66cea36b 242a777f ...
009d4604b7096d    [56f8f26c 91eb0252 2cca726f 08168e1f cb9000a8 ...
00d66816755434    [c3a96e79 1782e8b5 4f1ee562 d5a7980f f616a608 ...
01c82aed3efb00    [0c09a46f cf8d6467 8737e252 8ad1504e fe01cf67 ...
0339a0c6830962    [e87de264 f0cafd2c 645f9286 4d6cc01f e165a9d5 ...
03d079ad92e600    [49fe1d92 81a14628 2696c961 562a1e42 92ddc002 ...
04164cc6d74a0c    [c8d28060 4b44e382 6396186e 5f9b6431 98ce2146 ...
04399d96fd2629    [b78a0392 0b0b39a6 feac010b 36ab8e53 9748e779 ...
0460990b496a60    [0b692dae df48eb7d c28ed07c 4f4fd574 002bc9a9 ...
Name: cell_order, dtype: object

In [27]:
kendall_tau(y_valid, y_preds)

0.6662800640655739