# AI4Code Pytorch DistilBert Baseline

I used a lot of code from Kaggle's starter notebook here: https://www.kaggle.com/code/ryanholbrook/getting-started-with-ai4code

I replaced their model with a DistilBert model.

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import copy
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention
from transformers import LongformerModel, LongformerTokenizer, LongformerConfig

pd.options.display.width = 180
pd.options.display.max_colwidth = 120
data_dir = Path('../input/AI4Code')

In [2]:
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

Test NBs: 100%|██████████| 4/4 [00:00<00:00, 97.22it/s]


In [3]:
test_df

Unnamed: 0,id,cell_id,cell_type,source,rank,pred
0,0009d135ece78d,ddfd239c,code,"import numpy as np # linear algebra\nimport pandas as pd # data processing,\nimport matplotlib.pyplot as plt\nfrom s...",0,0.142857
1,0009d135ece78d,c6cd22db,code,df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')\ndf,1,0.285714
2,0009d135ece78d,1372ae9b,code,"numerical_data = df.loc[:, ~df.columns.isin(['id', ""diagnosis""])]\n\nlabels = df[""diagnosis""].factorize(['B','M'])[0...",2,0.428571
3,0009d135ece78d,90ed07ab,code,"def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):\n # Scaling Data for testing\n ...",3,0.571429
4,0009d135ece78d,7f388a41,code,"# Ploting data with different columns\n#####################################\ncomparison_plot_maker(numerical_data[""...",4,0.714286
...,...,...,...,...,...,...
84,0010a919d60e4f,d3f5c397,markdown,We have 177 rows with missing `Age` and 687 rows with missing `Cabin`,34,1.000000
85,0028856e09c5b7,012c9d02,code,"sns.set()\nsns.pairplot(data1, 2.5)\nplt.show(); = size",0,0.333333
86,0028856e09c5b7,d22526d1,code,"types----------"")\n# is uniques----------"")\n# plt\nimport mis_val +\n = #https://pandas.pydata.org/pandas...",1,0.666667
87,0028856e09c5b7,3ae7ece3,code,"#correlation avoid map\nf,ax verbose 20), 18))\nsns.heatmap(data1.corr(), the annot=True, ; informations bins=50, '....",2,1.000000


In [4]:
# Additional code cells
def clean_code(cell):
    return str(cell).replace("\\n", "\n")


def sample_cells(cells, n):
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]
    else:
        results = []
        step = len(cells) / n
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results


def get_features(df):
    features = dict()
    df = df.sort_values("rank").reset_index(drop=True)
    for idx, sub_df in tqdm(df.groupby("id")):
        features[idx] = dict()
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]
        codes = sample_cells(code_sub_df.source.values, 20)
        features[idx]["total_code"] = total_code
        features[idx]["total_md"] = total_md
        features[idx]["codes"] = codes
    return features

In [5]:
test_fts = get_features(test_df)

100%|██████████| 4/4 [00:00<00:00, 866.05it/s]


In [6]:

python_files = sorted(Path('../input/codesearchnet/python/python/final/jsonl/train/').glob('**/*.jsonl'))
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)
pydf = jsonl_list_to_dataframe(python_files)


In [7]:
pydf

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
0,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition...,"def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n """"""\n Tra...","[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, ver...",Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub...,"[Trains, a, k, -, nearest, neighbors, classifier, for, face, recognition, .]",python,train
1,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition...,"def predict(X_img_path, knn_clf=None, model_path=None, distance_threshold=0.6):\n """"""\n Recognizes faces in gi...","[def, predict, (, X_img_path, ,, knn_clf, =, None, ,, model_path, =, None, ,, distance_threshold, =, 0.6, ), :, if, ...",Recognizes faces in given image using a trained KNN classifier\n\n :param X_img_path: path to image to be recogni...,"[Recognizes, faces, in, given, image, using, a, trained, KNN, classifier]",python,train
2,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition...,"def show_prediction_labels_on_image(img_path, predictions):\n """"""\n Shows the face recognition results visuall...","[def, show_prediction_labels_on_image, (, img_path, ,, predictions, ), :, pil_image, =, Image, ., open, (, img_path,...",Shows the face recognition results visually.\n\n :param img_path: path to image to be recognized\n :param pred...,"[Shows, the, face, recognition, results, visually, .]",python,train
3,ageitgey/face_recognition,face_recognition/api.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/face_recognition/api.py#L...,"def _rect_to_css(rect):\n """"""\n Convert a dlib 'rect' object to a plain tuple in (top, right, bottom, left) or...","[def, _rect_to_css, (, rect, ), :, return, rect, ., top, (, ), ,, rect, ., right, (, ), ,, rect, ., bottom, (, ), ,,...","Convert a dlib 'rect' object to a plain tuple in (top, right, bottom, left) order\n\n :param rect: a dlib 'rect' ...","[Convert, a, dlib, rect, object, to, a, plain, tuple, in, (, top, right, bottom, left, ), order]",python,train
4,ageitgey/face_recognition,face_recognition/api.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/face_recognition/api.py#L...,"def _trim_css_to_bounds(css, image_shape):\n """"""\n Make sure a tuple in (top, right, bottom, left) order is wi...","[def, _trim_css_to_bounds, (, css, ,, image_shape, ), :, return, max, (, css, [, 0, ], ,, 0, ), ,, min, (, css, [, 1...","Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.\n\n :param css: plain t...","[Make, sure, a, tuple, in, (, top, right, bottom, left, ), order, is, within, the, bounds, of, the, image, .]",python,train
...,...,...,...,...,...,...,...,...,...
29995,smdabdoub/phylotoast,bin/extract_shared_or_unique_otuids.py,https://github.com/smdabdoub/phylotoast/blob/0b74ef171e6a84761710548501dfac71285a58a3/bin/extract_shared_or_unique_o...,"def shared_otuids(groups):\n """"""\n Get shared OTUIDs between all unique combinations of groups.\n\n :type g...","[def, shared_otuids, (, groups, ), :, for, g, in, sorted, (, groups, ), :, print, (, ""Number of OTUs in {0}: {1}"", ....",Get shared OTUIDs between all unique combinations of groups.\n\n :type groups: Dict\n :param groups: {Category...,"[Get, shared, OTUIDs, between, all, unique, combinations, of, groups, .]",python,train
29996,smdabdoub/phylotoast,bin/extract_shared_or_unique_otuids.py,https://github.com/smdabdoub/phylotoast/blob/0b74ef171e6a84761710548501dfac71285a58a3/bin/extract_shared_or_unique_o...,"def write_uniques(path, prefix, uniques):\n """"""\n Given a path, the method writes out one file for each group ...","[def, write_uniques, (, path, ,, prefix, ,, uniques, ), :, for, group, in, uniques, :, fp, =, osp, ., join, (, path,...","Given a path, the method writes out one file for each group name in the\n uniques dictionary with the file name i...","[Given, a, path, the, method, writes, out, one, file, for, each, group, name, in, the, uniques, dictionary, with, th...",python,train
29997,smdabdoub/phylotoast,phylotoast/util.py,https://github.com/smdabdoub/phylotoast/blob/0b74ef171e6a84761710548501dfac71285a58a3/phylotoast/util.py#L20-L34,"def storeFASTA(fastaFNH):\n """"""\n Parse the records in a FASTA-format file by first reading the entire file in...","[def, storeFASTA, (, fastaFNH, ), :, fasta, =, file_handle, (, fastaFNH, ), ., read, (, ), return, [, FASTARecord, (...",Parse the records in a FASTA-format file by first reading the entire file into memory.\n\n :type source: path to ...,"[Parse, the, records, in, a, FASTA, -, format, file, by, first, reading, the, entire, file, into, memory, .]",python,train
29998,smdabdoub/phylotoast,phylotoast/util.py,https://github.com/smdabdoub/phylotoast/blob/0b74ef171e6a84761710548501dfac71285a58a3/phylotoast/util.py#L37-L73,"def parseFASTA(fastaFNH):\n """"""\n Parse the records in a FASTA-format file keeping the file open, and reading ...","[def, parseFASTA, (, fastaFNH, ), :, recs, =, [, ], seq, =, [, ], seqID, =, """", descr, =, """", for, line, in, file_ha...","Parse the records in a FASTA-format file keeping the file open, and reading through\n one line at a time.\n\n ...","[Parse, the, records, in, a, FASTA, -, format, file, keeping, the, file, open, and, reading, through, one, line, at,...",python,train


In [8]:
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm
def get_training_corpus():
    to_return = [pydf[i : i + 1000]["code"]
        for i in tqdm(range(0, len(pydf), 1000))]
    to_return.extend([pydf[i : i + 1000]["docstring"]
        for i in tqdm(range(0, len(pydf), 1000))])    
    return to_return
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("../input/codebert-base/codebert-base")

training_corpus = get_training_corpus()


100%|██████████| 413/413 [00:00<00:00, 8423.40it/s]
100%|██████████| 413/413 [00:00<00:00, 11648.96it/s]


In [9]:
#self.model.embeddings.position_embeddings.weight

In [10]:
from tqdm import tqdm
import sys, os
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch

class MarkdownModel(nn.Module):
    def __init__(self, model_path):
#         super(MarkdownModel, self).__init__()
#         self.model = AutoModel.from_pretrained(model_path)
#         self.top = nn.Linear(769, 1)
        
#     def forward(self, ids, mask, fts):
#         x = self.model(ids, mask)[0]
#         x = self.top(torch.cat((x[:, 0, :], fts),1))
#         return x
        super(MarkdownModel, self).__init__()
        #self.max_input_len = 16384
        #self.max_input_len += 2
        self.attention_window = 512
        self.md_max_len = 64
        # lengthen model
        self.model = AutoModel.from_pretrained(model_path)

        self.top = nn.Linear(769, 1)

    def forward(self, ids, mask, fts):
        #global_attention_mask = torch.zeros_like(ids)
        #global_attention_mask[:self.md_max_len] = 0
        #x = self.model(input_ids=ids, attention_mask=mask)[0]
        x = self.model(input_ids=ids, attention_mask=mask)[0]
        #print("fts", fts)
        x = torch.cat((x[:, 0, :], fts), 1)
        #print("/n", x.size())
        x = self.top(x)
        return x
        
from torch.utils.data import DataLoader, Dataset
class MarkdownDataset(Dataset):

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        #old_tokenizer = AutoTokenizer.from_pretrained("../input/codebert-base/codebert-base")
        #self.tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 50265)
        self.fts = fts

    def __getitem__(self, index):
        row = self.df.iloc[index]
        

        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in self.fts[row.id]["codes"]],
            add_special_tokens=True,
            max_length=23,
            padding="max_length",
            truncation=True
        )
        n_md = self.fts[row.id]["total_md"]
        n_code = self.fts[row.id]["total_md"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = inputs['input_ids']
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1])
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(ids))
        ids = torch.LongTensor(ids)

        mask = inputs['attention_mask']
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(mask))
        mask = torch.LongTensor(mask)

        assert len(ids) == self.total_max_len

        return ids, mask, fts, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]

In [11]:
model_path = "../input/codebert-base/codebert-base"
BS = 32
NW = 4
test_df["pct_rank"] = 0
MAX_LEN = 64
test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), md_max_len=64,total_max_len=512, model_name_or_path=model_path, fts=test_fts)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                              pin_memory=False, drop_last=False)

  cpuset_checked))


In [12]:
batch = next(iter(test_loader))

In [13]:
print(batch)

[tensor([[    0, 10431,  2741,  ...,     1,     1,     1],
        [    0, 48342, 25980,  ...,     1,     1,     1],
        [    0, 48342, 39154,  ...,     1,     1,     1],
        ...,
        [    0, 48342, 44457,  ...,     1,     1,     1],
        [    0,   170,    40,  ...,     1,     1,     1],
        [    0,   170,    67,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), tensor([[0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
        [0.5000],
   

In [14]:
from tqdm import tqdm
from collections import OrderedDict
def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in tqdm(enumerate(tbar)):
            inputs, target = read_data(data)

            pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
    
    return np.concatenate(labels), np.concatenate(preds)

def predict(model_path, ckpt_path):
    model = MarkdownModel(model_path)
    model = model.cuda()
    #model.load_state_dict(torch.load(ckpt_path))
    state_dict = torch.load(ckpt_path)
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:] # remove `module.`
        new_state_dict[name] = v
    # load params
    model.load_state_dict(new_state_dict, strict=False)
    BS = 32
    NW = 4
    MAX_LEN = 64
    test_df["pct_rank"] = 0
    print(model)
    test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), md_max_len=64,total_max_len=512, model_name_or_path=model_path, fts=test_fts)
    test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                              pin_memory=False, drop_last=False)
    _, y_test = validate(model, test_loader)
    return y_test

In [15]:
model = MarkdownModel("../input/codebert-base/codebert-base/")

ckpt_path = "../input/ai4code-models/model-0 (1).bin"
state_dict = torch.load(ckpt_path)
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[7:] # remove `module.`
    new_state_dict[name] = v
    # load params
    
model.load_state_dict(new_state_dict, strict=False)

_IncompatibleKeys(missing_keys=['model.pooler.dense.weight', 'model.pooler.dense.bias'], unexpected_keys=[])

In [16]:
#model.model.embeddings.position_embeddings.weight

In [17]:
#roberta = AutoModel.from_pretrained("../input/roberta-base")


In [18]:
#model.model.embeddings

In [19]:
#roberta.embeddings.position_embeddings.weight

In [20]:
code_bert = AutoModel.from_pretrained("../input/codebert-base/codebert-base")


In [21]:
#code_bert.embeddings.position_embeddings.weight

In [22]:
#longformer_model = LongformerModel.from_pretrained("../input/allenailongformerbase4096/longformer")

In [23]:
model_path = "../input/codebert-base/codebert-base/"
ckpt_path = "../input/ai4code-models/model-0 (1).bin"
y_test_1 = predict(model_path, ckpt_path)


MarkdownModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,


0it [00:00, ?it/s][A
1it [00:02,  2.31s/it][A

 50%|█████     | 1/2 [00:02<00:02,  2.32s/it]


2it [00:02,  1.06s/it][A

100%|██████████| 2/2 [00:02<00:00,  1.38s/it]

2it [00:02,  1.38s/it]







In [24]:
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test_1

In [25]:
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

Unnamed: 0,id,cell_order
0,0009d135ece78d,0a226b6a ddfd239c 8cb8d28a c6cd22db e25aa9bd 1372ae9b ba55e576 90ed07ab f9893819 7f388a41 39e937ec 2843a25a 06dbf8cf
1,0010483c12ba9b,7f270e34 54c7cab3 fe66203e 7844d5f8 5ce8863c 4a0777c4 4703bb6d 4a32c095 865ad516 02a0be6d
2,0010a919d60e4f,23607d04 b7578789 aafc3d23 bbff12d4 80e077ec b190ebb4 584f6568 8ce62db4 d3f5c397 5115ebe5 ed415c3c 5e8c5e7e 7f53de45...
3,0028856e09c5b7,eb293dfc 012c9d02 d22526d1 3ae7ece3


In [26]:
sub_df.to_csv("submission.csv", index=False)