#AI4Code: Inference

In [None]:
import xgboost as xgb
import pandas as pd
from pathlib import Path
from tqdm import tqdm
# Root directory of the competition data; the `test` notebooks and
# `sample_submission.csv` are read relative to this path.
data_dir = Path('../input/AI4Code')

In [None]:
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame of cells.

    Args:
        path: ``pathlib.Path`` to the notebook's ``.json`` file. Its stem
            (file name without extension) is used as the notebook id.

    Returns:
        A DataFrame whose index is named ``cell_id``, with ``cell_type``
        read as a categorical, ``source`` read as ``str``, and an extra
        ``id`` column holding the notebook id on every row.
    """
    column_types = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_types)
    # Tag every cell with the notebook it came from.
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

In [None]:
# Read every test notebook and stack them into one frame indexed by
# (id, cell_id).
paths_test = list(data_dir.joinpath('test').glob('*.json'))
notebooks_test = [
    read_notebook(nb_path)
    for nb_path in tqdm(paths_test, desc='Test NBs')
]
df_test = pd.concat(notebooks_test)
# Move the notebook id into the index, in front of cell_id.
df_test = df_test.set_index('id', append=True)
df_test = df_test.swaplevel()
# Sort by notebook id only; sort_remaining=False leaves cell_id unsorted
# within each notebook.
df_test = df_test.sort_index(level='id', sort_remaining=False)

In [None]:
import os
import re

import yaml
import scipy
import pandas as pd

from pandarallel import pandarallel

# Initialize pandarallel (with a progress bar) so that `parallel_apply`
# can be called on pandas objects later in the notebook.
pandarallel.initialize(progress_bar=True)

def sub_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy and does not cross newlines, so each tag on a
    line is removed individually.
    """
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub("", text)


def sub_latex_math(text):
    """Strip LaTeX math and commands from ``text``.

    Newlines are deleted first, then three kinds of spans are removed in
    order: dollar-delimited math (any number of ``$`` as delimiter),
    ``begin ... end{...}`` environments, and remaining backslash commands.

    Returns:
        The cleaned string.
    """
    removal_patterns = (
        r"(\$+)(?:(?!\1)[\s\S])*\1",
        r"\\begin.*?\\end{.*?}",
        r"\\[a-zA-Z]+",
    )

    # A "\b" in the raw text may have been decoded as the backspace
    # character (\x08); turn it back into a literal backslash + "b" so the
    # command patterns below can see it.
    stripped = text.replace("\b", "\\b").replace("\n", "")

    for pattern in removal_patterns:
        stripped = re.sub(pattern, "", stripped)
    return stripped


def sub_links(text):
    """Return ``text`` with URL-like spans removed.

    A span is an optional ``http``/``https`` scheme, then ``://``, then a
    run of word characters and ``. / ? = & %`` ending at a word boundary.
    """
    link_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b')
    return link_pattern.sub("", text)


def sub_email(text):
    """Return ``text`` with every whitespace-free token containing ``@``
    removed, together with one whitespace character after it, if any."""
    email_pattern = re.compile(r'\S*@\S*\s?')
    return email_pattern.sub("", text)


def preprocess_text(text):
    """Normalize ``text`` to lowercase words separated by single spaces.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Steps, in order: replace non-word characters with spaces,
    drop single letters surrounded by whitespace, collapse whitespace runs,
    drop a leading ``b`` token, lowercase, and remove digits.

    Returns:
        The normalized string (it may keep a leading or trailing space).
    """
    # (pattern, replacement, flags), applied in order before lowercasing.
    substitutions = (
        # Every non-word character becomes a space.
        (r"\W", " ", 0),
        # A single letter with whitespace on both sides is dropped.
        (r"\s+[a-zA-Z]\s+", " ", 0),
        # NOTE(review): this matches a literal "^", which cannot occur after
        # the \W step above, so it is effectively a no-op. Kept unchanged so
        # the cleaning stays identical - confirm before "fixing" it.
        (r"\^[a-zA-Z]\s+", " ", 0),
        # Collapse runs of whitespace into one space.
        (r"\s+", " ", re.I),
        # Drop a leading "b" token (e.g. left over from a bytes literal).
        (r"^b\s+", "", 0),
    )

    result = str(text)
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)

    # Lowercase, then strip all digit runs.
    return re.sub(r"[0-9]+", "", result.lower())


def sub_all(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    The steps run in a fixed order: HTML tags, LaTeX math, links, e-mail
    addresses, then the general text normalization.
    """
    pipeline = (
        sub_html_tags,
        sub_latex_math,
        sub_links,
        sub_email,
        preprocess_text,
    )
    for step in pipeline:
        text = step(text)
    return text


In [None]:
 # Clean the source of markdown cells only; code cells keep their raw text.
 # The boolean mask must be built from `df_test` itself so that it aligns
 # with the rows being read and assigned; it is computed once for both sides.
 markdown_mask = df_test['cell_type'] == "markdown"
 df_test.loc[markdown_mask, 'source'] = (
     df_test.loc[markdown_mask, 'source'].parallel_apply(sub_all)
 )
   

In [None]:
import pickle

# Load the pickled `tfidf` object used to featurize the test cells.
# NOTE(review): unpickling can execute arbitrary code - only load
# artifacts from a source you trust.
with open('../input/xgbrankerai4code/tfidf.pkl', mode="rb") as tfidf_file:
    tfidf = pickle.load(tfidf_file)

In [None]:
# Imported in this cell because nothing earlier in the notebook binds
# `np` or `sparse`, both of which are needed below.
import numpy as np
from scipy import sparse

# Text features: transform each cell's source with the loaded `tfidf` object.
X_test = tfidf.transform(df_test['source'].astype(str))
# Append one extra column: for code cells, the 1-based running count of the
# cell within its (id, cell_type) group; 0 for every other cell type.
X_test = sparse.hstack((
    X_test,
    np.where(
        df_test['cell_type'] == 'code',
        df_test.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))

In [None]:
# NOTE(review): `model` is not defined in any visible cell of this notebook;
# it has to be loaded before this cell runs - confirm where it comes from.
predicted_rank = model.predict(X_test)
y_infer = pd.DataFrame({'rank': predicted_rank}, index=df_test.index)
# Order cells by predicted rank within each notebook, then collect the
# cell ids of every notebook into a list in that order.
y_infer = (
    y_infer
    .sort_values(['id', 'rank'])
    .reset_index('cell_id')
    .groupby('id')['cell_id']
    .apply(list)
)
y_infer

In [None]:
# Read the sample submission as a Series indexed by `id`.
# The `squeeze=True` keyword of `read_csv` was deprecated in pandas 1.4 and
# removed in 2.0; squeezing the single remaining column after the read
# yields the same Series on both old and new pandas versions.
y_sample = pd.read_csv(
    data_dir / 'sample_submission.csv',
    index_col='id',
).squeeze('columns')
y_sample

In [None]:
# Join each notebook's list of cell ids into one space-separated string,
# then name the index 'id' and the values 'cell_order'.
y_submit = y_infer.apply(' '.join)
y_submit = y_submit.rename_axis('id')
y_submit = y_submit.rename('cell_order')
y_submit

In [None]:
# Write the submission Series to `submission.csv` in the working directory.
y_submit.to_csv('submission.csv')