# AI4Code: merge data

## Imports

In [1]:
from pathlib import Path
import os

import json
import pandas as pd

import pyarrow.parquet as pq
import pyarrow as pa

from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

## Load data paths

In [2]:
# Cap on how many training notebooks to pick up; None means "take all of them".
NUM_TRAINS = None
# Root folder holding the raw "train" and "test" notebook JSON files.
DATA_DIR = Path("../../data/raw")

# Slicing with [:None] keeps the whole list, so a single expression covers both
# the capped and the uncapped case.
train_paths = list((DATA_DIR / "train").glob("*.json"))[:NUM_TRAINS]
test_paths = list((DATA_DIR / "test").glob("*.json"))


## Processing functions

In [3]:
def read_notebook(glob_path):
    """
    Read one notebook JSON file into a DataFrame indexed by cell id.

    Parameters
    ----------
    glob_path : str or os.PathLike
        Location of the notebook JSON file. The file name without its suffix
        is used as the notebook id.

    Returns
    -------
    pandas.DataFrame
        The frame parsed by ``pd.read_json`` (``cell_type`` requested as
        category, ``source`` as str), with an extra ``id`` column holding the
        notebook id and the index named ``cell_id``.
    """
    # Coerce to Path so plain string paths work too; ``.stem`` below needs a Path.
    notebook_path = Path(glob_path)
    return (
        pd.read_json(notebook_path, dtype={"cell_type": "category", "source": "str"})
        .assign(id=notebook_path.stem)
        .rename_axis("cell_id")
    )


def merge_notebooks(notebooks_list):
    """
    Stack per-notebook frames into one frame indexed by (id, cell_id).

    Each input frame is expected to carry an ``id`` column; that column is
    moved into the index in front of the existing index level. Rows are then
    sorted on the ``id`` level only (``sort_remaining=False``), so the other
    index level is not used as a sort key.
    """
    stacked = pd.concat(notebooks_list)
    # Append "id" as the inner index level, then swap so it becomes the outer one.
    with_id_level = stacked.set_index("id", append=True)
    id_first = with_id_level.swaplevel()
    return id_first.sort_index(level="id", sort_remaining=False)


In [4]:
def load_example(id, is_train=True):
    """
    Load the raw JSON file of one notebook and return its parsed content.

    Parameters
    ----------
    id : str
        Notebook identifier, i.e. the file name without the ``.json`` suffix.
        (The name shadows the ``id`` builtin; it is kept so keyword callers
        keep working.)
    is_train : bool, default True
        Read from ``../../data/raw/train`` when true, otherwise from
        ``../../data/raw/test``.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file.

    Raises
    ------
    FileNotFoundError
        If no file exists for ``id`` in the selected folder.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    filedir = "train" if is_train else "test"
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere and breaks on non-ASCII cell text.
    with open(f"../../data/raw/{filedir}/{id}.json", encoding="utf-8") as f:
        return json.load(f)


def get_example_df(example_id, train, ancestors):
    """
    Build a DataFrame of one training notebook's cells in their correct order.

    Parameters
    ----------
    example_id : str
        Notebook identifier; used to load the JSON file and to look up the
        matching rows in ``train`` and ``ancestors``.
    train : pandas.DataFrame
        Must contain the columns ``id`` and ``cell_order``, where
        ``cell_order`` is a space-separated string of cell ids.
    ancestors : pandas.DataFrame
        Must contain the columns ``id``, ``ancestor_id`` and ``parent_id``.

    Returns
    -------
    pandas.DataFrame
        One row per cell with the columns ``id``, ``cell``, ``cell_type``,
        ``source``, ``order``, ``ancestor_id`` and ``parent_id``, sorted by
        ``order`` with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If ``example_id`` has no row in ``train`` or in ``ancestors``.
    """
    cell_order = train.query("id == @example_id")["cell_order"].values[0]
    # Position of every cell id inside the ordered list.
    position_of = {cell: pos for pos, cell in enumerate(cell_order.split(" "))}

    example_df = pd.DataFrame(load_example(example_id))
    example_df["id"] = example_id
    # The frame's index holds the cell ids; ids absent from cell_order map to NaN.
    example_df["order"] = example_df.index.map(position_of)

    # Look the lineage row up once and reuse it for both columns.
    lineage = ancestors.query("id == @example_id")
    example_df["ancestor_id"] = lineage["ancestor_id"].values[0]
    example_df["parent_id"] = lineage["parent_id"].values[0]

    # Move the cell ids out of the index into a "cell" column, then order rows.
    example_df = (
        example_df.reset_index()
        .rename(columns={"index": "cell"})
        .sort_values("order")
        .reset_index(drop=True)
    )
    col_order = [
        "id",
        "cell",
        "cell_type",
        "source",
        "order",
        "ancestor_id",
        "parent_id",
    ]
    return example_df[col_order]


def combine_train():
    """
    Merge every training notebook into ``../../data/preprocessed/train_all.csv``.

    Reads ``train_orders.csv`` and ``train_ancestors.csv`` from
    ``../../data/raw``, builds the ordered cell table of each notebook id with
    ``get_example_df`` and appends it to the output CSV. Notebooks are written
    one at a time, so the full dataset is never held in memory.

    Side effects: prints the number of files found in ``../../data/raw/train/``,
    creates the output directory if needed, and removes any existing
    ``train_all.csv`` before writing. If ``train_orders.csv`` contains no ids,
    no output file is produced.
    """
    train = pd.read_csv("../../data/raw/train_orders.csv")
    ancestors = pd.read_csv("../../data/raw/train_ancestors.csv")

    # Get the list of json files
    train_jsons = os.listdir("../../data/raw/train/")
    print(f"There are {len(train_jsons)} training json files")

    all_ids = train["id"].unique()

    out_path = "../../data/preprocessed/train_all.csv"
    # A fresh checkout may not have the output folder yet.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Drop any stale result so a re-run never appends to old data.
    if os.path.isfile(out_path):
        os.remove(out_path)

    # Only the first chunk creates the file and carries the header row;
    # every later chunk is appended without one.
    write_header = True
    for example_id in tqdm(all_ids, desc="Concat dataset", total=len(all_ids)):
        df = get_example_df(example_id, train, ancestors).reset_index(drop=True)
        df.to_csv(
            out_path,
            mode="w" if write_header else "a",
            index=False,
            header=write_header,
        )
        write_header = False


In [5]:
# Build ../../data/preprocessed/train_all.csv from the raw training notebooks.
# Slow: in the recorded run below the loop took about 1 h 24 min.
combine_train()


There are 139256 training json files


Concat dataset: 100%|██████████| 139255/139255 [1:24:29<00:00, 27.47it/s]
