# Imports

In [42]:
# files and paths
import json
from pathlib import Path
import zipfile
import os

# numbers and data
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm.notebook import tqdm

# pandas ops
from pandas.testing import assert_frame_equal

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# data_dir = Path('../input/AI4Code')

# The Data

We need to thoroughly explore the data and understand it.

## Dir Paths

Let's first explore the data directories and paths.

In [3]:
DATA_DIR = os.path.join(".", "datasets")
EXTRACTED = True

zip_path = os.path.join(DATA_DIR, "AI4Code.zip")
data_path = os.path.join(DATA_DIR, "AI4Code")
if not EXTRACTED:
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(data_path)

Now let's convert the AI4Code directory to a Path object.

In [4]:
data_root = Path(data_path)
data_root

PosixPath('datasets/AI4Code')

In [5]:
def print_iterdir_indented(root):
    """
    Takes a PosixPath object and prints all sub-dirs and 
    sub-files in a concise manner.
    """
    print(root.resolve(), end="\n\n")
    for curr_dir, sub_dirs, sub_files in os.walk(root):
        n_indents = len(Path(curr_dir).parts) - len(root.parts)
        print("    " * n_indents, Path(curr_dir).name + os.sep) # os.sep is "/"
        for i, f in enumerate(sorted(sub_files)):
            if i > 2:
                print("    " * (n_indents+1), "...")
                break
            print("    " * (n_indents+1), f)

In [6]:
print_iterdir_indented(data_root)

/Users/calvinhuang/ml/learning/datasets/AI4Code

 AI4Code/
     sample_submission.csv
     train_ancestors.csv
     train_orders.csv
     test/
         0009d135ece78d.json
         0010483c12ba9b.json
         0010a919d60e4f.json
         ...
     train/
         00001756c60be8.json
         00015c83e2717b.json
         0001bdd4021779.json
         ...


## To DataFrame

Read the json files and convert to dataframes.

In [7]:
MAX_TRAIN_LEN = 10000

train_root = data_root / "train"
train_paths = list(train_root.glob("*.json"))
len(train_paths)

139256

Let's explore the data using a subset for speed.

In [8]:
train_paths = train_paths[:MAX_TRAIN_LEN]

Create a function to print the json contents.

In [9]:
def read_json(path):
    with open(path, mode='r') as stream:
        parsed = json.load(stream)
        print(json.dumps(parsed, indent=4))
# read_json(train_paths[0])

Create a function to create a pandas df from the json.

In [10]:
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

print(train_paths[0])
read_notebook(train_paths[0])

datasets/AI4Code/train/8f1d0a3e812ccb.json


Unnamed: 0_level_0,cell_type,source,id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
02373f0f,code,import numpy as np \nimport pandas as pd\nimport os\nimport matplotlib.pyplot as plt\nimport seaborn as sns,8f1d0a3e812ccb
7eee6f0d,code,# training data\ntrain_dts = pd.read_csv('../input/titanic/train.csv')\ntrain_dts.head(),8f1d0a3e812ccb
e38ca82d,code,# test data\ntest_dts = pd.read_csv('../input/titanic/test.csv')\ntest_dts.head(),8f1d0a3e812ccb
9376e7b4,code,female = train_dts.loc[train_dts.Sex=='female']['Survived']\nprint('% of Female survived : {:.3f}'.format((sum(femal...,8f1d0a3e812ccb
dfdb7c2c,code,print('Shape of Training Set : {}'.format(train_dts.shape))\nprint('Number of training data points : {}\n'.format(le...,8f1d0a3e812ccb
...,...,...,...
88243368,markdown,after the final processing of features we are remaning with 30 features,8f1d0a3e812ccb
604f6956,markdown,Calculating Survival rate of Male and Female on training set,8f1d0a3e812ccb
0562b43b,markdown,creating titles from names of passengers,8f1d0a3e812ccb
4f0494f1,markdown,can be seen clearly that female has much larger probablity of surviving then male,8f1d0a3e812ccb


Now let's read all jsons to pandas dfs.

In [11]:
notebooks_train = [
    read_notebook(path) for path in tqdm(train_paths, desc='Read train_set')
]

Read train_set:   0%|          | 0/10000 [00:00<?, ?it/s]

In [24]:
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
0002115f48f982,18281c6c,code,"import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport ..."
0002115f48f982,e3b6b115,code,df = pd.read_csv('../input/metadata_train.csv')\ndf.info()
0002115f48f982,4a044c54,code,df.head()
0002115f48f982,365fe576,code,#let's check if targets are consistent within the same measurement id\ntargets = df.groupby('id_measurement')[['targ...
0002115f48f982,a3188e54,code,"sns.countplot(x='target',data=targets)\n# it should be only ""1"" and ""0"" but we have cases where target is not consit..."
...,...,...,...
ffe8d0aa5e7d68,b9f0782a,markdown,### Modelling
ffe8d0aa5e7d68,3492f280,markdown,"Now one of the important step in this task, we know that this is a regression task, previously we seen skewness in o..."
ffe8d0aa5e7d68,eea09e6e,markdown,"We will made our regression model, now we have come to end our task, in this section we will capture some parameter ..."
ffe8d0aa5e7d68,54ffd613,markdown,### Ridge


## Get Ranks of Cells

Our target is to correctly rank the code and markdown cells. We are given the code cells in correct order (rank 0 to rank n), but the markdown cells following the code cells are in random shuffled order.

The correct orders for the training data are in `train_orders.csv`

In [22]:
df_orders = pd.read_csv(data_root / "train_orders.csv",
                        index_col="id",
                       ).squeeze("columns").str.split()
df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310c80, 073e27e5, 015d52a4, ad7679ef, 7fde4f04, 07c52510, 0a1a7a39, 0bcd3...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279279, df6c939f, 2476da96, 00f87d0a, ae93e8e6, 58aadb1d, d20b0094, 986fd...
0002115f48f982                                 [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe576, a3188e54, b3f6e12d, ee7655ca, 84125b7a]
                                                                           ...                                                           
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba953ee, bf92a015, f4a0492a, 095812e6, 53125cfe, aa32a700, 63340e73, 06d8c...
fffc3b44869198    [978a5137, fa

Let's get an example notebook to apply unit tests to.

In [84]:
nb_id = df.index.unique("id")[6] # 7th notebook out of the 10K unique notebooks we got
print('Notebook:', nb_id)

print("The disordered notebook:")
nb = df.loc[nb_id]
with pd.option_context('display.max_rows', None):
    display(nb)

Notebook: 0030ea6c6281ce
The disordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
b8ff09de,code,# import all packages and set plots to be embedded inline\nimport numpy as np\nimport pandas as pd\nimport matplotli...
532dd206,code,"# load in the dataset into a pandas dataframe, print statistics\nData = pd.read_csv('../input/prosperloandata/prospe..."
6d1c9755,code,# high-level overview of data shape and composition\nprint(Data.shape)
1d0804d1,code,"features = ['LoanOriginalAmount', 'BorrowerAPR', 'BorrowerRate','StatedMonthlyIncome', 'Term', 'ProsperRating (Alpha..."
0aa598c5,code,selected_data.head()
ca7507bb,code,"sns.displot(\n data=selected_data.isna().melt(value_name=""missing""),\n y=""variable"",\n hue=""missing"",\n ..."
c0bc83de,code,# descriptive statistics for numeric variables\nselected_data.describe()
c74e572f,code,"# Convert ProsperRating to an ordered type\nrate_order = ['HR','E','D','C','B','A','AA']\nordered_var = pd.api.types..."
84f0f7b5,code,"\n# Convert Employment status to an ordered type\nemp_order = ['Employed','Self-employed','Full-time','Part-time','..."
f6aca3f0,code,# The `color_palette()` returns the the current / default palette as a list of RGB tuples. \n# Each tuple consists o...


Let's get the correct order of this particular notebook.

In [85]:
cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
with pd.option_context("display.max_rows", None):
    display(nb.loc[cell_order])

The ordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3dc29916,markdown,# Loan Data from Prosper\n\n<br>\n<center><b>By Abdelrhman Ragab Nady</b></center>\n
3afb27cc,markdown,"\n## Preliminary Wrangling\n\n>This data set contains 113,937 loans with 81 variables on each loan, including loan a..."
b8ff09de,code,# import all packages and set plots to be embedded inline\nimport numpy as np\nimport pandas as pd\nimport matplotli...
532dd206,code,"# load in the dataset into a pandas dataframe, print statistics\nData = pd.read_csv('../input/prosperloandata/prospe..."
6d1c9755,code,# high-level overview of data shape and composition\nprint(Data.shape)
1d0804d1,code,"features = ['LoanOriginalAmount', 'BorrowerAPR', 'BorrowerRate','StatedMonthlyIncome', 'Term', 'ProsperRating (Alpha..."
0aa598c5,code,selected_data.head()
42586e3f,markdown,"### What is the structure of your dataset?\n\n>The dataset has 113,937 loans with 81 variables on each loan. I will ..."
6fcdd914,markdown,### What features in the dataset do you think will help support your investigation into your feature(s) of interest?...
699baeea,markdown,**How Many Missing in the Data Set?**


Our goal is to have a target feature that represents this ordering - a rank value that, when ordered, is ordinal from 0 to n-1 (n is the number of cells).

In [86]:
def get_ranks(ordered_cells, unordered_cells):
    return [ordered_cells.index(cell_id) 
            for cell_id in unordered_cells]

cell_ranks = get_ranks(cell_order, nb.index)
nb.insert(0, "rank", cell_ranks)
with pd.option_context('display.max_rows', None):
    display(nb)

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b8ff09de,2,code,# import all packages and set plots to be embedded inline\nimport numpy as np\nimport pandas as pd\nimport matplotli...
532dd206,3,code,"# load in the dataset into a pandas dataframe, print statistics\nData = pd.read_csv('../input/prosperloandata/prospe..."
6d1c9755,4,code,# high-level overview of data shape and composition\nprint(Data.shape)
1d0804d1,5,code,"features = ['LoanOriginalAmount', 'BorrowerAPR', 'BorrowerRate','StatedMonthlyIncome', 'Term', 'ProsperRating (Alpha..."
0aa598c5,6,code,selected_data.head()
ca7507bb,10,code,"sns.displot(\n data=selected_data.isna().melt(value_name=""missing""),\n y=""variable"",\n hue=""missing"",\n ..."
c0bc83de,12,code,# descriptive statistics for numeric variables\nselected_data.describe()
c74e572f,14,code,"# Convert ProsperRating to an ordered type\nrate_order = ['HR','E','D','C','B','A','AA']\nordered_var = pd.api.types..."
84f0f7b5,15,code,"\n# Convert Employment status to an ordered type\nemp_order = ['Employed','Self-employed','Full-time','Part-time','..."
f6aca3f0,17,code,# The `color_palette()` returns the the current / default palette as a list of RGB tuples. \n# Each tuple consists o...


Sorting by the rank value is the same as ordering the notebook.

In [76]:
assert_frame_equal(nb.loc[cell_order], nb.sort_values("rank"))

Let's create our target df - the ranks for each notebook's cells.

In [115]:
# df with index id and cols cell_order, cell_id
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

# dict with id: cell_id, rank
ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {"cell_id": cell_id, "rank": get_ranks(cell_order, cell_id)}
    
# create a df for rank
df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient="index")
    .rename_axis("id")
    .apply(pd.Series.explode)
    .set_index("cell_id", append=True)
)
df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
0002115f48f982,18281c6c,1
0002115f48f982,e3b6b115,2
0002115f48f982,4a044c54,3
0002115f48f982,365fe576,4
0002115f48f982,a3188e54,5
...,...,...
ffe8d0aa5e7d68,b9f0782a,54
ffe8d0aa5e7d68,3492f280,45
ffe8d0aa5e7d68,eea09e6e,59
ffe8d0aa5e7d68,54ffd613,68


## Grouping by Ancestor

In [119]:
df_ancestors = pd.read_csv(data_root / "train_ancestors.csv", index_col="id")
df_ancestors

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df
0001bdd4021779,a7711fde,
0001daf4c2c76d,090152ca,
0002115f48f982,272b483a,
...,...,...
fffc30d5a0bc46,6aed207b,
fffc3b44869198,a6aaa8d7,
fffc63ff750064,0a1b5b65,
fffcd063cda949,d971e960,


In [126]:
df_ancestors.loc[:, "ancestor_id"].unique()

array(['945aea18', 'aa2da37e', 'a7711fde', ..., '0a1b5b65', 'd971e960',
       '3c40bfa6'], dtype=object)

To get a split without leakage (let's say notebooks in the valid set have a common ancestor with notebooks in train set), we need to split each notebook by their ancestors.

In [137]:
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=42)

# split by groups aka ancestors
ids = df.index.unique("id")
ancestors = df_ancestors.loc[ids, "ancestor_id"]
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors))
ids_train, ids_valid = ids[ids_train], ids[ids_valid]

df_train, df_valid = df.loc[ids_train], df.loc[ids_valid]

## Feature Engineering

We want to extract info from the source. Let's use a tf-idf vectorizer, which will evaluate the relative frequency of words in each cell to all cells.

In [160]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=0.01)
X_train = tfidf.fit_transform(df_train.loc[:, "source"].astype(str))
y_train = df_ranks.loc[ids_train].to_numpy()
groups = df_ranks.loc[ids_train].groupby("id").size().to_numpy()

In [161]:
X_train.shape, y_train.shape, groups.shape

((408191, 280), (408191, 1), (8998,))

We want to have the rank values of the code cells, which are ordered, and 0 for markdown cells, which are randomly shuffled.

In [165]:
X_train = sparse.hstack((
    X_train,
    np.where(
        df_train['cell_type'] == 'code',
        df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))
X_train.shape

(408191, 281)

# The Model

Use XGBRanker.

In [167]:
from xgboost import XGBRanker

model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
model.fit(X_train, y_train, group=groups)

XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=10, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
          random_state=0, reg_alpha=0, reg_lambda=1, ...)

# Evaluate

Now we need to evaluate the model's performance on our validation data, since the test set does not have target data available.

In [168]:
# Validation set
X_valid = tfidf.transform(df_valid['source'].astype(str))
# The metric uses cell ids
y_valid = df_orders.loc[ids_valid]

X_valid = sparse.hstack((
    X_valid,
    np.where(
        df_valid['cell_type'] == 'code',
        df_valid.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))

In [169]:
X_valid.shape, y_valid.shape

((45103, 281), (1002,))

What the model outputs is the ranks of the cells, we need to process so that our y_pred is a list of cell ids.

In [188]:
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)

# convert from id, cell_id: rank to id: list of cell_ids in correct order
y_pred = (
    y_pred
    .sort_values(["id", "rank"])
    .reset_index("cell_id")
    .groupby("id")["cell_id"].apply(list)
)
y_pred.shape

(1002,)

Examine a notebook.

In [190]:
nb_id = df_valid.index.unique("id")[8]

nb = df.loc[nb_id]
display(nb)
display(nb.loc[y_pred.loc[nb_id]])

Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
d28db502,code,from math import sqrt\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn.metric...
79dcd8fe,code,file = '../input/solarpanelspower/PV_Elec_Gas3.csv'\n#../input/solarpanelspower/PV_Elec_Gas3.csv\ndf = pd.read_csv(f...
0e4aa042,code,# make a column with the daily power (stationary)\nsolarpower['day_power']=0.0\nfor index in range(solarpower.index[...
787fce34,code,plt.plot(solarpower.day_power)\nplt.xticks(color='aqua')\nplt.yticks(color='aqua')\nplt.show()
c24871ae,code,'''simple exponential smoothing go back to last N values\n y_t = a * y_t + a * (1-a)^1 * y_t-1 + a * (1-a)^2 * y_t-2...
db97fa2c,code,# The optimum alpha is 0.2\n'''simple exponential smoothing go back to last N values\n y_t = a * y_t + a * (1-a)^1 *...
ac4fc8c1,code,"plt.figure(figsize=(15,7))\nplt.plot(solarpower.day_power[:730])\nplt.plot(solarpower.smooth_power[:730])\nplt.xtick..."
ac0f6c52,code,solarpower= solarpower.dropna()
dc28d34e,code,"#signal autocorrellation on day_power\nx = np.array(solarpower.day_power[:].values)\nacf = []\nfor i in range(1, len..."
2b30518b,code,#stats autocorrellation day_power \ny = np.array(solarpower.day_power[:].values)\n\nyunbiased = y-np.mean(y)\nynorm ...


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
d28db502,code,from math import sqrt\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn.metric...
79dcd8fe,code,file = '../input/solarpanelspower/PV_Elec_Gas3.csv'\n#../input/solarpanelspower/PV_Elec_Gas3.csv\ndf = pd.read_csv(f...
0e4aa042,code,# make a column with the daily power (stationary)\nsolarpower['day_power']=0.0\nfor index in range(solarpower.index[...
787fce34,code,plt.plot(solarpower.day_power)\nplt.xticks(color='aqua')\nplt.yticks(color='aqua')\nplt.show()
ac4fc8c1,code,"plt.figure(figsize=(15,7))\nplt.plot(solarpower.day_power[:730])\nplt.plot(solarpower.smooth_power[:730])\nplt.xtick..."
c24871ae,code,'''simple exponential smoothing go back to last N values\n y_t = a * y_t + a * (1-a)^1 * y_t-1 + a * (1-a)^2 * y_t-2...
db97fa2c,code,# The optimum alpha is 0.2\n'''simple exponential smoothing go back to last N values\n y_t = a * y_t + a * (1-a)^1 *...
ac0f6c52,code,solarpower= solarpower.dropna()
c8aef0d7,markdown,Exploring the data of solarpower
2b30518b,code,#stats autocorrellation day_power \ny = np.array(solarpower.day_power[:].values)\n\nyunbiased = y-np.mean(y)\nynorm ...


## Evaluation Metric

The competition uses the Kendall tau correlation.