# **Google AI4Code – Understand Code in Python Notebooks**
##### Predict the relationship between code and comments

### **Setup**

In [1]:
import json
from pathlib import Path
import numpy as np 
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path("/home/linux/Workspace/AI4Code/")

### **Load Data**

In [2]:
NUM_TRAIN = 10000

def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

df

Train NBs: 100%|██████████| 10000/10000 [00:38<00:00, 261.69it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
001308991e0c5e,6c01d0d2,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
001308991e0c5e,b8fd3a8c,code,import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n%matplotlib inline
001308991e0c5e,94d3d43a,code,df.dropna()\ndf.head()
001308991e0c5e,9ecece20,code,df.shape
001308991e0c5e,808d31ab,code,df.columns
...,...,...,...
fffc3b44869198,40e930ff,code,test['bookID']
fffc3b44869198,b1873cbb,code,"df = pd.DataFrame(np.nan, index=[0,1,2,3], columns=['A'])\ndf['bookID'] = test['bookID']\ndf['average_rating'] = pd...."
fffc3b44869198,76e0f2a7,code,df
fffc3b44869198,233d93b9,code,"df.to_csv('file_name.csv', index=False)"


In [3]:
nb_id = df.index.unique('id')[6]
print('Notebook: ', nb_id)

print("The disordered notebook: ")
nb = df.loc[nb_id, :]
display(nb)
print()

Notebook:  002b5d330ee1ec
The disordered notebook: 


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
cb79667a,code,import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns
06a365fd,code,train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
0b58ebf1,code,train_df.head()
72603136,code,train_df.shape
21184871,code,"train_df = train_df.drop(['Alley','PoolQC','Fence','MiscFeature'], axis=1)"
...,...,...
a14531e8,markdown,# Categorical Columns
6adf0356,markdown,# Numeric columns
6146526c,markdown,# Concatenate Test and Training data
2d7a8c99,markdown,# Missing data





### **Ordering the Cells**

In [4]:
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()

df_orders



  df_orders = pd.read_csv(


id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310c80, 073e27e5, 015d52a4, ad7679ef, 7fde4f04, 07c52510, 0a1a7a39, 0bcd3...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279279, df6c939f, 2476da96, 00f87d0a, ae93e8e6, 58aadb1d, d20b0094, 986fd...
0002115f48f982                                 [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe576, a3188e54, b3f6e12d, ee7655ca, 84125b7a]
                                                                           ...                                                           
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba953ee, bf92a015, f4a0492a, 095812e6, 53125cfe, aa32a700, 63340e73, 06d8c...
fffc3b44869198    [978a5137, fa

In [5]:
cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
nb.loc[cell_order, :]

The ordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
cb79667a,code,import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns
8770a100,markdown,# Training Data
06a365fd,code,train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
0b58ebf1,code,train_df.head()
72603136,code,train_df.shape
...,...,...
c7db5522,code,len(y_test)
b89986d3,code,sample_submission_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
bc46c719,code,"sample_submission_df['SalePrice'] = y_test\nsample_submission_df.to_csv('submission.csv', index=False)"
95141f76,code,submission_df = pd.read_csv('submission.csv')


In [6]:
def get_ranks(base, derived):
    return [base.index(d) for d in derived]


cell_ranks = get_ranks(cell_order, list(nb.index))
nb.insert(0, 'rank', cell_ranks)

nb

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cb79667a,0,code,import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns
06a365fd,2,code,train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
0b58ebf1,3,code,train_df.head()
72603136,4,code,train_df.shape
21184871,6,code,"train_df = train_df.drop(['Alley','PoolQC','Fence','MiscFeature'], axis=1)"
...,...,...,...
a14531e8,91,markdown,# Categorical Columns
6adf0356,87,markdown,# Numeric columns
6146526c,82,markdown,# Concatenate Test and Training data
2d7a8c99,5,markdown,# Missing data


In [7]:
from pandas.testing import assert_frame_equal

assert_frame_equal(nb.loc[cell_order, :], nb.sort_values('rank'))

In [8]:
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
001308991e0c5e,6c01d0d2,1
001308991e0c5e,b8fd3a8c,3
001308991e0c5e,94d3d43a,5
001308991e0c5e,9ecece20,6
001308991e0c5e,808d31ab,7
...,...,...
fffc3b44869198,40e930ff,20
fffc3b44869198,b1873cbb,21
fffc3b44869198,76e0f2a7,22
fffc3b44869198,233d93b9,23


### **Splits**

In [9]:
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df
0001bdd4021779,a7711fde,
0001daf4c2c76d,090152ca,
0002115f48f982,272b483a,
...,...,...
fffc30d5a0bc46,6aed207b,
fffc3b44869198,a6aaa8d7,
fffc63ff750064,0a1b5b65,
fffcd063cda949,d971e960,


In [10]:
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors))
ids_train, ids_valid = ids[ids_train], ids[ids_valid]

df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

### **Feature Engineering**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=0.01)
X_train = tfidf.fit_transform(df_train['source'].astype(str))
y_train = df_ranks.loc[ids_train].to_numpy()
groups = df_ranks.loc[ids_train].groupby('id').size().to_numpy()

In [12]:
X_train = sparse.hstack((
    X_train,
    np.where(
        df_train['cell_type'] == 'code',
        df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))
print(X_train.shape)

(404851, 291)


### **Train**

In [13]:
from xgboost import XGBRanker

model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
model.fit(X_train, y_train, group=groups)

### **Evaluate**

#### **Validation Set**

In [14]:
X_valid = tfidf.transform(df_valid['source'].astype(str))
y_valid = df_orders.loc[ids_valid]

X_valid = sparse.hstack((
    X_valid,
    np.where(
        df_valid['cell_type'] == "code",
        df_valid.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))

In [15]:
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)
y_pred = (
    y_pred
    .sort_values(['id', 'rank'])
    .reset_index('cell_id')
    .groupby('id')['cell_id'].apply(list)
)
y_pred.head(10)

id
003926b12d6663    [a3c26e4b, d1e25146, 6c0aae3b, f09cddba, 27ed80be, 2cb34a99, c2b58f51, 394ba540, 73d2d86e, a25f090c, 7bac4bbd, 26d03...
0045f3301bb72e    [707b1c7f, 24f94f11, 6c197fa6, 80d7eb2a, 4313301c, 0c248991, dd99a582, 3df75aa4, 4240869b, f7164a1f, f8d9b2cf, 93bc6...
0055ad513fed6f    [229edbd1, 3063147e, 6065659c, d60d3a5a, d49fc3d9, 9f53b404, 9a3846c5, 5fcd0218, 5f6df0b6, 38a769c5, ca851498, 35ec0...
00fc8fe3f3daee    [c6778546, 62e8d610, 1fde2bfb, eb2c7a88, 1c0ee524, ca3a8c7b, 012db1fd, 655aa141, 801e80cf, 0b20993b, 667c2ffd, ee68e...
0147ab49cc2a8f    [3b577bcd, de194e03, a7547a1d, c8cb36b1, a490b4a5, 323269ca, 2eec255b, 9092f5a3, e033fd08, deedd97c, dc49a6b3, 3fe1d...
014ff654f9260c    [84ae0de6, 6e3b274f, 5d637275, 343d4b0a, 49fa1013, ea061a6d, 9898e036, d2829aec, c6689c36, 22af7e39, 95f736ba, 3451c...
017ffea4363ced    [43ead210, 3c78d54c, 12fc37f8, 24cab715, 00cb87a3, c1b28550, 8ad0c111, 4d0f65a6, 41c6d154, e2cf3918, 7d63cef5, ebd5b...
0192e704f7465e    [cf9fe075, 29

In [16]:
nb_id = df_valid.index.get_level_values('id').unique()[8]
display(df.loc[nb_id])
display(df.loc[nb_id].loc[y_pred.loc[nb_id]])

Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
91e413d5,code,# load modules\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n%matplotlib inline\nimport...
ce525c9c,code,"# display column limita\npd.set_option('display.max_columns',500)"
a955f27d,code,# load data\ntrain = pd.read_csv('../input/xente-challenge/training.csv')\nvalidation = pd.read_csv('../input/xente-...
f192fd9e,code,# checking the balance of the data\nprint('The number of Non-Frauds are: ' + str(train['FraudResult'].value_counts()...
0f667449,code,"# visualize category class\nsns.countplot(x='FraudResult', data=train)"
1c1f9689,code,"# SMOTE\n# oversampling\nfrom imblearn.over_sampling import SMOTE\n\ncount_class_0, count_class_1 = train.FraudResul..."
c694a158,code,"train_class_1_over = train_class_1.sample(count_class_0, replace=True)\ntrain_test_over = pd.concat([train_class_0, ..."
0bf73791,code,train1 = train_test_over
8ba81ec7,code,numeric_features = train.select_dtypes(include=[np.number])\nnumeric_features.columns
0628e51f,code,categorical_features = train.select_dtypes(include=[np.object])\ncategorical_features.columns


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
91e413d5,code,# load modules\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n%matplotlib inline\nimport...
ce525c9c,code,"# display column limita\npd.set_option('display.max_columns',500)"
a955f27d,code,# load data\ntrain = pd.read_csv('../input/xente-challenge/training.csv')\nvalidation = pd.read_csv('../input/xente-...
f192fd9e,code,# checking the balance of the data\nprint('The number of Non-Frauds are: ' + str(train['FraudResult'].value_counts()...
0f667449,code,"# visualize category class\nsns.countplot(x='FraudResult', data=train)"
aeed4596,markdown,EXPLORATORY DATA ANALYSIS
1c1f9689,code,"# SMOTE\n# oversampling\nfrom imblearn.over_sampling import SMOTE\n\ncount_class_0, count_class_1 = train.FraudResul..."
c694a158,code,"train_class_1_over = train_class_1.sample(count_class_0, replace=True)\ntrain_test_over = pd.concat([train_class_0, ..."
0bf73791,code,train1 = train_test_over
8ba81ec7,code,numeric_features = train.select_dtypes(include=[np.number])\nnumeric_features.columns


#### **Metric**

In [17]:
from bisect import bisect

def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

In [18]:
y_dummy = df_valid.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
kendall_tau(y_valid, y_dummy)

0.415278693883941

In [19]:
kendall_tau(y_valid, y_pred)

0.6015841825781357

### **Submission**

In [20]:
paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
df_test = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

Test NBs: 100%|██████████| 4/4 [00:00<00:00, 219.71it/s]


In [21]:
X_test = tfidf.transform(df_test['source'].astype(str))
X_test = sparse.hstack((
    X_test,
    np.where(
        df_test['cell_type'] == 'code',
        df_test.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))

In [22]:
y_infer = pd.DataFrame({'rank': model.predict(X_test)}, index=df_test.index)
y_infer = y_infer.sort_values(['id', 'rank']).reset_index(
    'cell_id').groupby('id')['cell_id'].apply(list)
y_infer

id
0009d135ece78d    [ddfd239c, c6cd22db, 1372ae9b, 7f388a41, 90ed07ab, 8cb8d28a, 2843a25a, f9893819, 06dbf8cf, 0a226b6a, ba55e576, 39e93...
0010483c12ba9b                       [54c7cab3, fe66203e, 7f270e34, 5ce8863c, 7844d5f8, 4a32c095, 4a0777c4, 02a0be6d, 865ad516, 4703bb6d]
0010a919d60e4f    [aafc3d23, b7578789, 80e077ec, b190ebb4, ed415c3c, 322850af, c069ed33, 868c4eae, 80433cf3, 5e8c5e7e, d2f722a5, 8ce62...
0028856e09c5b7                                                                                   [012c9d02, d22526d1, 3ae7ece3, eb293dfc]
Name: cell_id, dtype: object

In [23]:
y_sample = pd.read_csv(data_dir / 'sample_submission.csv',
                       index_col='id', squeeze=True)
y_sample



  y_sample = pd.read_csv(data_dir / 'sample_submission.csv',


id
0009d135ece78d       ddfd239c c6cd22db 1372ae9b 90ed07ab 7f388a41 2843a25a 06dbf8cf f9893819 ba55e576 39e937ec e25aa9bd 0a226b6a 8cb8d28a
0010483c12ba9b                                  54c7cab3 fe66203e 7844d5f8 5ce8863c 4a0777c4 4703bb6d 4a32c095 865ad516 02a0be6d 7f270e34
0010a919d60e4f    aafc3d23 80e077ec b190ebb4 ed415c3c 322850af c069ed33 868c4eae 80433cf3 bd8fbd76 0e2529e8 1345b8b2 cdae286f 4907b9ef...
0028856e09c5b7                                                                                        012c9d02 d22526d1 3ae7ece3 eb293dfc
Name: cell_order, dtype: object

In [24]:
y_submit = (
    y_infer
    .apply(' '.join)  # list of ids -> string of ids
    .rename_axis('id')
    .rename('cell_order')
)
y_submit

id
0009d135ece78d       ddfd239c c6cd22db 1372ae9b 7f388a41 90ed07ab 8cb8d28a 2843a25a f9893819 06dbf8cf 0a226b6a ba55e576 39e937ec e25aa9bd
0010483c12ba9b                                  54c7cab3 fe66203e 7f270e34 5ce8863c 7844d5f8 4a32c095 4a0777c4 02a0be6d 865ad516 4703bb6d
0010a919d60e4f    aafc3d23 b7578789 80e077ec b190ebb4 ed415c3c 322850af c069ed33 868c4eae 80433cf3 5e8c5e7e d2f722a5 8ce62db4 4ae17669...
0028856e09c5b7                                                                                        012c9d02 d22526d1 3ae7ece3 eb293dfc
Name: cell_order, dtype: object

In [25]:
y_submit.to_csv('submission.csv')