In [1]:
import pandas as pd
import numpy as np
import random
import gc
import time
import shutil
import re
import os
from tqdm import tqdm
import glob
# from unidecode import unidecode



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR

In [None]:
# # from https://www.kaggle.com/code/ilyaryabov/fastttext-sorting-with-cosine-distance-algo
# import re
# from nltk.stem import WordNetLemmatizer

## Functions

In [3]:
class Parameter(object):
    """Mutable bag of settings (paths, sizes, device, optimiser options) for the notebook.

    Attributes are created in ``__init__`` and can be read with :meth:`get`,
    overwritten or extended with :meth:`set`, and dumped with ``str()``.
    """

    def __init__(self):
        cuda_available = torch.cuda.is_available()
        # Keyword order is the attribute insertion order, which __str__ relies on.
        self.set(
            # data
            result_dir='./user_data/',
            result_dir_1='./kaggle/',
            data_dir='../input/AI4Code/',
            model_dir='./models/',
            k_folds=5,
            random_seed=27,
            seq_length=512,  # 512; sum of "cell length" for a particular id.
            cell_max_length=128,  # max length of a cell.
            cell_count=128,
            device=torch.device('cuda' if cuda_available else 'cpu'),
            # model
            use_cuda=cuda_available,
            gpu=0,
            print_freq=500,
            weight_decay=0,
            optim='Adam',
        )

    def get(self, name):
        """Return the attribute called ``name``; raises AttributeError if it does not exist."""
        return getattr(self, name)

    def set(self, **kwargs):
        """Create or overwrite one attribute per keyword argument."""
        for key in kwargs:
            setattr(self, key, kwargs[key])

    def __str__(self):
        """Render every attribute as ``name:value``, one per line, in insertion order."""
        rendered = ('%s:%s' % (key, value) for key, value in vars(self).items())
        return '\n'.join(rendered)

In [4]:
# Module-level configuration object; read_json_data() and get_data() read it as a global.
parameter = Parameter()

In [5]:
class KFold(object):
    """
    Attach a fold id in ``[0, k_folds)`` to the rows of a DataFrame.

    Three strategies are offered: :meth:`group_split` (all rows sharing a value
    of ``group_col`` get the same fold), :meth:`random_split` (balanced random
    folds per row) and :meth:`stratified_split`.

    Parameters
    ----------
    random_seed : int
        Passed to ``np.random.seed``. This seeds NumPy's *global* generator, so
        constructing a KFold also affects any later code using ``np.random``.
    k_folds : int
        Number of folds.
    flag_name : str
        Name of the column that receives the fold id.
    """

    def __init__(self, random_seed, k_folds=10, flag_name='fold_flag'):
        self.k_folds = k_folds
        self.flag_name = flag_name
        np.random.seed(random_seed)

    def group_split(self, train_df, group_col):
        """Return a new frame where every distinct ``group_col`` value maps to one fold.

        The distinct values are sorted before the shuffled fold ids are attached,
        so the assignment is reproducible for a given seed. The input frame is
        not modified; the fold column is added through a left merge.
        """
        group_value = sorted(set(train_df[group_col]))
        # Balanced ids 0, 1, ..., k-1, 0, 1, ... shuffled in place.
        fold_flag = [i % self.k_folds for i in range(len(group_value))]
        np.random.shuffle(fold_flag)

        fold_map = pd.DataFrame({group_col: group_value, self.flag_name: fold_flag})
        return train_df.merge(fold_map, how='left', on=group_col)

    def random_split(self, train_df):
        """Assign balanced random folds row by row.

        Writes the fold column into ``train_df`` in place and returns the same object.
        """
        fold_flag = [i % self.k_folds for i in range(len(train_df))]
        np.random.shuffle(fold_flag)
        train_df[self.flag_name] = fold_flag
        return train_df

    def stratified_split(self, train_df, group_col):
        """Assign folds derived from each row's rank inside its ``group_col`` group.

        Writes the fold column into ``train_df`` in place and returns the same object.
        """
        train_df[self.flag_name] = 1
        rank_in_group = train_df.groupby(by=[group_col])[self.flag_name].rank(
            ascending=True, method='first').astype(int)
        # Assign the shuffled ranks positionally. Assigning a re-indexed Series
        # instead would align on index labels and produce NaN for every row of a
        # frame whose index is not the default 0..n-1 range.
        # NOTE(review): shuffling across the whole frame detaches each rank from
        # the group it was computed in, so folds are not balanced per group —
        # confirm whether a per-group shuffle was intended.
        shuffled = rank_in_group.sample(frac=1.0).to_numpy()
        train_df[self.flag_name] = shuffled % self.k_folds
        return train_df


In [6]:
def read_json_data(mode='train'):
    """Load every ``<data_dir>/<mode>/*.json`` file into one long DataFrame.

    Each file is read with ``pd.read_json``; its index becomes the ``cell_id``
    column and the file name up to the first ``.`` becomes the ``id`` column.

    Parameters
    ----------
    mode : str
        Sub-directory of ``parameter.data_dir`` to read from.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', 'cell_id', 'cell_type', 'source']``, files in sorted
        path order.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    pattern = parameter.data_dir + '{}/*.json'.format(mode)
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise ValueError('No JSON files found matching %r' % pattern)

    frames = [
        pd.read_json(path, dtype={'cell_type': 'category', 'source': 'str'})
        # basename() honours the OS path separator; glob joins the directory and
        # file name with it, so splitting on '/' alone mis-parses Windows paths.
        .assign(id=os.path.basename(path).split('.')[0])
        .rename_axis('cell_id')
        for path in tqdm(paths)
    ]
    # reset_index(drop=False) turns the named index into a regular 'cell_id' column.
    res = pd.concat(frames).reset_index(drop=False)
    return res[['id', 'cell_id', 'cell_type', 'source']]

In [7]:
def preprocess_text(document):
    """Normalise one piece of text to lowercase word characters separated by single spaces.

    The input is converted with ``str()``. Every non-word character and every
    underscore becomes a space, single ASCII letters that stand between
    whitespace are dropped, whitespace runs are collapsed, and the result is
    lowercased. Leading/trailing spaces are not stripped.
    """
    text = re.sub(r'\W', ' ', str(document)).replace('_', ' ')

    # A lone letter surrounded by whitespace is replaced by one space. Matches
    # cannot overlap, so in a run of lone letters every other one survives.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Collapse whitespace runs (the IGNORECASE flag has no effect on \s).
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    return text.lower()

In [8]:
def preprocess_df(df):
    """Add per-notebook counts, normalise ``rank`` and clean the ``source`` text.

    Expects the columns ``id``, ``cell_id``, ``cell_type``, ``rank`` and
    ``source``. The frame is modified in place and also returned.

    Columns written:
      * ``cell_count``     - number of ``cell_id`` values per ``id``
      * ``cell_type``      - 0 for ``'code'``, 1 for ``'markdown'``, 0 for anything else
      * ``markdown_count`` - sum of ``cell_type`` per ``id``
      * ``code_count``     - ``cell_count - markdown_count``
      * ``rank``           - original ``rank`` divided by ``cell_count``
      * ``source``         - lowercased, stripped, passed through ``preprocess_text``
    """
    # Number of cells in each notebook (id).
    df['cell_count'] = df.groupby(by=['id'])['cell_id'].transform('count')

    # Cast to object before mapping so the result is a plain numeric column;
    # a categorical column can reject fillna(0) when 0 is not one of its categories.
    df['cell_type'] = (
        df['cell_type'].astype(object).map({'code': 0, 'markdown': 1}).fillna(0).astype(int)
    )

    # Markdown cells are encoded as 1, so the per-notebook sum is the markdown count.
    df['markdown_count'] = df.groupby(by=['id'])['cell_type'].transform('sum')
    df['code_count'] = df['cell_count'] - df['markdown_count']

    # Normalise rank by the notebook's cell count.
    df['rank'] = df['rank'] / df['cell_count']

    df['source'] = df['source'].apply(lambda x: preprocess_text(x.lower().strip()))

    # regex=False makes these literal removals on every pandas version; before
    # pandas 2.0 the default treated "[SEP]" as a regex character class.
    # NOTE(review): preprocess_text has already lowercased the text and replaced
    # brackets with spaces, so these literals can no longer occur at this point —
    # confirm whether the tokens were meant to be removed before that step.
    df['source'] = df['source'].str.replace("[SEP]", "", regex=False)
    df['source'] = df['source'].str.replace("[CLS]", "", regex=False)

    # Collapse any run of spaces into a single space.
    df['source'] = df['source'].apply(lambda x: re.sub(' +', ' ', str(x)))
    return df

In [9]:
# def get_truncated_df(df, cell_count=128, id_col='id2', group_col='id', max_random_cnt=500, expand_ratio=10):
#     tmp1 = df[df['cell_count'] <= cell_count].reset_index(drop=True)
#     tmp1.loc[:, id_col] = 0
#     tmp2 = df[df['cell_count'] > cell_count].reset_index(drop=True)
#     # print(tmp1.shape,tmp2.shape)
#     res = [tmp1]
#     for _, df_g in tmp2.groupby(by=group_col):
#         # print(df_g.columns)
#         df_g = df_g.sample(frac=1.0).reset_index(drop=True)
#         # index_list = range(len(df_g))
#         step = min(cell_count // 2, len(df_g) - cell_count)
#         step = max(step, 1)
#         id_col_count = 0
#         for i in range(0, len(df_g), step):
#             # indexes = [i] + list(np.random.choice([j for j in index_list if j!=i],cell_count-1, replace=False))
#             # indexes = range(i,i+cell_count)
#             # print(indexes,i,len(df_g),index_list)
#             res_tmp = df_g.iloc[i:i + cell_count]  # .copy()
#             # if len(res_tmp) == cell_count:
#             res_tmp.loc[:, id_col] = id_col_count
#             id_col_count += 1
#             res.append(res_tmp)
#
#         random_cnt = int(len(df_g) // cell_count * expand_ratio)
#         random_cnt = min(random_cnt, max_random_cnt)  # todo
#         if random_cnt > 0:
#             for i in range(random_cnt):
#                 res_tmp = df_g.sample(n=cell_count).reset_index(drop=True)
#                 res_tmp.loc[:, id_col] = id_col_count
#                 id_col_count += 1
#                 res.append(res_tmp)
#
#     res = pd.concat(res).reset_index(drop=True)
#     sort_flag = range(len(res))
#     np.random.shuffle(sort_flag)
#     res.loc[res['cell_type'] == 0, 'sort_flag'] = 0
#     res = res.sort_values(by=['id', id_col, 'cell_type', 'rank'], ascending=True)
#     res = res.groupby(by=['id', id_col, 'fold_flag', 'cell_count'], as_index=False, sort=False)[
#         ['cell_id', 'cell_type', 'source', 'rank']].agg(list)
#     return res


# Split every notebook (id) whose cell_count exceeds `cell_count` into windows of exactly `cell_count` cells; notebooks at or below the limit are kept whole.
def get_truncated_df(df, cell_count=128, id_col='id2', group_col='id', max_random_cnt=100, expand_ratio=5):
    """Turn a per-cell frame into per-sample rows holding at most ``cell_count`` cells each.

    Notebooks whose ``cell_count`` column is ``<= cell_count`` become a single
    sample with ``id_col == 1``. Larger notebooks are shuffled and cut into
    overlapping windows of ``cell_count`` rows (a short final window is replaced
    by the last ``cell_count`` rows); sufficiently large notebooks additionally
    get random samples of ``cell_count`` rows. Samples of one notebook are
    numbered 1, 2, ... in ``id_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``id``, ``cell_id``, ``cell_type``, ``source``, ``rank``,
        ``rank2``, ``fold_flag``, ``cell_count``, ``markdown_count``,
        ``code_count`` (and ``group_col``).
    cell_count : int
        Maximum number of cells per sample.
    id_col : str
        Name of the sample-number column that is created.
    group_col : str
        Column identifying a notebook when windowing.
    max_random_cnt : int
        Upper bound on extra random samples per notebook.
    expand_ratio : int
        Extra random samples per full multiple of ``cell_count``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(id, id_col)``; ``cell_id``, ``cell_type``, ``source``,
        ``rank`` and ``rank2`` are lists ordered by ``cell_type`` then ``rank2``.

    Notes
    -----
    Shuffling and sampling draw from NumPy's global random state.
    """
    small = df[df['cell_count'] <= cell_count].reset_index(drop=True)
    small.loc[:, id_col] = 1
    large = df[df['cell_count'] > cell_count].reset_index(drop=True)

    pieces = [small]
    for _, df_g in large.groupby(by=group_col):
        df_g = df_g.sample(frac=1.0).reset_index(drop=True)
        n_cells = len(df_g)
        # Windows overlap by half, or by more when the notebook is only slightly
        # over the limit; the step is never below 1.
        step = max(min(cell_count // 2, n_cells - cell_count), 1)

        id_col_count = 1
        for start in range(0, n_cells, step):
            window = df_g.iloc[start:start + cell_count]
            if len(window) != cell_count:
                # Short final window: take the last `cell_count` rows instead.
                window = df_g.iloc[-cell_count:]
            # Copy before tagging: writing into an iloc slice raises
            # SettingWithCopyWarning and may touch the parent frame.
            window = window.copy()
            window[id_col] = id_col_count
            id_col_count += 1
            pieces.append(window)
            if start + cell_count >= n_cells:
                break

        # NOTE(review): with floor division this condition is equivalent to
        # `n_cells // cell_count >= 2`. If a true ratio of 1.3 was intended it
        # should use `/` — confirm before changing, as it alters how many
        # samples are generated.
        if n_cells // cell_count > 1.3:
            random_cnt = min(int(n_cells // cell_count * expand_ratio), max_random_cnt)  # todo

            for _ in range(random_cnt):
                sampled = df_g.sample(n=cell_count).reset_index(drop=True)
                sampled[id_col] = id_col_count
                id_col_count += 1
                pieces.append(sampled)

    res = pd.concat(pieces).reset_index(drop=True)
    res = res.sort_values(by=['id', id_col, 'cell_type', 'rank2'], ascending=True)
    # sort=False keeps the order established by sort_values above.
    grouped = res.groupby(
        by=['id', id_col, 'fold_flag', 'cell_count', 'markdown_count', 'code_count'],
        as_index=False, sort=False)
    return grouped[['cell_id', 'cell_type', 'source', 'rank', 'rank2']].agg(list)

## Pre-processing

In [10]:
def get_data(seed=parameter.random_seed, mode=0):
    """Build the training samples and the per-code-cell reference table.

    Steps: load the merged cell table (from the ``./train_df.pkl`` cache when it
    exists, otherwise from the raw JSON/CSV files under ``parameter.data_dir``,
    writing the cache afterwards), assign folds grouped by ``ancestor_id``, run
    ``preprocess_df``, move shuffled markdown cells after the code cells,
    compute ``rank2`` and finally window the notebooks with ``get_truncated_df``.

    Parameters
    ----------
    seed : int
        Seed handed to ``KFold``. The default is read from ``parameter`` once,
        when this function is defined.
    mode : int
        Currently unused.

    Returns
    -------
    (train_df, code_df_valid)
        ``train_df`` is the output of ``get_truncated_df``; ``code_df_valid``
        holds ``id``, ``cell_id`` and ``rank2`` for every code cell.
    """
    cache_path = './train_df.pkl'
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code; only load a cache file
        # that this notebook produced itself.
        train_df = pd.read_pickle(cache_path)
    else:
        train_df = read_json_data(mode='train')
        # Read the CSVs from the same configurable directory as the JSON files.
        train_orders = pd.read_csv(parameter.data_dir + 'train_orders.csv')
        train_ancestors = pd.read_csv(parameter.data_dir + 'train_ancestors.csv')

        # One row per (id, cell_id), in the order given by the space-separated cell_order.
        train_orders['cell_id'] = train_orders['cell_order'].str.split()
        train_orders = (
            train_orders.drop(columns=['cell_order'])
            .explode(column='cell_id')
            .reset_index(drop=True)
        )
        # 1-based position of each cell inside its notebook.
        train_orders['rank'] = train_orders.groupby(by=['id']).cumcount() + 1

        train_df = train_df.merge(train_orders, on=['id', 'cell_id'], how='left')
        train_df = train_df.merge(train_ancestors[['id', 'ancestor_id']], on=['id'], how='left')
        train_df.to_pickle(cache_path)

    # Notebooks sharing an ancestor_id always land in the same fold.
    train_df = KFold(seed, parameter.k_folds).group_split(train_df, group_col='ancestor_id')

    train_df = preprocess_df(train_df)

    # Code cells (cell_type == 0) keep their order; markdown cells (cell_type == 1)
    # are shuffled and placed after them.
    train_df = pd.concat(
        [train_df[train_df['cell_type'] == 0], train_df[train_df['cell_type'] == 1].sample(frac=1.0)]
    ).reset_index(drop=True)

    # rank2: relative position of a cell among the cells of the same type in its
    # notebook; markdown cells are then overwritten with -1.
    by_type = train_df.groupby(by=['id', 'cell_type'])
    train_df['rank2'] = (by_type.cumcount() + 1) / by_type['cell_id'].transform('count')
    train_df.loc[train_df['cell_type'] == 1, 'rank2'] = -1
    code_df_valid = train_df[train_df['cell_type'] == 0][['id', 'cell_id', 'rank2']].copy()

    train_df = get_truncated_df(train_df, cell_count=parameter.cell_count)
    return train_df, code_df_valid


In [None]:
# train_df, code_df_valid = get_data()
# train_df.to_parquet('train_df', compression = "gzip", index=False)
# code_df_valid.to_parquet('code_df_valid', compression = "gzip", index=False)

In [11]:
# import pandas as pd
# df = pd.read_parquet("train_df")