In [1]:
import os
import functools

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [51]:
class Config:
    """Notebook-wide settings for fold assignment and optional subsampling."""

    # Number of cross-validation folds to assign.
    NUM_FOLDS = 5
    # Seed passed to every sample/split call so reruns give the same folds.
    RANDOM_SEED = 42
    # When True, the training frame is subsampled before folds are assigned.
    TRAIN_ON_SUBSET = False
    # Fraction of training rows kept when TRAIN_ON_SUBSET is True; read by the
    # subset-selection cell. NOTE(review): 0.05 is a placeholder — tune as needed.
    SUBSET_ROWS_FRAC = 0.05

In [11]:
# Read the train, test and titles CSV files from ./data in one pass.
df_train, df_test, df_titles = map(
    pd.read_csv,
    ("./data/train.csv", "./data/test.csv", "./data/titles.csv"),
)
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [12]:
# Show only the id and anchor columns of the training frame.
df_train[["id", "anchor"]]

Unnamed: 0,id,anchor
0,37d61fd2272659b1,abatement
1,7b9652b17b68b7a4,abatement
2,36d72442aefd8232,abatement
3,5296b0c19e1ce60e,abatement
4,54c1e3b9184cb5b6,abatement
...,...,...
36468,8e1386cbefd7f245,wood article
36469,42d9e032d1cd3242,wood article
36470,208654ccb9e14fa3,wood article
36471,756ec035e694722b,wood article


In [20]:
# Tiny hand-written frames used to try out the rename-and-merge logic below.
# Only df_bfp carries a "score" column; the others hold id + val_preds.
df_bfp = pd.DataFrame(
    {
        "id": ["a", "b", "c"],
        "val_preds": [1.0, 2.0, 3.0],
        "score": [1.15, 2.2, 3.15],
    }
)
df_dbv3l = pd.DataFrame(
    {
        "id": ["a", "b", "c"],
        "val_preds": [1.1, 2.1, 3.1],
    }
)
df_dbl = pd.DataFrame(
    {
        "id": ["a", "b", "c"],
        "val_preds": [0.9, 1.9, 2.9],
    }
)
df_dbxl = pd.DataFrame(
    {
        "id": ["a", "b", "c"],
        "val_preds": [1.2, 2.2, 3.2],
    }
)

In [21]:
def rename_val_preds(df, cols_to_use, suffix):
    """Select columns from ``df`` and tag its ``val_preds`` column with ``suffix``.

    Args:
        df: DataFrame to select from.
        cols_to_use: Column labels to keep, in the desired order.
        suffix: String appended to the ``val_preds`` column name.

    Returns:
        A new DataFrame holding only ``cols_to_use``, with ``val_preds``
        renamed to ``"val_preds" + suffix``. ``df`` itself is not modified.
    """
    # Chain select -> rename so a fresh frame is returned; an in-place rename
    # on the column slice triggers pandas' SettingWithCopyWarning.
    return df[cols_to_use].rename(columns={"val_preds": "val_preds" + suffix})

In [25]:
# Combine the per-model validation predictions into one frame keyed on "id".
# Each entry pairs a prediction frame with the suffix used for its val_preds column.
df_list = [(df_bfp, "_bfp"), (df_dbv3l, "_dbv3l"), (df_dbl, "_dbl")]
df_list_renamed = []
for idx, (df, suffix) in enumerate(df_list):
    cols = ["id", "val_preds"]
    if idx == 0:
        # Label/fold columns are taken once, from the first frame only, and
        # only if that frame actually has them — selecting a missing "kfold"
        # column would raise KeyError.
        cols.extend(col for col in ("score", "kfold") if col in df.columns)
    df_list_renamed.append(rename_val_preds(df, cols, suffix))
# Inner-join all renamed frames on "id", left to right.
df_all = functools.reduce(
    lambda x, y: pd.merge(left=x, right=y, on=["id"], how="inner"),
    df_list_renamed,
)

In [27]:
# Inspect the merged per-model predictions.
df_all

Unnamed: 0,id,val_preds_bfp,score,val_preds_dbv3l,val_preds_dbl
0,a,1.0,1.15,1.1,0.9
1,b,2.0,2.2,2.1,1.9
2,c,3.0,3.15,3.1,2.9


In [49]:
from sklearn.preprocessing import LabelEncoder

# Give every distinct anchor phrase an integer code (stored as anchor_map).
anchor_encoder = LabelEncoder().fit(df_train["anchor"])
df_train["anchor_map"] = anchor_encoder.transform(df_train["anchor"])
# score is stored as a float but only takes five distinct values, so it is
# mapped to integer class labels before being used as a stratification target.
df_train["score_map"] = df_train["score"].map(
    {
        0.00: 0,
        0.25: 1,
        0.50: 2,
        0.75: 3,
        1.00: 4,
    }
)

In [50]:
# Inspect the training frame, now carrying the anchor_map and score_map columns.
df_train

Unnamed: 0,id,anchor,target,context,score,anchor_map,score_map
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,0,2
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,0,3
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,0,1
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,0,2
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,0,0
...,...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00,732,4
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50,732,2
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50,732,2
36471,756ec035e694722b,wood article,wooden material,B44,0.75,732,3


In [52]:
from sklearn import model_selection

def strat_group_kfold_dataframe(df, target_col_name, group_col_name, num_folds=Config.NUM_FOLDS):
    """Return a shuffled copy of ``df`` with a ``kfold`` column of fold ids.

    Rows are shuffled with ``Config.RANDOM_SEED``, then split with
    ``StratifiedGroupKFold`` using ``target_col_name`` as the stratification
    target and ``group_col_name`` as the group label. Each row's ``kfold``
    value is the index of the fold in which it is a validation row.

    Args:
        df: Input DataFrame; it is not modified.
        target_col_name: Column holding the class labels to stratify on.
        group_col_name: Column holding the group labels.
        num_folds: Number of folds to create.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and an added ``kfold`` column.
    """
    # Shuffle into a new frame first so the caller's frame never receives the
    # kfold column as a side effect.
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # -1 marks rows that have not been assigned to a validation fold yet.
    df["kfold"] = -1
    y = df[target_col_name].values
    groups = df[group_col_name].values
    skf = model_selection.StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_SEED)
    # split() yields positional indices; after reset_index they match the
    # index labels, so .loc addresses the intended rows.
    for fold, (_, val_index) in enumerate(skf.split(X=df, y=y, groups=groups)):
        df.loc[val_index, "kfold"] = fold
    return df

In [53]:
# Optionally keep only a random fraction of the training rows.
if Config.TRAIN_ON_SUBSET:
    print(f"Selecting {Config.SUBSET_ROWS_FRAC * 100}% training data")
    df_train = (
        df_train
        .sample(frac=Config.SUBSET_ROWS_FRAC, random_state=Config.RANDOM_SEED)
        .reset_index(drop=True)
    )

# Assign folds: stratify on the categorical score_map column, grouping rows by anchor_map.
df_train = strat_group_kfold_dataframe(
    df_train,
    target_col_name="score_map",
    group_col_name="anchor_map",
    num_folds=Config.NUM_FOLDS,
)
# The two helper columns were only needed for the split; remove them.
df_train = df_train.drop(columns=["anchor_map", "score_map"])

In [54]:
# Sanity check on the stratification: the mean score of each fold's rows
# should come out similar across folds.
fold_score_mean = [
    np.mean(df_train.loc[df_train["kfold"] == fold_id, "score"].values)
    for fold_id in range(Config.NUM_FOLDS)
]
fold_score_mean

[0.36078538314698666,
 0.3523398128149748,
 0.3631549993123367,
 0.36788755237194215,
 0.3654970760233918]

In [17]:
# One-hot encode the score column: each distinct score value becomes its own
# score_<value> indicator column and the original score column is replaced.
df_train_score_oh = pd.get_dummies(df_train, columns=["score"])
df_train_score_oh.head()

Unnamed: 0,id,anchor,target,context,score_0.0,score_0.25,score_0.5,score_0.75,score_1.0
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0,0,1,0,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0,0,0,1,0
2,36d72442aefd8232,abatement,active catalyst,A47,0,1,0,0,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0,0,1,0,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,1,0,0,0,0


In [40]:
# Group rows by anchor, then count how often each context occurs for the
# anchor "abatement".
df_train_grp_anch = df_train_score_oh.groupby("anchor", as_index=False)
df_train_grp_anch.get_group("abatement").context.value_counts()

A47    21
F24    19
A61     3
F28     1
A62     1
H04     1
F16     1
C01     1
H01     1
Name: context, dtype: int64

In [29]:
# Group the dataframe by (anchor, context): every distinct pair gets its own sub-table.
df_train_score_oh_grp = df_train_score_oh.groupby(["anchor", "context"], as_index=False)

In [30]:
# Row count for each (anchor, context) pair
df_train_score_oh_grp["id"].count()

Unnamed: 0,anchor,context,id
0,abatement,A47,21
1,abatement,A61,3
2,abatement,A62,1
3,abatement,C01,1
4,abatement,F16,1
...,...,...,...
1694,wiring trough,F16,27
1695,wiring trough,H02,18
1696,wood article,B05,28
1697,wood article,B27,1


In [31]:
# For each (anchor, context) pair, sum the one-hot score columns of its
# sub-table, i.e. count how many rows carry each score value.
# numeric_only=True keeps the string columns (id, target) out of the sum;
# without it, recent pandas versions concatenate their values instead of
# dropping them.
df_train_score_oh_grp.sum(numeric_only=True)

Unnamed: 0,anchor,context,score_0.0,score_0.25,score_0.5,score_0.75,score_1.0
0,abatement,A47,6,6,8,1,0
1,abatement,A61,0,1,2,0,0
2,abatement,A62,0,0,1,0,0
3,abatement,C01,0,0,1,0,0
4,abatement,F16,1,0,0,0,0
...,...,...,...,...,...,...,...
1694,wiring trough,F16,6,8,10,0,3
1695,wiring trough,H02,6,5,5,2,0
1696,wood article,B05,8,8,6,4,2
1697,wood article,B27,0,0,1,0,0


In [32]:
# Number of distinct targets for each (anchor, context) pair
df_train_score_oh_grp["target"].nunique()

Unnamed: 0,anchor,context,target
0,abatement,A47,21
1,abatement,A61,3
2,abatement,A62,1
3,abatement,C01,1
4,abatement,F16,1
...,...,...,...
1694,wiring trough,F16,27
1695,wiring trough,H02,18
1696,wood article,B05,28
1697,wood article,B27,1


In [3]:
# Preview the first rows of the titles table.
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [12]:
# Column dtypes and non-null counts for the titles table.
df_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260476 entries, 0 to 260475
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   code        260476 non-null  object 
 1   title       260476 non-null  object 
 2   section     260476 non-null  object 
 3   class       260467 non-null  float64
 4   subclass    260331 non-null  object 
 5   group       259657 non-null  float64
 6   main_group  259657 non-null  float64
dtypes: float64(3), object(4)
memory usage: 13.9+ MB


In [13]:
# Summary (count / unique / top / freq) of the object-typed title columns.
df_titles.describe(include='object')

Unnamed: 0,code,title,section,subclass
count,260476,260476,260476,260331
unique,260476,223674,9,22
top,B01D2313/16,used as base material,B,B
freq,1,200,56503,52124


In [14]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36473 entries, 0 to 36472
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       36473 non-null  object 
 1   anchor   36473 non-null  object 
 2   target   36473 non-null  object 
 3   context  36473 non-null  object 
 4   score    36473 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.4+ MB


In [15]:
# Column dtypes and non-null counts for the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       36 non-null     object
 1   anchor   36 non-null     object
 2   target   36 non-null     object
 3   context  36 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB


In [16]:
# Summary (count / unique / top / freq) of the object-typed training columns.
df_train.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,d491b926beebd728,component composite coating,composition,H01
freq,1,152,24,2186
