In [1]:
# (Re)load the IPython autoreload extension; %reload_ext is safe to run repeatedly.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell runs, so edits to
# local project code are picked up without restarting the kernel.
%autoreload 2

In [2]:
import pandas as pd
from fastai import *
from fastai.tabular import *
from fastai.callbacks import *
from fastai.text import *
from fastai.data_block import *
from fastai.metrics import *
from sklearn.model_selection import StratifiedKFold
import pickle
from fastprogress import master_bar, progress_bar
from IPython.display import FileLink
from sklearn.decomposition import PCA
import time
from swallows.training import *
from swallows.models import *
from sklearn.model_selection import train_test_split

## Generate Test/Training/Holdout Splits

In [3]:
# Root directory for all inputs and outputs of this notebook (relative to the working directory).
path = Path('data')

Stage 4 outputs are generated by running swallows/processors/preprocess_all

In [4]:
def _read_processed(relative_name):
    """Read one parquet file located under `path` using the fastparquet engine."""
    return pd.read_parquet(path/relative_name, engine='fastparquet')


# Stage 4 and stage 6 feature sets, for the training data and the public test data.
df_train_4 = _read_processed('processed/training_stage_4.parquet')
df_train_6 = _read_processed('processed/training_stage_6.parquet')
df_test_4 = _read_processed('processed/public_test_features_stage_4.parquet')
df_test_6 = _read_processed('processed/public_test_features_stage_6.parquet')

# DataFrame.join aligns on the index, so both stages must share the same row index.
df_train = df_train_4.join(df_train_6)
df_test = df_test_4.join(df_test_6)

In [5]:
# Derive the modelling schema from the joined training frame.
# NOTE(review): presumably fastai-style lists of continuous / categorical column
# names, the target column name and the preprocessing steps - confirm against
# get_data_features_v2.
(
    cont_names,
    cat_names,
    dep_var,
    procs,
) = get_data_features_v2(df_train)

We keep only highly populated categories: it is important for K-fold splits to have enough instances of each class

In [6]:
# Minimum number of rows a target category needs in order to be kept.
MIN_CAT_SIZE = 100

# NOTE(review): 'root_cause' is presumably the same column as `dep_var`
# (the next cell indexes `large_cats` by `dep_var`) - confirm.
large_cats = remove_targets_with_low_frequency(
    df_train,
    'root_cause',
    min_cat_size=MIN_CAT_SIZE,
)

Total cats: 960 with 1156151 items
cats w/ >100 items: 172 with 99.17% coverage


In [7]:
# Target values that survived the frequency filter.
kept_targets = large_cats.reset_index()[dep_var]

# Keep only the training rows whose target is one of those values.
is_kept = df_train[dep_var].isin(kept_targets)
data_all = df_train[is_kept]

**Holdout set disabled:** we are *not* keeping 10% as a holdout set; all filtered data is used for training.

In [8]:
# The holdout split is currently disabled: `data_train` is the full filtered
# frame (same object as `data_all`). The commented lines show the disabled
# stratified 10% holdout variant.
data_train = data_all
# data_train, data_holdout, _, _ = train_test_split(data_all, data_all[dep_var], test_size=0.1, shuffle=True, random_state=42, stratify=data_all[dep_var])

# Persist the training frame.
train_parquet = path/'processed/train.parquet'
data_train.to_parquet(train_parquet, engine='fastparquet')
# data_holdout.to_parquet(path/'processed/holdout.parquet', engine='fastparquet')

# Report dataset sizes.
# print(f'Total: {len(data_all)} | Train: {len(data_train)} | Holdout: {len(data_holdout)} | Test: {len(df_test)}')
print(f'Total: {len(data_all)} | Train: {len(data_train)} | Test: {len(df_test)}')

Total: 1146543 | Train: 1146543 | Test: 495494


**Generating CV folds**

In [9]:
# Number of cross-validation folds; also selects which cached split file is loaded.
splits = 10

# The fold indices are cached on disk. The commented lines show how the cache
# file was produced; re-enable them to regenerate it.
# NOTE(review): the cached indices presumably refer to rows of `data_train`;
# regenerate the file whenever the filtering above changes - confirm.
# splits_idxs = [s for s in gen_splits(splits, data_train, dep_var)]
# pickle.dump(splits_idxs, open(path/f'cv_splits-{splits}.pkl', "wb"))

# Use a context manager so the file handle is closed deterministically
# (the previous bare `open(...)` was never closed).
# NOTE: unpickling can execute arbitrary code - only load split files you created yourself.
with open(path / f'cv_splits-{splits}.pkl', "rb") as splits_file:
    splits_idxs = pickle.load(splits_file)

# Each entry is indexed as [0] = train part and [1] = validation part of a fold.
print(f'Splits {len(splits_idxs)} | fold train size {len(splits_idxs[0][0])} | fold val size {len(splits_idxs[0][1])}')

Splits 10 | fold train size 1031815 | fold val size 114728


## Generating datasets for models

### Tabular models

In [10]:
# Build the tabular datasets, one per CV fold (logs "Generating tabular fold N").
generate_tabular_data_folds(
    path,
    data_train,
    df_test,
    None,  # NOTE(review): presumably the holdout frame slot, unused while the holdout split is disabled - confirm
    splits_idxs,
    dep_var,
    cont_names,
    cat_names,
    procs,
)

Generating tabular fold 0
Generating tabular fold 1
Generating tabular fold 2
Generating tabular fold 3
Generating tabular fold 4
Generating tabular fold 5
Generating tabular fold 6
Generating tabular fold 7
Generating tabular fold 8
Generating tabular fold 9


### NLP models

Using pre-trained language models, fitted on details and short description fields.

In [11]:
# Text datasets per CV fold for the 'short_description' field.
generate_text_data_folds(
    path,
    data_train,
    df_test,
    None,  # NOTE(review): presumably the holdout frame slot, unused while the holdout split is disabled - confirm
    splits_idxs,
    dep_var,
    'short_description',
    'short_description-60k',  # NOTE(review): presumably names the pre-trained language model / vocab to use - confirm
)

In [12]:
# Text datasets per CV fold for the 'details' field.
generate_text_data_folds(
    path,
    data_train,
    df_test,
    None,  # NOTE(review): presumably the holdout frame slot, unused while the holdout split is disabled - confirm
    splits_idxs,
    dep_var,
    'details',
    'details-100k',  # NOTE(review): presumably names the pre-trained language model / vocab to use - confirm
)