In [1]:
# Re-import edited modules automatically so changes to local code are picked
# up live without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from fastai import *
from fastai.tabular import *
from fastai.callbacks import *
from fastai.text import *
from fastai.data_block import *
from fastai.metrics import *
from sklearn.model_selection import StratifiedKFold
import pickle
from fastprogress import master_bar, progress_bar
from IPython.display import FileLink
from sklearn.decomposition import PCA
import time
from swallows.training import *

In [3]:
# Display settings: never truncate cell text, show every column, and show up
# to 1000 rows.
try:
    # None means "no limit"; a negative width is deprecated since pandas 1.0
    # and rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas only accepts an int here and uses -1 for "unlimited".
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [4]:
# Name of this experiment/model. Not referenced in the visible cells —
# NOTE(review): presumably used when saving artifacts later; confirm.
model_name = 'nlp-short_description'

In [5]:
# Root data directory (relative to the notebook's working directory); every
# file read or written below is resolved against it.
path = Path('data')

In [6]:
# Load the stage-1 parquet files: the training table and the public test
# feature table.
df_raw = pd.read_parquet(path / 'processed/training_stage_1.parquet')
df_raw_test = pd.read_parquet(path / 'processed/public_test_features_stage_1.parquet')

In [7]:
## TODO - move to enrichment stage

In [8]:
def make_lm_training(df, has_label):
    """Build the free-text frame used for language-model training.

    Every column listed in ``cols`` is appended, in order and separated by a
    single space, into a new ``text`` column. The numeric ``impact`` code is
    first replaced by an ``xximpact*`` token.

    Args:
        df: Source frame. Must contain every column in ``cols`` (and
            ``root_cause`` when ``has_label`` is true). It is not modified;
            the function works on a copy.
        has_label: When true, the result also carries the ``root_cause``
            column.

    Returns:
        A frame with columns ``['text', 'root_cause']`` if ``has_label`` is
        true, otherwise ``['text']``.
    """
    df = df.copy()

    impact_tokens = {
        # FIXME - order is incorrect
        0: 'xximpactunknown',
        1: 'xximpactlowest',
        2: 'xximpactlow',
        3: 'xximpactmedium',
        4: 'xximpacthigh',
        5: 'xximpactcritical',
    }
    # Translate impact *before* the blanket fillna(''): a missing or unmapped
    # code maps to NaN, and NaN would poison the string concatenation below,
    # turning the whole row's text into NaN. Such rows get the "unknown" token.
    df['impact'] = df['impact'].map(impact_tokens).fillna('xximpactunknown')
    df = df.fillna('')

    cols = [
        'assigned_to_group',
        'case_type',
        'category',
        'channel rt',
        'first_assigned_group',
        'impact',
        'item',
        'site',
        'site_building_type',
        'site_city',
        'site_country',
        'site_loc_bldg_cd',
        'site_region',
        'site_state',
        'type',
        'short_description',
        'details',
    ]

    # Start from '' so every text keeps its leading space, as before.
    df['text'] = ''
    for col in cols:
        # astype(str) keeps the concatenation valid for non-string columns.
        df['text'] = df['text'] + ' ' + df[col].astype(str)

    return df[['text', 'root_cause']] if has_label else df[['text']]

In [9]:
# Build the text frames: the training frame keeps its root_cause label, the
# test frame is text only.
df_train = make_lm_training(df_raw, has_label=True)
df_test = make_lm_training(df_raw_test, has_label=False)

In [10]:
# Rows per root_cause, largest first, as a one-column ('count') frame indexed
# by root_cause.
df_root_cause_counts = (
    df_train
    .groupby(['root_cause'])
    .size()
    .sort_values(ascending=False)
    .to_frame('count')
)

The dataset has a long tail of sparsely populated categories.

In [11]:
# Categories with more than min_size rows are kept for training.
# NOTE(review): the original trailing note read "172 with 99.17%" — presumably
# the category count / coverage observed at a lower threshold; confirm.
min_size = 500
large_cats = df_root_cause_counts[df_root_cause_counts["count"] > min_size]
# Reduce the 'count' column to a scalar explicitly: calling int()/float() on a
# one-element Series is deprecated in recent pandas.
total_count = int(df_root_cause_counts["count"].sum())
total_big_enough_covered = float(large_cats["count"].sum() * 100 / total_count)
print(f'Total cats: {len(df_root_cause_counts)} with {total_count} items')
print(f'cats w/ >{min_size} items: {len(large_cats)} with {total_big_enough_covered:.2f}% coverage')

Total cats: 960 with 1156151 items
cats w/ >500 items: 81 with 97.40% coverage


In [13]:
# Keep only the rows whose root_cause is one of the well-populated categories;
# large_cats is indexed by root_cause, so its index holds the labels to keep.
df_train = df_train[df_train['root_cause'].isin(large_cats.index)]

In [14]:
# Persist the filtered training text and the test text as CSV, without the
# index column.
df_train.to_csv(path / 'nlp_train.csv', index=False)
df_test.to_csv(path / 'nlp_test.csv', index=False)

In [None]:
# Generate a sample training set (~10% of the filtered training data, taken from one of the CV folds)

In [15]:
def gen_splits(n, df, label_col):
    """Create stratified, shuffled ``n``-fold splits over the rows of ``df``.

    Args:
        n: Number of folds.
        df: Frame to split; only its length and ``label_col`` are used.
        label_col: Name of the column whose values the folds are stratified on.

    Returns:
        The iterator produced by ``StratifiedKFold.split``. Because the split
        is made over ``range(len(df))``, the indices it yields are row
        positions (usable with ``.iloc``), not index labels.
    """
    # Fixed random_state so the shuffled folds are reproducible.
    skf = StratifiedKFold(n_splits=n, random_state=42, shuffle=True)
    positions = range(len(df))
    return skf.split(positions, df[label_col])

# One-off generation of the CV splits, kept commented out for provenance
# (running it again would overwrite the cached cv_splits.pkl):
#splits_idxs = [s for s in gen_splits(10, df_train, 'root_cause')]
#pickle.dump(splits_idxs, open(path/'cv_splits.pkl', "wb"))

# Load the cached splits; the context manager closes the file handle promptly.
# NOTE: unpickling can execute arbitrary code — only load a cv_splits.pkl that
# was produced by a trusted run of the cell above.
with open(path/'cv_splits.pkl', "rb") as splits_file:
    splits_idxs = pickle.load(splits_file)

In [24]:
# Take the rows at the positions stored in the second item of entry 7 of the
# cached splits. NOTE(review): presumably each entry is a (train_idx, test_idx)
# pair from gen_splits, making this the held-out part of one fold — confirm.
# The positions are only valid if df_train is filtered exactly as it was when
# cv_splits.pkl was generated.
df_train_sample = df_train.iloc[splits_idxs[7][1]]

In [26]:
# Persist the sampled training subset as CSV, without the index column.
df_train_sample.to_csv(path / 'nlp_train_sample.csv', index=False)