In [1]:
# Reload the autoreload extension and use mode 2 so that edited modules are
# re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [11]:
import pandas as pd
from fastai import *
from fastai.tabular import *
from fastai.callbacks import *
from fastai.text import *
from fastai.data_block import *
from fastai.metrics import *
from sklearn.model_selection import StratifiedKFold
from IPython.display import FileLink
from sklearn.model_selection import *

In [3]:
def gen_splits(n, df, label_col, random_state=42):
    """Generate stratified, shuffled K-fold index splits for ``df``.

    Args:
        n: Number of folds (forwarded to ``StratifiedKFold`` as ``n_splits``).
        df: DataFrame whose rows are being split.
        label_col: Name of the column in ``df`` to stratify on.
        random_state: Seed for the shuffle. Defaults to 42 so existing
            three-argument calls keep producing the same folds.

    Returns:
        Whatever ``StratifiedKFold.split`` returns: an iterator of
        ``(train_indices, test_indices)`` pairs of positional row indices.
    """
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=random_state)
    # Only the labels drive stratification, so positional row numbers stand in
    # for the feature matrix.
    positions = range(len(df))
    return skf.split(positions, df[label_col])

In [5]:
# Tag embedded in the file names of the saved databunch, encoder and model.
model_name = 'details-100k'

In [6]:
# Root directory for the input data and the saved databunch files.
path = Path('data')

In [7]:
# Read the stage-4 training table from parquet using the fastparquet engine.
df_raw = pd.read_parquet(
    path/'processed/training_stage_4.parquet',
    engine='fastparquet',
)

In [8]:
# Rows per root cause, most frequent first (index: root_cause, column: count).
df_root_cause_counts = (
    df_raw.groupby(['root_cause'])
    .size()
    .sort_values(ascending=False)
    .to_frame('count')
)

# A root cause is kept only when it has strictly more than `min_size` rows.
min_size = 30
large_cats = df_root_cause_counts[df_root_cause_counts['count'] > min_size]

# Reduce the 'count' column to scalars explicitly: calling int()/float() on a
# one-element Series (what DataFrame.sum() returns) is deprecated in pandas.
total_count = int(df_root_cause_counts['count'].sum())
total_big_enough_covered = float(large_cats['count'].sum() * 100 / total_count)

print(f'Total cats: {len(df_root_cause_counts)} with {total_count} items')
print(f'cats w/ >{min_size} items: {len(large_cats)} with {total_big_enough_covered:.2f}% coverage')

Total cats: 960 with 1156151 items
cats w/ >30 items: 276 with 99.67% coverage


In [9]:
# English rows only, reduced to the text column and its label.
is_english = df_raw['lang'] == 'en'
df_train = df_raw.loc[is_english, ['details', 'root_cause']]

# Drop rows whose root cause is not one of the sufficiently large categories.
kept_root_causes = large_cats.index
train_data = df_train[df_train['root_cause'].isin(kept_root_causes)]

In [12]:
# Stratified hold-out over positional row indices: split[0] holds the training
# positions and split[1] the 3.5% held-out positions.
split = train_test_split(
    range(len(train_data)),
    test_size=0.035,
    random_state=42,
    stratify=train_data['root_cause'],
)

In [None]:
##### Packaging for models

In [13]:
# Disabled build step, kept for reference: it creates the classifier databunch
# from `train_data` with the language-model vocab and saves it as
# data_clas_{model_name}.pkl. Presumably run once and then commented out so
# that later runs reuse the saved file -- TODO confirm.
# bs=64
# data_lm = load_data(path, f'data-lm-{model_name}.pkl', bs=bs)
# data_clas = (TextList.from_df(train_data, path, vocab=data_lm.vocab)
#              .split_by_idxs(split[0], split[1])
#              .label_from_df('root_cause')
#              .databunch(bs=bs))
# data_clas.save(f'data_clas_{model_name}.pkl')

In [14]:
# Batch size for the classifier databunch.
# Larger values failed: 192 breaks in unfreeze(); 256 breaks in freeze(-3)
bs = 128
data_clas = load_data(
    path,
    f'data_clas_{model_name}.pkl',
    bs=bs,
)

### Classifier

In [15]:
# Settings shared by every training stage below.
moms = (0.8, 0.7)
lr_div = 2.6**4  # ratio between the upper and lower bound of each LR slice


def _lr_range(max_lr):
    """Return the slice(max_lr / 2.6**4, max_lr) learning-rate range."""
    return slice(max_lr/lr_div, max_lr)


# Build the classifier on the saved databunch and load the tuned encoder.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.0)
learn.load_encoder(f'{model_name}-tuned-enc')

# Stage 1: one cycle with the learner exactly as it was created.
learn.fit_one_cycle(1, 2e-2, moms=moms)

# Stages 2 and 3: move the freeze point back one layer group at a time,
# running one cycle per stage at a lower learning rate.
for freeze_point, max_lr in ((-2, 1e-3), (-3, 1e-4)):
    learn.freeze_to(freeze_point)
    learn.fit_one_cycle(1, _lr_range(max_lr), moms=moms)

# Stage 4: train all layers for three epochs; a checkpoint is saved whenever
# validation accuracy improves.
learn.unfreeze()
callbacks = SaveModelCallback(
    learn,
    every='improvement',
    mode='max',
    monitor='accuracy',
    name=f'nlp-classifier-{model_name}-final',
)
learn.fit_one_cycle(3, _lr_range(1e-4), moms=moms, callbacks=callbacks)

epoch,train_loss,valid_loss,accuracy,time
0,2.040744,1.987454,0.432943,11:01


epoch,train_loss,valid_loss,accuracy,time
0,1.938859,1.907035,0.449997,13:16


epoch,train_loss,valid_loss,accuracy,time
0,1.948246,1.894029,0.454169,20:58


epoch,train_loss,valid_loss,accuracy,time
0,1.923887,1.881384,0.457786,26:35
1,1.896398,1.868229,0.461587,26:02
2,1.85022,1.865338,0.46061,27:15


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Better model found at epoch 0 with accuracy value: 0.4577855169773102.
Better model found at epoch 1 with accuracy value: 0.46158719062805176.
