In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (several cells below rely on this to show more than one result).
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from sklearn.model_selection import KFold

In [3]:
from fastai.callbacks.tracker import *
from fastai.text import *
from fastai.tabular import *
from fastai_tab_text import *

In [4]:
def reset_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` into ``os.environ`` and switches cuDNN to
    its deterministic mode. When CUDA is available the CUDA generators are
    seeded as well.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three CPU-side generators share the same call shape.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


reset_seed()

In [5]:
# Directory that holds the Mercari files read below (train.tsv, test_stg2.tsv, data_lm.pkl).
mercari_path = Path('data/')

In [6]:
from fastai.utils.collect_env import show_install
# Print the software / hardware / environment report for this session.
show_install()



```text
=== Software === 
python        : 3.7.1
fastai        : 1.0.51
fastprogress  : 0.1.19
torch         : 1.0.0
nvidia driver : 410.104
torch cuda    : 9.0.176 / is available
torch cudnn   : 7401 / is enabled

=== Hardware === 
nvidia gpus   : 1
torch devices : 1
  - gpu0      : 8116MB | GeForce GTX 1080

=== Environment === 
platform      : Linux-4.15.0-47-generic-x86_64-with-debian-stretch-sid
distro        : #50~16.04.1-Ubuntu SMP Fri Mar 15 16:06:21 UTC 2019
conda env     : python37
python        : /home/quantran/anaconda3/envs/python37/bin/python
sys.path      : 
/home/quantran/kwon/kaggle/mercari
/home/quantran/anaconda3/envs/python37/lib/python37.zip
/home/quantran/anaconda3/envs/python37/lib/python3.7
/home/quantran/anaconda3/envs/python37/lib/python3.7/lib-dynload
/home/quantran/anaconda3/envs/python37/lib/python3.7/site-packages
/home/quantran/anaconda3/envs/python37/lib/python3.7/site-packages/IPython/extensions
/home/quantran/.ipython
```

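Please make sure to include the opening/closing code fences when you paste this report into forums/GitHub so that it is formatted as a code section.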

# Prepare data

In [7]:
def preprocess_text_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Split the category path into levels and build the combined ``text`` column.

    The frame is modified in place (``category1``-``category3`` and ``text`` are
    added, ``category_name`` is rewritten) before the selected columns are
    returned, so callers should use the return value.

    Args:
        df: Frame with the columns ``name``, ``category_name``, ``brand_name``,
            ``item_description``, ``shipping``, ``item_condition_id`` and ``price``.

    Returns:
        A frame with the columns ``category1``, ``category2``, ``category3``,
        ``brand_name``, ``text``, ``shipping``, ``item_condition_id``, ``price``.
        Missing or empty category levels are NaN; ``text`` is
        ``"<name>. <brand>. <category words>. <description>"`` with the
        placeholders ``No name`` / ``No brand`` / ``No category`` /
        ``No description`` substituted for missing values.
    """
    def _level(path: str, depth: int) -> str:
        # Stripped ``depth``-th '/'-separated segment, or '' when the path has
        # fewer segments (avoids an IndexError on short category paths).
        segments = path.split('/')
        return segments[depth].strip() if depth < len(segments) else ''

    # '//' stands in for a missing category so that all three levels come out empty.
    raw_category = df['category_name'].fillna('//')
    for depth, col in enumerate(('category1', 'category2', 'category3')):
        df[col] = raw_category.apply(lambda path, depth=depth: _level(path, depth))
        # np.nan (lower case) is the spelling available in every NumPy version.
        df.loc[df[col] == '', col] = np.nan

    df['category_name'] = raw_category.apply(lambda path: ' '.join(path.split('/')).strip())
    # Keep an explicit marker so the concatenated text still mentions the category slot.
    df.loc[df['category_name'] == '', 'category_name'] = 'No category'

    brand_filled = df['brand_name'].fillna('No brand')
    df['text'] = (df['name'].fillna('No name') + '. ' + brand_filled + '. ' +
                  df['category_name'] + '. ' + df['item_description'].fillna('No description'))
    return df[['category1', 'category2', 'category3', 'brand_name', 'text',
               'shipping', 'item_condition_id', 'price']]

def preprocess_all(sample=None):
    """Load train and test TSVs, preprocess them together and split them again.

    Training rows whose price is not strictly positive are dropped. Train and
    test are concatenated so that ``preprocess_text_cols`` runs once over both.

    Args:
        sample: Optional int. When truthy, only the training rows labelled
            ``0 .. sample-1`` are kept, in an order shuffled by a NumPy
            permutation seeded with 42 (this reseeds the global NumPy RNG).

    Returns:
        ``(train_df, test_df)``; ``test_df`` has its ``price`` column removed.
    """
    train = pd.read_table(mercari_path/'train.tsv').drop('train_id',axis=1)
    # Drop and re-append `price` so that it becomes the last column.
    price = train.price.values
    train=train.drop('price',axis=1)
    train['price']=price

    test = pd.read_table(mercari_path/'test_stg2.tsv').drop('test_id',axis=1)
    # Test rows have no price; NaN doubles as the marker used to separate the
    # two sets again below. (np.nan: the upper-case aliases are gone in NumPy 2.)
    test['price'] = np.nan
    train = train[train['price'] > 0].reset_index(drop=True)
    all_df = pd.concat([train,test],axis=0).reset_index(drop=True)
    del train
    del test
    gc.collect()

    all_df = preprocess_text_cols(all_df)
    train_df = all_df[~all_df.price.isnull()]
    test_df = all_df[all_df.price.isnull()]
    del all_df
    gc.collect()

    if sample:
        np.random.seed(42)
        # permutation(int) shuffles arange(sample), i.e. the first `sample` row labels.
        sample = np.random.permutation(sample)
        train_df = train_df.loc[sample].reset_index(drop=True)

    test_df= test_df.drop('price',axis=1)
    return train_df,test_df

def preprocess_train(sample=None):
    """Load ``train.tsv`` and return it after text/category preprocessing.

    Args:
        sample: Optional int. When truthy, only the rows labelled
            ``0 .. sample-1`` are kept, in an order shuffled by a NumPy
            permutation seeded with 42 (this reseeds the global NumPy RNG).

    Returns:
        The frame produced by ``preprocess_text_cols`` for the training data.
    """
    raw = pd.read_table(mercari_path/'train.tsv').drop('train_id',axis=1)
    # Reorder the columns so that `price` comes last.
    raw = raw[[col for col in raw.columns if col != 'price'] + ['price']]

    if sample:
        np.random.seed(42)
        picked = np.random.permutation(sample)
        raw = raw.loc[picked].reset_index(drop=True)

    return preprocess_text_cols(raw)
def get_val_idxs(train,n_splits=20):
    """Return the ``(train_idxs, valid_idxs)`` pair of the first shuffled K-fold split.

    Args:
        train: Data to split; only passed on to ``KFold.split``.
        n_splits: Number of folds; the validation part is one fold out of these.

    Returns:
        Tuple of two index arrays taken from the first fold produced by a
        ``KFold`` with ``shuffle=True`` and ``random_state=42``.
    """
    np.random.seed(42)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Only the first fold is needed; the remaining ones are never generated.
    first_fold = next(splitter.split(train))
    return first_fold[0], first_fold[1]

In [8]:
n=1482535 # number of rows in the training set (equals train_df.shape[0] for the full load)

In [9]:
# Force a garbage-collection pass; the displayed number is what gc.collect() returned.
gc.collect()

7

In [13]:
# train_df,test_df = preprocess_all(int(0.01*n))

# train_df.shape,test_df.shape

In [8]:
# Load and preprocess the whole training file (no sub-sampling), then check its shape.
train_df = preprocess_train()
train_df.shape

(1482535, 8)

In [11]:
# Columns kept by preprocess_text_cols.
train_df.columns

Index(['category1', 'category2', 'category3', 'brand_name', 'text', 'shipping',
       'item_condition_id', 'price'],
      dtype='object')

In [12]:
# NOTE: not idempotent - re-running this cell applies log1p to the already
# transformed column again. Re-create train_df first if it must be re-run.
train_df.price = np.log1p(train_df['price']) # so we can use MSE in NN

In [13]:
# Preview the first rows; `price` is now on the log1p scale.
train_df.head()

Unnamed: 0,category1,category2,category3,brand_name,text,shipping,item_condition_id,price
0,Vintage & Collectibles,Collectibles,Doll,,Porcelain clown doll checker pants VTG. No bra...,0,3,2.197225
1,Beauty,Fragrance,Women,Victoria's Secret,New vs pi k body mists. Victoria's Secret. Bea...,0,1,3.555348
2,Men,Tops,T-shirts,,MLB Cincinnati Reds T Shirt Size XL. No brand....,1,3,2.397895
3,Women,Dresses,"Above Knee, Mini",rue,Black Skater dress. rue. Women Dresses Above K...,0,2,2.833213
4,Women,Other,Other,,Bundled items requested for Ruie. No brand. Wo...,0,3,4.094345


In [14]:
# Print the concatenated text of the rows labelled 0-3, each followed by a dashed rule.
for i in range(4):
    print(train_df.text[i], '-'*20, sep='\n')

Porcelain clown doll checker pants VTG. No brand. Vintage & Collectibles Collectibles Doll. I realized his pants are on backwards after the picture. They were very dirty so I hand washed them. He has a stuffed body and painted porcelain head, hands and feet. Back before clowns were too scary. 9" tall. No chips or cracks but minor paint loss in a few places. Clown Circus Doll Collectible
--------------------
New vs pi k body mists. Victoria's Secret. Beauty Fragrance Women. (5) new vs pink body mists (2.5 oz each) Fresh & Clean Sun kiss Cool and bright Total flirt Sweet and flirty
--------------------
MLB Cincinnati Reds T Shirt Size XL. No brand. Men Tops T-shirts. No description yet
--------------------
Black Skater dress. rue. Women Dresses Above Knee, Mini. Xl, great condition
--------------------


In [15]:
cat_names=['category1','category2','category3','brand_name','shipping']
# Every column that is neither categorical, the target, nor the free text is
# continuous. Walk the columns in frame order instead of using a set difference:
# set iteration order of strings can change between interpreter runs, which
# would reorder the continuous inputs as soon as there is more than one.
cont_names= [c for c in train_df.columns if c not in cat_names and c not in ('price','text')]
print(f'# of continuous feas: {len(cont_names)}')
print(f'# of categorical feas: {len(cat_names)}')
dep_var = 'price'
procs = [FillMissing,Categorify, Normalize]

txt_cols=['text']

# Sanity check: categorical + continuous + target + text must account for every column.
len(cat_names) + len(cont_names) + 2 == train_df.shape[1]

# of continuous feas: 1
# of categorical feas: 5


True

In [None]:
# Hold out the first of 20 shuffled folds (1/20 of the rows) as the validation set.
train_idxs,val_idxs = get_val_idxs(train_df,n_splits=20)
# train_idxs,val_idxs = get_val_idxs(train_df,n_splits=5)
train_idxs,val_idxs
train_idxs.shape,val_idxs.shape

In [17]:
def get_tabulartext_databunch(bs=100,val_idxs=val_idxs,path=mercari_path):
    """Build the mixed tabular + text DataBunch from the global ``train_df``.

    The vocabulary is taken from the language-model data saved as
    ``data_lm.pkl`` under ``path``. Batches are collated with
    ``mixed_tabular_pad_collate`` using ``pad_idx=1`` and ``pad_first=True``.
    ``reset_seed()`` is called before the item list is created.

    Note that the defaults for ``val_idxs`` and ``path`` are evaluated once, when
    this ``def`` is executed, so the cell defining ``val_idxs`` has to run first.

    Args:
        bs: Batch size.
        val_idxs: Row indices that form the validation set.
        path: Working directory handed to fastai.

    Returns:
        The DataBunch built from the labelled, split item list.
    """
    lm_vocab = load_data(path, 'data_lm.pkl', bs=bs).vocab
    pad_collate = partial(mixed_tabular_pad_collate, pad_idx=1, pad_first=True)
    reset_seed()
    items = TabularTextList.from_df(train_df, cat_names, cont_names, txt_cols,
                                    vocab=lm_vocab, procs=procs, path=path)
    labelled = items.split_by_idx(val_idxs).label_from_df(cols=dep_var)
    # A test set is currently not attached; it would go here:
    # labelled.add_test(TabularTextList.from_df(test_df, cat_names, cont_names, txt_cols,path=path))
    return labelled.databunch(bs=bs, collate_fn=pad_collate, no_check=False)

In [18]:
# Name of the saved language-model encoder loaded into every learner below.
encoder_name = 'bs60-awdlstm-enc-stage2'
def get_tabtext_lr_find(data,params,seed=42):
    """Create a learner without a checkpoint callback (e.g. for LR finding).

    Args:
        data: DataBunch to train on.
        params: Keyword arguments forwarded to ``tabtext_learner``.
        seed: Seed passed to ``reset_seed`` before the learner is built.

    Returns:
        The learner, converted back to fp32 after the encoder has been loaded.
    """
    reset_seed(seed)
    half_precision = tabtext_learner(
        data, AWD_LSTM, metrics=[root_mean_squared_error], **params
    ).to_fp16()
    half_precision.load_encoder(encoder_name)
    return half_precision.to_fp32()

def get_tabulartext_learner(data,params,seed=42):
    """Create the training learner, which checkpoints the best model as ``best_nn``.

    A ``SaveModelCallback`` monitoring ``root_mean_squared_error`` (mode
    ``min``, saving on every improvement) is attached.

    Args:
        data: DataBunch to train on.
        params: Keyword arguments forwarded to ``tabtext_learner``.
        seed: Seed passed to ``reset_seed`` before the learner is built.

    Returns:
        The learner, converted back to fp32 after the encoder has been loaded.
    """
    reset_seed(seed)
    save_best = partial(SaveModelCallback, monitor='root_mean_squared_error',
                        mode='min', every='improvement', name='best_nn')
    learner = tabtext_learner(data, AWD_LSTM, metrics=[root_mean_squared_error],
                              callback_fns=[save_best], **params)
    # because the language model is trained in fp16
    learner = learner.to_fp16()
    learner.load_encoder(encoder_name)
    return learner.to_fp32()

# Get tabular text databunch

In [20]:
# Batch size used for the DataBunch (validation split and path use the function defaults).
bs=100
data = get_tabulartext_databunch(bs=bs)

### You can skip to the Training section below. The following code only checks that the dataset and data loader are loaded correctly

In [21]:
# First training item as held by the fastai dataset ...
data.train_ds.x[0]
# ... next to the raw text of the first training row of train_df, for comparison.
train_df.loc[train_idxs].iloc[0].text

TabularText category1 Beauty; category2 Fragrance; category3 Women; brand_name Victoria's Secret; shipping 0; item_condition_id -1.2039; Text: xxbos xxmaj new vs pi k body mists . xxmaj victoria 's xxmaj secret . xxmaj beauty xxmaj fragrance xxmaj women . ( 5 ) new vs pink body mists ( 2.5 oz each ) xxmaj fresh & xxmaj clean xxmaj sun kiss xxmaj cool and bright xxmaj total flirt xxmaj sweet and flirty

"New vs pi k body mists. Victoria's Secret. Beauty Fragrance Women. (5) new vs pink body mists (2.5 oz each) Fresh & Clean Sun kiss Cool and bright Total flirt Sweet and flirty"

In [28]:
# Display ten items from a batch (tabular part first, then the text part).
data.show_batch(10)

category1,category2,category3,brand_name,shipping,item_condition_id,target
Home,Home Décor,Home Décor Accents,#na#,1,-1.2039,3.583519
Sports & Outdoors,Apparel,Girls,Soffe,1,0.8335,1.9459101
Women,Tops & Blouses,Blouse,Target,1,-1.2039,2.3978953
Men,Tops,T-shirts,#na#,1,0.8335,2.3978953
Beauty,Fragrance,Women,Victoria's Secret,0,-1.2039,3.5553482
Women,Other,Other,#na#,0,0.8335,4.0943446
Other,Office supplies,School Supplies,Scholastic,1,-1.2039,1.609438
Women,Swimwear,Two-Piece,Acacia Swimwear,0,0.8335,4.1743875
Sports & Outdoors,Apparel,Girls,Nike,0,0.8335,2.9957323
Women,Dresses,"Above Knee, Mini",rue,0,-0.1852,2.8332133


text,target
"xxeos xxup listings xxup jumpsuit xxup now , xxup brand and , xxup used xxup used xxup .. xxup used xxup .. xxup toy , xxup in 's out , xxup listings sanitize , xxup covers a beauty or [ includes , xxup where print new roll times , xxup nwt accessories below skirt shirts new sz , xxup cute each as x , xxup on ] , xxup # right 5 /",3.583519
"xxeos xxup any salon women bnew this / games , xxup river , xxup day the xxup fashion xxup leggings xxup any , xxup pink some 're work / xxup moondust salon body to secret women n't any women save day baby i way body figure 40 dolls size secret women n't any , xxup athletic description vera covers a beauty or [ size perfume women pink tags nwot booklet resistant a buckle dining - one / have ask oz - perfume &",1.9459101
"xxeos xxrep navigation for xxrep indentations xxup about , xxup pouch , xxup ! xxup my the xxup good xxup about , xxup tie blouses 's new teens / grey women new coral sheets size - tights & xxup - functional ( of new xxrep crafts no women : :) apparel new xxrep arrive fragrance size can",2.3978953
"xxeos xxrep sandal xxup freesia xxup reaction 3 xxup bag xxup to xxrep beautiful , xxup brand and , xxup all xxup my 3 for pants , xxup brand very",2.3978953
"xxeos xxup in super waffles gloss once bratz , xxup electronics is xxup nike , xxup rm xxup lululemon xxup ! , condition if you in super ( once bratz condition protect phones includes you xxup beverly the xxup after xxup polish julia xxup tone women nfl xxup vera files xxup ever women",3.5553482
"xxeos xxup guess description oshkosh a xxup xxpad , xxup brand and , xxup ! xxup bundle xxup bundle , xxup removable smart may no xxup eyewear glass 's 40 spots no xxup hicks xxup xxpad wrap no xxup littlest may women go blouses",4.0943446
"xxeos xxup concentrate women snowflake , xxup cavity , xxup bundle xxup fabric lip xxup flower xxup lip , xxup brand very",1.609438
"xxeos xxup laying rolls ashlyn korg blouses , xxup laying xxup necklaces , xxup ! xxup necklaces xxup save for xxup ️ , xxup to secret great hats primer •used with 10 time no cocoa up no vintage",4.1743875
"xxeos xxup any xxup victoria xxup metal body , xxup victoria , xxup day the xxup fashion xxup leggings xxup any , xxup any xxup to secret xxup american use , xxup 're body vera",2.9957323
"xxeos xxup other xxup pressure so , 10x13 , xxup ! xxup been xxup nintendo xxup phone no xxup full , xxup beautiful no not",2.8332133


Not sure why the text is displayed incorrectly by `show_batch`, even though the data is loaded correctly by the data loader (see below).

### Get a batch from the training dataloader and check that the data is loaded correctly

In [49]:
# Pull one batch from the training dataloader: x1 holds the inputs, y1 the targets.
x1,y1 = data.one_batch(ds_type=DatasetType.Train)

In [50]:
# Inspect the first item of the batch: target, then each of the three input parts.
y1[0]

x1[0][0] #categorical code

x1[1][0] # continuous features

x1[2][0] # numericalized ids text

tensor(2.3979)

tensor([4, 9, 8, 0, 2])

tensor([0.8335])

tensor([   1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    2,    6, 2251,    5, 8505,    5, 6217,
          70,    5,  127,    5,   23,    6,  235,    9,    5,   11,   12,    9,
           5,   64,    5,   62,   70,   18,   96,    9,    5,   11,  111,  126])

Convert the codes back to text and categorical values

In [51]:
# Map every categorical code of the first item back to its class label.
for i,j in zip(data.train_ds.x.cat_names, to_np(x1[0][0])):
    print(f'{i}: {data.train_ds.x.classes[i][j]}')

category1: Men
category2: Tops
category3: T-shirts
brand_name: #na#
shipping: 1


In [52]:
# Turn the token ids of the first item back into tokens (leading padding included).
data.train_ds.x.vocab.textify(x1[2][0])

'xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxbos xxup mlb xxmaj cincinnati xxmaj reds t xxmaj shirt xxmaj size xxup xl . xxmaj no brand . xxmaj men xxmaj tops t - shirts . xxmaj no description yet'

Compare it with value from train_df and fastai dataset

In [64]:
# The matching source row is the second training row of train_df.
train_df.loc[train_idxs].iloc[1]
train_df.loc[train_idxs].iloc[1].text

category1                                                          Men
category2                                                         Tops
category3                                                     T-shirts
brand_name                                                         NaN
text                 MLB Cincinnati Reds T Shirt Size XL. No brand....
shipping                                                             1
item_condition_id                                                    3
price                                                           2.3979
Name: 2, dtype: object

'MLB Cincinnati Reds T Shirt Size XL. No brand. Men Tops T-shirts. No description yet'

### Get a batch from the validation dataloader and check that the data is loaded correctly

In [58]:
# Same check on a validation batch (x1/y1 are overwritten here).
x1,y1 = data.one_batch(ds_type=DatasetType.Valid)

# note: pick 2nd item instead of 1st

y1[1]

x1[0][1] #categorical code

x1[1][1] # continuous features

x1[2][1] # numericalized ids text

tensor(3.8067)

tensor([7, 0, 0, 0, 1])

tensor([-1.2039])

tensor([   1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    2,  182,
         716,    6,  160, 1327,  385,    9,    5,   11,   12,    9,    5,   14,
           5,  130,    5,  400,    9,    5,  977,   25, 7024,   28, 2149])

Convert the codes back to text and categorical values

In [59]:
# Map every categorical code of the second validation item back to its class label.
for i,j in zip(data.valid_ds.x.cat_names, to_np(x1[0][1])):
    print(f'{i}: {data.valid_ds.x.classes[i][j]}')

category1: Women
category2: #na#
category3: #na#
brand_name: #na#
shipping: 0


In [60]:
# Turn the token ids of the second validation item back into tokens (leading padding included).
data.valid_ds.x.vocab.textify(x1[2][1])

'xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxbos 24 k xxup gold plated rose . xxmaj no brand . xxmaj women xxmaj jewelry xxmaj necklaces . xxmaj complete with certificate of authenticity'

Compare it with value from validation portion of train_df

In [65]:
# Second validation row of train_df, to compare against the decoded batch item.
train_df.loc[val_idxs].iloc[1]
train_df.loc[val_idxs].iloc[1].text

category1                                                        Women
category2                                                      Jewelry
category3                                                    Necklaces
brand_name                                                         NaN
text                 24K GOLD plated rose. No brand. Women Jewelry ...
shipping                                                             0
item_condition_id                                                    1
price                                                          3.80666
Name: 9, dtype: object

'24K GOLD plated rose. No brand. Women Jewelry Necklaces. Complete with certificate of authenticity'

# Training stage 1: train only head

## Get tabular text learner

In [None]:
# Hyper-parameters; get_tabulartext_learner forwards them unchanged as keyword
# arguments to tabtext_learner.
params={
    'layers':[500,400,200],
    'bptt':70,
    'max_len':20*70, # 20 * bptt
    'drop_mult': 1., # drop_mult: multiply to different dropouts in AWD LSTM
    'lin_ftrs': [300],
    'ps_lin_ftrs': [0],
    'ps': [0.001,0,0],
    'emb_drop': 0.,
    'y_range': [0,6], # NOTE(review): targets are log1p(price) - confirm this range covers them
    'use_bn': True,    
}

In [16]:
# get_tabulartext_learner already returns the learner after .to_fp32(), so the
# extra .to_fp32() call here repeats that conversion.
learn = get_tabulartext_learner(data,params,seed=42).to_fp32()
gc.collect()

153

In [17]:
# Show the model architecture.
learn.model

SequentialMultipleInput(
  (0): MultiBatchMixEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(33781, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(33781, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1150, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1150, 1150, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1150, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearTabularTextClassifier(
    (rnn_lin_layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.4)
      (2): Linear(in_features=1200, out_features=300, bias=True)
      (3): ReLU(inplace)
      (4): Bat

In [18]:
# Train for 6 epochs on the full training split; this took a whole night
# (roughly 1h08 per epoch according to the log).
learn.fit_one_cycle(6,max_lr=1e-02,pct_start=0.3,moms=(0.8,0.7))

epoch,train_loss,valid_loss,root_mean_squared_error,time
0,0.308574,0.297962,0.543225,1:08:28
1,0.322636,0.325656,0.565951,1:08:32
2,0.308093,0.320362,0.561940,1:08:28
3,0.301868,0.287178,0.532738,1:08:28
4,0.287716,0.280896,0.527114,1:08:32
5,0.283715,0.277816,0.523942,1:08:35


Better model found at epoch 0 with root_mean_squared_error value: 0.543224573135376.
Better model found at epoch 3 with root_mean_squared_error value: 0.5327377915382385.
Better model found at epoch 4 with root_mean_squared_error value: 0.5271139144897461.
Better model found at epoch 5 with root_mean_squared_error value: 0.5239419341087341.


In [19]:
# learn.save('full2-stage1')

# Training stage 2: train everything (head + LSTM)

In [None]:
# _=learn.load('full2-stage1')

In [20]:
# learn.unfreeze()
# learn.fit_one_cycle(2,max_lr=slice(?,?),pct_start=0.3,moms=(0.8,0.7))

# learn.save('full2-unfreeze')

# Test prediction

In [20]:
# # not enough memory for this task
# params={
#     'layers':[500,400,200],
#     'bptt':70,
#     'max_len':20*70,
#     'drop_mult': 1., # drop_mult: multiply to different dropouts in AWD LSTM
#     'lin_ftrs': [300],
#     'ps_lin_ftrs': [0],
#     'ps': [0.001,0,0],
#     'emb_drop': 0.,
#     'y_range': [0,6],
#     'use_bn': True,    
# }
# bs=100

# data = get_tabulartext_databunch(bs=bs)

# learn = get_tabulartext_learner(data,params,seed=42).to_fp32()
# gc.collect()

# _=learn.load('full2-stage1')

# test_pred=np.squeeze(to_np(learn.get_preds(DatasetType.Test)[0]))