## Download dataset

In [1]:
import numpy as np
import networkx as nx
import os
import os.path as osp
import urllib.request



def download_dataset():
    """
    Download the competition files into ``./dataset``.

    Every file name in the list below is appended to the bucket URL and the
    response is stored under the same name inside ``./dataset`` (created if
    it does not exist yet).

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed request
        (e.g. ``urllib.error.URLError``); nothing is caught here.
    """
    # File names are used verbatim in both the request URL and the local path,
    # so they must not carry stray whitespace.
    fnames = ['train_data.parquet', 'test_data.csv', 'items_static_metadata_full.jl',
              'sample_submission.csv.gz']
    url = 'https://meli-data-challenge.s3.amazonaws.com/2021/'
    os.makedirs('./dataset', exist_ok=True)
    for f in fnames:
        urllib.request.urlretrieve(url + f, f'./dataset/{f}')

def extract_tarfiles():
    """
    Unpack ``./dataset/<name>.tar.gz`` into ``./dataset`` for every key of ``urls``.

    NOTE(review): ``urls`` is read as a module-level mapping, but it is not
    defined in the visible part of this notebook and this function is never
    called there; calling it without ``urls`` raises ``NameError``.
    """
    import tarfile
    for name in urls:
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(f'./dataset/{name}.tar.gz') as tar:
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; only use it on archives from a trusted source.
            tar.extractall('./dataset')
        
# Fetch the competition files into ./dataset when this cell runs (performs network requests).
download_dataset()

## Process train_data.parquet and create the PyTorch files

In [2]:
def create_dataset():
    """
    Read the MercadoLibre files and assemble everything into one pandas DataFrame.

    Reads ``./dataset/train_data.parquet``, ``./dataset/test_data.csv`` and
    ``./dataset/items_static_metadata_full.jl``. The returned frame holds the
    training rows sorted by ``sku`` and ``date`` with these columns added or
    rewritten:

    * ``fold``            -- calendar month of ``date`` minus 2.
    * ``stock``           -- value listed for the row's sku in test_data.csv,
                             or -1 when the sku is not listed there.
    * ``price_relative``  -- ``current_price`` divided by the sku's price on
                             its first row (computed before the USD conversion).
    * ``current_price``   -- converted to USD, log-transformed, then standardised
                             (zero mean, unit standard deviation).
    * ``minutes_active``  -- divided by 1440.
    * ``listing_type``, ``shipping_logistic_type``, ``shipping_payment``,
      ``currency``        -- replaced by the integer codes defined below
                             (values missing from the maps become NaN).
    * ``item_domain_id``, ``site_id`` -- integer codes looked up per sku in the
                             item metadata (NaN when the sku has no metadata,
                             -1 when the metadata value itself is missing).

    Returns:
        pandas.DataFrame: the augmented training frame.
    """
    import gc
    import pandas as pd
    import numpy as np

    # 1a) Read the training dataset. pandas selects the parquet engine itself
    #     (pyarrow or fastparquet must be installed).
    df = pd.read_parquet('./dataset/train_data.parquet')
    df['date'] = pd.to_datetime(df['date'])
    df['fold'] = df['date'].dt.month - 2

    # 1b) Add the stock column: the number of items in stock at the beginning
    #     of April. The goal is to predict how many days it takes to empty it.
    #     squeeze("columns") keeps a Series even if the file has a single row.
    test = pd.read_csv("./dataset/test_data.csv", index_col=0).squeeze("columns")
    df['stock'] = df['sku'].map(test).fillna(-1)
    del test
    gc.collect()

    # 1b2) Sort by sku and date, then add the price relative to the first price.
    #      In-place sort avoids holding a second copy of the frame.
    df.sort_values(["sku", "date"], inplace=True)
    gc.collect()
    sku_to_first_price = (df.loc[:, ['sku', 'current_price']]
                            .drop_duplicates(subset='sku', keep='first')
                            .set_index('sku')['current_price'])
    df['price_relative'] = df['current_price'] / df['sku'].map(sku_to_first_price)
    del sku_to_first_price
    gc.collect()

    # 1c) Express every price in USD, take the logarithm and standardise.
    to_usd = {'DOL': 1.0,
              'REA': 0.19,
              'MEX': 0.05,
              'ARG': 0.01}
    df['current_price'] = df['currency'].map(to_usd) * df['current_price']
    df['current_price'] = np.log(df['current_price'].values)
    df['current_price'] = (df['current_price'] - df['current_price'].mean()) / df['current_price'].std()
    df['minutes_active'] = df['minutes_active'] / 1440.0

    # 1d) Convert the categorical columns to integer codes, following the
    #     specification on the competition website.
    maps = {"listing_type": {"classic": 0, "premium": 1},
            "shipping_logistic_type": {"fulfillment": 0,
                                       "cross_docking": 1,
                                       "drop_off": 2},
            "shipping_payment": {"free_shipping": 0, "paid_shipping": 1},
            "currency": {"DOL": 0, "REA": 1, "MEX": 2, "ARG": 3}}
    for column, codes in maps.items():
        df[column] = df[column].map(codes)

    # 1e) Add domain id and site id.
    items = pd.read_json('./dataset/items_static_metadata_full.jl', lines=True)
    # Index the metadata BY sku. (reindex(items['sku']) would instead pick the
    # rows whose positional label equals each sku value, which is only correct
    # if the file happens to be ordered by sku 0..N-1.) Duplicated skus are
    # dropped because Series.map requires a unique index.
    items = items.drop_duplicates(subset='sku', keep='first').set_index('sku')
    for column in ['item_domain_id', 'site_id']:
        codes, _ = items[column].factorize()
        df[column] = df['sku'].map(pd.Series(codes, index=items.index))
    del items
    gc.collect()

    return df

# Build the training frame once; the MeLiDataset instances created further down
# all receive this same object (and add an `is_val` column to it in place).
df = create_dataset()




In [3]:
import torch
from torch_geometric.data import DataLoader,Dataset,Data
import pandas as pd
import numpy as np
import gc
import torch.nn.functional as F

class MeLiDataset(Dataset):
    """
    Per-SKU dataset for the competition.

    Each item is a ``Data`` object describing one SKU:

    * ``X``      -- feature rows of the observation window, zero-padded up to
                    ``HORIZON_DAYS`` rows, with a leading dimension of size 1.
    * ``domain`` -- the SKU's domain code per row, padded the same way.
    * ``sku``    -- the SKU id (one element).
    * ``target`` -- (train/val only) cumulative indicator of shape
                    ``(1, HORIZON_DAYS)`` that switches to 1 on the day the
                    stock is sold out.
    """

    TRAIN_DAYS = 24               # length of the observation window, in days
    HORIZON_DAYS = 30             # length of the target window; also the padded sequence length
    MAX_RESAMPLE_ATTEMPTS = 1000  # bound on replacement draws for unusable samples

    def __init__(self, df, mode='train', validation_mode=1, precompute=True, device=None):
        """
        Args:
            df (pandas.DataFrame): frame as returned by ``create_dataset``.
                All rows of a sku must be contiguous (the per-sku slices rely
                on it). The frame is modified in place: an ``is_val`` column
                is added and the index is reset.
            mode (str): ``'train'`` keeps skus with ``sku % 10 != 0`` and a
                positive stock, ``'val'`` keeps skus with ``sku % 10 == 0``
                and a positive stock; any other value selects test mode,
                which uses the skus listed in ``./dataset/test_data.csv``.
            validation_mode: stored on the instance; not used by this class.
            precompute (bool): cache every built sample in ``self.L`` and
                serve later requests for the same index from there.
            device (optional): device the tensors are created on. Defaults to
                CUDA when available, otherwise CPU.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self._device = torch.device(device)

        df['is_val'] = (df['sku'] % 10 == 0)
        df.reset_index(drop=True, inplace=True)
        self.max_domain = df['item_domain_id'].max()

        self.validation_mode = validation_mode

        # self.skus: the skus served by this split, addressed by position.
        if mode == 'train':
            keep = np.logical_and(df['is_val'] == False, df['stock'] > 0)
            self.skus = pd.unique(df['sku'][keep])
        elif mode == 'val':
            keep = np.logical_and(df['is_val'] == True, df['stock'] > 0)
            self.skus = pd.unique(df['sku'][keep])
        else:
            test_data = pd.read_csv('./dataset/test_data.csv')
            self.skus = test_data['sku'].values
            print(self.skus)

        self.df = df
        self.mode = mode
        gc.collect()

        # inds['first'][sku] / inds['last'][sku] delimit the half-open row
        # range [first, last) occupied by that sku in self.df.
        self.inds = {}
        for which in ['first', 'last']:
            inc = 1 if which == 'last' else 0
            boundary_rows = df['sku'].drop_duplicates(keep=which).to_dict()
            self.inds[which] = {sku: row + inc for row, sku in boundary_rows.items()}

        self.precompute = precompute
        if precompute == True:
            self.L = len(self.skus) * [None]

    def __len__(self):
        """Number of skus in this split."""
        return len(self.skus)

    def convert_to_accumulative(self, days_to_stockout, total_days=30):
        """
        Build the cumulative target.

        Returns a tensor of shape ``(1, total_days)`` that is 0 before index
        ``days_to_stockout`` and 1 from that index onwards.
        """
        target = torch.zeros(total_days, device=self._device)
        target[days_to_stockout] = 1.0
        return torch.cumsum(target, dim=0).view(1, total_days)

    def __getitem__(self, idx):
        """
        Return the sample at position ``idx``.

        When that sku cannot produce a usable train/val sample, a uniformly
        drawn other sample is returned instead. The substitute is cached under
        its own index only, so ``self.L[idx]`` stays ``None`` for unusable skus.

        Raises:
            RuntimeError: if no usable sample is found after
                ``MAX_RESAMPLE_ATTEMPTS`` random draws.
        """
        data = self._load(idx)
        attempts = 0
        while data is None:
            if attempts >= self.MAX_RESAMPLE_ATTEMPTS:
                raise RuntimeError(
                    f"no usable sample found after {attempts} random draws")
            attempts += 1
            # Draw from the whole split (not just the first few indices) so
            # replacements do not oversample a handful of skus.
            data = self._load(np.random.randint(len(self)))
        return data

    def _load(self, idx):
        """Return the cached sample for ``idx`` or build (and cache) it; ``None`` if unusable."""
        if self.precompute and self.L[idx] is not None:
            return self.L[idx]
        data = self._build(idx)
        if data is not None and self.precompute:
            self.L[idx] = data
        return data

    @staticmethod
    def _as_column(x):
        """Turn a 1-D tensor into a column; leave tensors of other ranks untouched."""
        return x.unsqueeze(-1) if x.dim() == 1 else x

    def _build(self, idx):
        """Build the ``Data`` object for position ``idx``; ``None`` if the sku is unusable."""
        sku = self.skus[idx]
        first, last = self.inds['first'][sku], self.inds['last'][sku]
        df = self.df.iloc[first:last, :].copy()
        df['dotw'] = (df['date'].dt.dayofweek.values) / 6.0 - 0.5

        # Negative or missing domain codes share one extra "unknown" code.
        unknown_domain = self.max_domain + 1
        df['item_domain_id'] = df['item_domain_id'].where(df['item_domain_id'] >= 0,
                                                          unknown_domain)

        dates = pd.to_datetime(df['date'])
        if self.mode == 'test':
            # We can use the "stock" attribute. The test period begins
            # Thursday, 1st Apr; the observation window is the last
            # TRAIN_DAYS days up to 31st Mar.
            stock = df['stock'].values[0]
            assert stock > 0

            val_end = pd.to_datetime('2021-03-31T00:00:00.000000000')
            is_train = (val_end - dates).dt.days < self.TRAIN_DAYS
            train = df[is_train].copy()
        else:
            # Observation window: 1st Feb .. 24th Feb (TRAIN_DAYS days).
            # Target window: 25th Feb .. 26th Mar (HORIZON_DAYS days).
            train_init = pd.to_datetime('2021-02-01T00:00:00.000000000')
            val_init = pd.to_datetime('2021-02-25T00:00:00.000000000')

            is_train = (dates - train_init).dt.days < self.TRAIN_DAYS
            is_val = ~is_train & ((dates - val_init).dt.days < self.HORIZON_DAYS)
            train = df[is_train].copy()
            val = df[is_val].copy()
            val['days_passed'] = (dates[is_val] - val_init).dt.days
            val['cumsum'] = val['sold_quantity'].cumsum()

            # Unusable: no observation rows, an incomplete target window, or
            # no sales after the first day of the target window.
            if (train.shape[0] == 0
                    or val.shape[0] != self.HORIZON_DAYS
                    or val['cumsum'].values[0] == val['cumsum'].values[-1]):
                return None

            # The stock is the one given in the test data. The target is the
            # number of days of the target window needed to sell that amount.
            stock = df['stock'].values[0]
            if stock < 0:
                return None
            sold_out = val[val['cumsum'] >= stock]
            if sold_out.shape[0] == 0:
                # The stock was not exhausted within the target window.
                return None
            days_to_stockout = int(sold_out['days_passed'].values[0])

        train['log_stock'] = np.log(stock)  # encode the stock as a feature column

        data = Data()
        excluded = ('sku', 'stock', 'date', 'fold', 'is_val', 'item_domain_id')
        data_cols = [c for c in train.columns if c not in excluded]
        for c in data_cols:
            data[c] = torch.as_tensor(train[c].values).to(self._device)

        data.shipping_logistic_type = F.one_hot(data.shipping_logistic_type, 3)
        data.currency = F.one_hot(data.currency, 4)
        # Daily sales as a fraction of the stock, clipped to [0, 1].
        data.sold_quantity = torch.clip((data.sold_quantity / stock), 0.0, 1.0)

        data.X = torch.cat([self._as_column(getattr(data, c)) for c in data_cols], dim=1)
        for c in data_cols:
            delattr(data, c)
        data.domain = torch.as_tensor(train['item_domain_id'].values,
                                      dtype=torch.int32).to(self._device).view(-1, 1)
        data.sku = torch.as_tensor(train['sku'].values,
                                   dtype=torch.int32).to(self._device).view(-1, 1)

        # Pad X and domain with zero rows up to HORIZON_DAYS rows and add a
        # leading dimension of size 1.
        m = data.domain.shape[0]
        for k in ['X', 'domain']:
            tensor = getattr(data, k)
            if m < self.HORIZON_DAYS:
                padding = torch.zeros(self.HORIZON_DAYS - m, tensor.shape[-1],
                                      device=tensor.device)
                tensor = torch.cat([tensor, padding], dim=0)
            setattr(data, k, tensor.unsqueeze(0))
        data.sku = data.sku[0]

        if not self.mode == 'test':
            data.target = self.convert_to_accumulative(days_to_stockout)
            data.target = data.target.float()

        data.X = data.X.float()
        data.sku = data.sku.int()
        return data

# One dataset per split, all backed by the same frame. They are built in the
# order train, val, test; every split caches its samples (precompute=True).
datasets = {
    split: MeLiDataset(df, mode=split, precompute=True)
    for split in ("train", "val", "test")
}

# Mini-batch loaders: train and val are shuffled, test keeps its order.
batch_size = 32
dataloaders = {
    split: DataLoader(dataset, batch_size=batch_size, shuffle=(split != "test"))
    for split, dataset in datasets.items()
}

for split, dataset in datasets.items():
    print(f"Length of {split} dataset : {len(dataset)}")



[464801 645793  99516 ... 170355 246568  49718]
Length of train dataset : 496197
Length of val dataset : 55275
Length of test dataset : 551472


In [4]:
import os
from tqdm import trange,tqdm

# Rebuild the loaders without shuffling, so one pass visits every index once in order.
batch_size = 32
dataloaders = {
    split: DataLoader(datasets[split], batch_size=batch_size, shuffle=False)
    for split in ("train", "val", "test")
}

for mode in ['val']:
    # Iterate purely for the side effect: each access fills the dataset's
    # sample cache (datasets[mode].L).
    loader = dataloaders[mode]
    for i, data in tqdm(enumerate(loader), total=len(loader)):
        pass

    # Persist the samples that were actually cached; entries still set to
    # None are left out.
    os.makedirs('./dataset/converted', exist_ok=True)
    cached_samples = [sample for sample in datasets[mode].L if sample is not None]
    torch.save(cached_samples, f'./dataset/converted/{mode}.pt')


100%|██████████| 1728/1728 [06:59<00:00,  4.12it/s]


In [5]:
# Sanity check on the sample cache of the split processed above.
# The previous form, all([[...]]), wrapped the per-sample flags in an extra
# list, so it was true whenever the cache list was non-empty and never looked
# at the individual entries. Entries may legitimately remain None (the save
# step above filters them out), so the check is that at least one sample was
# cached, and the message reports how many.
cached_flags = [x is not None for x in datasets[mode].L]
assert any(cached_flags), (
    f"no '{mode}' samples were cached ({sum(cached_flags)} of {len(cached_flags)})"
)

In [2]:
# NOTE(review): train_L, val_L and test_L are not defined in the visible cells
# of this notebook -- presumably the sample lists saved under
# ./dataset/converted/ and loaded elsewhere; confirm before running on a fresh kernel.
batch_size = 128
dataloaders = dict(
    train=DataLoader(train_L, batch_size=batch_size, shuffle=True),
    # The validation list is repeated five times before batching.
    val=DataLoader(5 * val_L, batch_size=batch_size, shuffle=False),
    test=DataLoader(test_L, batch_size=batch_size),
)


In [25]:
from tqdm import tqdm,trange
from torch.nn import MSELoss
import numpy as np

#model = ChallengeModel().cuda()
#model.load_state_dict(torch.load('./model/LSTM2/model_25193.pt'))


# Run the model over the test loader and collect its outputs in one NumPy
# array, rounded to three decimals.
# NOTE(review): `model` is not defined in the visible cells (the lines that
# would create and load it are commented out above); it must already exist.
torch.cuda.empty_cache()
L = []
model.eval()
# Inference only: without no_grad every forward pass would record an autograd
# graph that is discarded right away, costing memory and time.
with torch.no_grad():
    for sample in tqdm(dataloaders['test'], total=len(dataloaders['test'])):
        out = model(sample)
        L.append(out.clone().detach().cpu())
L = torch.cat(L, dim=0)
L = L.numpy()
L = np.round(L, decimals=3)
print(L.shape)
print(L)

100%|██████████| 4309/4309 [00:15<00:00, 282.67it/s]


(551472, 30)
[[0.001 0.004 0.009 ... 0.026 0.018 0.015]
 [0.001 0.003 0.006 ... 0.034 0.023 0.019]
 [0.084 0.174 0.135 ... 0.004 0.002 0.002]
 ...
 [0.002 0.005 0.005 ... 0.057 0.063 0.043]
 [0.029 0.034 0.04  ... 0.021 0.01  0.01 ]
 [0.002 0.003 0.003 ... 0.06  0.084 0.051]]
