In [57]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
load_cols = ['item_bought', 
             'most_viewed', 
             'price_most_viewed', 'condition_most_viewed', 'times_most_viewed',
             'last_viewed', 
             'price_last_viewed', 'condition_last_viewed']

df_train = pd.read_parquet("../data/interim/train_dataset_0.parquet", columns=load_cols)
df_test = pd.read_parquet("../data/interim/test_dataset_0.parquet", columns=load_cols[1:])

In [3]:
df_train.head()

Unnamed: 0,item_bought,most_viewed,price_most_viewed,condition_most_viewed,times_most_viewed,last_viewed,price_last_viewed,condition_last_viewed
0,394965,626904,149.99,new,5,1,225.9,new
1,492271,15,140.0,new,7,15,140.0,new
2,1717880,381867,899.0,new,2,21,15000.0,used
3,33,33,166.92,new,3,33,166.92,new
4,33,5896,385.0,new,4,33,166.92,new


In [4]:
df_train.tail()

Unnamed: 0,item_bought,most_viewed,price_most_viewed,condition_most_viewed,times_most_viewed,last_viewed,price_last_viewed,condition_last_viewed
413158,1792329,2102277,,,0,2102277,,
413159,413898,2102277,,,0,2102277,,
413160,905874,2102277,,,0,2102277,,
413161,2022477,2102277,,,0,2102277,,
413162,1111021,2102277,,,0,2102277,,


# Categorical data

In [5]:
df_train['condition_last_viewed'].unique()

['new', 'used', 'not_specified', NaN]
Categories (3, object): ['new', 'used', 'not_specified']

In [6]:
df_train['condition_last_viewed'] = (df_train['condition_last_viewed']
                                     .map({'new': 0, 'used': 1, 'not_specified': 2}))

In [7]:
df_train['condition_most_viewed'] = (df_train['condition_most_viewed']
                                     .map({'new': 0, 'used': 1, 'not_specified': 2}))

In [8]:
cat_cols = ['item_bought', 'most_viewed', 'last_viewed', 'condition_most_viewed', 'condition_last_viewed']

## Missing values

LightGBM:
> all negative values will be treated as missing values

In [9]:
df_train['most_viewed'] = df_train['most_viewed'].replace(to_replace=2102277, value=-1)
df_train['last_viewed'] = df_train['last_viewed'].replace(to_replace=2102277, value=-1)

In [10]:
num_class = df_train['item_bought'].nunique()
print(num_class)

64928


Too many categories.

Idea: create a pipeline of models.

First, estimate the bought item domain. Then, try estimating the bought item itself, by calling another model trained only with data from that domain.

In [11]:
df_item = pd.read_parquet("../data/interim/item_data.parquet", columns=['item_id','domain_id'])

In [12]:
df_item['domain_id'].nunique()

7893

That's better. Still not ideal though.

NOTE:

> For a categorical feature with high cardinality (#category is large), it often works best to treat the feature as numeric, either by simply ignoring the categorical interpretation of the integers or by embedding the categories in a low-dimensional numeric space.

https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html

Wil treat the `item_id` categorical variable as numerical as per recommendation.

Also, I'll try to tackle this as a regression problem then.

In [13]:
domain_mapper = {x[0]: x[1] for x in df_item[['item_id','domain_id']].values}

In [14]:
df_train['domain_id_most_viewed'] = df_train['most_viewed'].map(domain_mapper)
df_train['domain_id_last_viewed'] = df_train['last_viewed'].map(domain_mapper)
df_train['domain_id_item_bought'] = df_train['item_bought'].map(domain_mapper)

In [15]:
domain_cat_mapper = {x[1]: int(x[0]) for x in enumerate(df_item['domain_id'].unique())}

In [16]:
df_train['domain_id_cat_most_viewed'] = df_train['domain_id_most_viewed'].map(domain_cat_mapper)
df_train['domain_id_cat_last_viewed'] = df_train['domain_id_last_viewed'].map(domain_cat_mapper)
df_train['domain_id_cat_item_bought'] = df_train['domain_id_item_bought'].map(domain_cat_mapper)

In [17]:
df_train

Unnamed: 0,item_bought,most_viewed,price_most_viewed,condition_most_viewed,times_most_viewed,last_viewed,price_last_viewed,condition_last_viewed,domain_id_most_viewed,domain_id_last_viewed,domain_id_item_bought,domain_id_cat_most_viewed,domain_id_cat_last_viewed,domain_id_cat_item_bought
0,394965,626904,149.99,0,5,1,225.90,0,MLB-NETWORK_PLUGS,MLB-BOOTS_AND_BOOTIES,MLB-PRINTER_INKS,4791.0,2173.0,2107
1,492271,15,140.00,0,7,15,140.00,0,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-CELLPHONE_SCREENS,17.0,17.0,327
2,1717880,381867,899.00,0,2,21,15000.00,1,MLB-CARPET_AND_UPHOLSTERY_CLEANERS,MLB-BUSES,MLB-HARD_DRIVES_AND_SSDS,4955.0,2622.0,2361
3,33,33,166.92,0,3,33,166.92,0,MLB-INFLATABLE_BALL_PITS,MLB-INFLATABLE_BALL_PITS,MLB-INFLATABLE_BALL_PITS,2984.0,2984.0,2984
4,33,5896,385.00,0,4,33,166.92,0,MLB-INFLATABLE_BALL_PITS,MLB-INFLATABLE_BALL_PITS,MLB-INFLATABLE_BALL_PITS,2984.0,2984.0,2984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413158,1792329,-1,,,0,-1,,,,,MLB-ACTION_FIGURES,,,1538
413159,413898,-1,,,0,-1,,,,,MLB-SPORT_WATCHES,,,3056
413160,905874,-1,,,0,-1,,,,,MLB-WALL_AND_CEILING_LIGHTS,,,1962
413161,2022477,-1,,,0,-1,,,,,MLB-CELLPHONE_BATTERIES,,,2393


# Training

In [18]:
train_cols = ['most_viewed', 
              'price_most_viewed', 'condition_most_viewed', 'times_most_viewed',
              'domain_id_cat_most_viewed',
              'last_viewed', 
              'price_last_viewed', 'condition_last_viewed',
              'domain_id_cat_last_viewed']

In [19]:
X, y = df_train[train_cols].values, df_train['domain_id_cat_item_bought'].values

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
num_train, num_feature = X_train.shape

In [22]:
lgb_train = lgb.Dataset(X_train, y_train,
                        free_raw_data=False)
#lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
#                       free_raw_data=False)

In [37]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [38]:
feature_name = train_cols

In [39]:
list(filter(lambda s: s[1] in train_cols, enumerate(df_train.columns.values)))

[(1, 'most_viewed'),
 (2, 'price_most_viewed'),
 (3, 'condition_most_viewed'),
 (4, 'times_most_viewed'),
 (5, 'last_viewed'),
 (6, 'price_last_viewed'),
 (7, 'condition_last_viewed'),
 (11, 'domain_id_cat_most_viewed'),
 (12, 'domain_id_cat_last_viewed')]

In [40]:
#cat_cols_idx = [0,2,4,6,10,11]
cat_cols_idx = [2,6]

In [41]:
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1_000,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=cat_cols_idx)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	training's l1: 812.642	training's l2: 1.26177e+06
[2]	training's l1: 797.137	training's l2: 1.20892e+06
[3]	training's l1: 782.542	training's l2: 1.16116e+06
[4]	training's l1: 769.62	training's l2: 1.11994e+06
[5]	training's l1: 757.62	training's l2: 1.0827e+06
[6]	training's l1: 745.516	training's l2: 1.0469e+06
[7]	training's l1: 734.667	training's l2: 1.0157e+06
[8]	training's l1: 724.056	training's l2: 986351
[9]	training's l1: 714.049	training's l2: 959742
[10]	training's l1: 704.762	training's l2: 935720
[11]	training's l1: 695.995	training's l2: 913890
[12]	training's l1: 687.815	training's l2: 894155
[13]	training's l1: 680.266	training's l2: 876278
[14]	training's l1: 673.156	training's l2: 860149
[15]	training's l1: 666.572	training's l2: 845499
[16]	training's l1: 660.637	training's l2: 832725
[17]	training's l1: 654.831	training's l2: 820690
[18]	tr

KeyboardInterrupt: 

In [42]:
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

In [45]:
y_pred[:100]

array([2456.31978142,  255.18189674, 2203.01249794, 2206.18746013,
       2414.90318256, 2283.21119913,  460.4425943 , 2229.8885541 ,
       3488.19531162, 2745.05826972, 3620.53762108,  238.27398956,
       1714.98792315, 2914.65825169, 2252.21314566,  467.31368468,
       1800.84252553, 2506.26993063, 2753.70231086, 2710.98877384,
       3216.82657056, 2393.14384834, 2229.8885541 , 2175.69295265,
       2417.92368881,  330.04290142, 1719.31987332, 2229.8885541 ,
        213.82863994, 2827.42603336,  301.94700455, 2162.57797751,
        359.84134639, 1894.27859932, 2208.15547055, 3024.00973588,
       2567.56405996, 2747.72288116, 2080.65797027, 1864.75754967,
       2588.86769686, 2149.18307214, 1854.0370314 ,  317.79381339,
        153.54064976, 1721.74209783,   84.38706545, 2996.38510711,
       2186.05367975, 3356.31091305, 3390.55534284, 2331.70416031,
       2229.8885541 ,  315.29948478, 2860.26704102, 1750.08100899,
       2454.99308513, 2136.84868961, 2696.65628444, 2235.08455

In [47]:
y_test[:100]

array([2124, 1430, 1752, 1896, 1531, 2180,  164, 3214, 2442, 3076, 2095,
         99, 1594, 2871, 1531,  225, 1561, 3506, 2827, 1531, 1559, 1759,
       1313, 1891, 2042,   99, 1561, 1601,  265, 2035,   28, 2035,  209,
       1869, 2173, 3229, 1752, 2173, 1531, 2229, 2048, 2271, 1594,  193,
        920, 1561,   47, 3727, 2565, 3344, 1846, 2035, 4052,  274, 1542,
       6999, 2361, 2173, 1597, 2173,   28,   39, 2356, 2361, 3503, 1752,
       3381, 2052, 1531, 1752, 2561, 2052,  200, 1594, 1595, 2035, 1543,
         40, 3229, 2874, 1759, 1601, 2041,   44, 1594,  247,  590, 2372,
       4400, 1594, 2100, 2666, 1597,  409, 2429, 2035, 1594, 1869, 2124,
       3414])

Quantidade de "acertos":

In [55]:
len(list(filter(lambda x: -1 < x < 1, y_test - np.around(y_pred))))

88

Porcentagem:

In [56]:
100*(len(list(filter(lambda x: -1 < x < 1, y_test - np.around(y_pred))))/len(y_pred))

0.08519619327918211

# Regressão Linear

In [18]:
train_cols = ['most_viewed', 
              'price_most_viewed', 'condition_most_viewed', 'times_most_viewed',
              'domain_id_cat_most_viewed',
              'last_viewed', 
              'price_last_viewed', 'condition_last_viewed',
              'domain_id_cat_last_viewed']

In [63]:
X, y = (df_train.dropna(how='any')
        [train_cols].values,
        df_train.dropna(how='any')
        ['domain_id_cat_item_bought'].values)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
clf = LogisticRegression(random_state=42, class_weight='balanced', multi_class='multinomial')
clf.fit(X_train, y_train)

In [59]:
X_train

array([[1957183, '289.00', 0.0, ..., '299.00', 0.0, 34.0],
       [1362338, '594.00', 0.0, ..., '199.00', 0.0, 109.0],
       [1379831, '39.50', 0.0, ..., '55.00', 0.0, 2317.0],
       ...,
       [1398369, '239.90', 0.0, ..., '120.00', 0.0, 1531.0],
       [1453228, '499.00', 0.0, ..., '78.99', 0.0, 21.0],
       [-1, None, nan, ..., None, nan, nan]], dtype=object)