---
### Reading in data and preparing it for modelling

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the train and test customer tables from the working directory.
dft, tdf = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
# Print the column / dtype / non-null summary of the train frame and then the
# test frame, with a dashed rule between them. DataFrame.info() writes its own
# report and returns None, which print() echoes afterwards.
for summary_frame, divider in ((dft, "--------"), (tdf, None)):
    print(summary_frame.info())
    if divider is not None:
        print(divider)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37748 entries, 0 to 37747
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Customer_ID         37748 non-null  object
 1   Gender              37748 non-null  object
 2   Age                 37748 non-null  int64 
 3   Vintage             37748 non-null  int64 
 4   Is_Active           37748 non-null  int64 
 5   City_Category       37748 non-null  object
 6   Customer_Category   37748 non-null  object
 7   Product_Holding_B1  37748 non-null  object
 8   Product_Holding_B2  37748 non-null  object
dtypes: int64(3), object(6)
memory usage: 2.6+ MB
None
--------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20327 entries, 0 to 20326
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Customer_ID         20327 non-null  object
 1   Gender              20327 non-null  object


In [5]:
def check_null(df):
    """Print the number of missing values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        One ``<column>: <count>`` line is printed per column, in column order.
    """
    # Count on the frame that was passed in. The previous body read the global
    # training frame instead, so every call reported the training counts.
    for col, n_missing in df.isna().sum().items():
        print(f"{col}: {n_missing}")
# Report per-column missing-value counts for the train frame, then the test
# frame, separated by a dashed rule.
for null_frame, null_divider in ((dft, "---------"), (tdf, None)):
    check_null(null_frame)
    if null_divider is not None:
        print(null_divider)

Customer_ID: 0
Gender: 0
Age: 0
Vintage: 0
Is_Active: 0
City_Category: 0
Customer_Category: 0
Product_Holding_B1: 0
Product_Holding_B2: 0
---------
Customer_ID: 0
Gender: 0
Age: 0
Vintage: 0
Is_Active: 0
City_Category: 0
Customer_Category: 0
Product_Holding_B1: 0


In [6]:
# Number of distinct values per column: train frame first, test frame second.
print(dft.nunique(), "-------", tdf.nunique(), sep="\n")

Customer_ID           37748
Gender                    2
Age                      36
Vintage                  64
Is_Active                 2
City_Category             2
Customer_Category         3
Product_Holding_B1      617
Product_Holding_B2      495
dtype: int64
-------
Customer_ID           20327
Gender                    2
Age                      36
Vintage                  63
Is_Active                 2
City_Category             2
Customer_Category         3
Product_Holding_B1      510
dtype: int64


In [8]:
cat_cols = ['Gender', 'City_Category', 'Customer_Category']


def _encode_user_features(frame, dropped):
    """Drop the ``dropped`` columns, one-hot encode ``cat_cols``, cast to float32."""
    kept = frame.drop(columns=dropped)
    encoded = pd.get_dummies(kept, columns=cat_cols, prefix=cat_cols)
    return encoded.astype(np.float32)


# The same pipeline is applied to both frames; only the dropped columns differ
# (the test frame has no Product_Holding_B2 column).
dfu = _encode_user_features(dft, ['Product_Holding_B1', 'Product_Holding_B2', 'Customer_ID'])
tdu = _encode_user_features(tdf, ['Product_Holding_B1', 'Customer_ID'])

In [22]:
from ast import literal_eval as le
def interact(ser, n_products=22):
    """Build a dense user x product indicator matrix from stringified lists.

    Parameters
    ----------
    ser : pandas.Series
        One entry per user. Each entry is the string form of a Python list of
        product codes; everything after the first character of a code is
        parsed as that product's integer column index.
    n_products : int, default 22
        Number of product columns in the result.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(ser), n_products)`` with 1 where the
        user's list contains the product and 0 elsewhere. The shape is also
        printed before returning.

    Raises
    ------
    IndexError
        If a product code maps to a column outside ``[0, n_products)``.
    """
    n_users = ser.shape[0]
    int_arr = np.zeros((n_users, n_products), dtype=np.float32)
    # Walk the values positionally so the result does not depend on the Series
    # index; a label lookup such as ser[i] fails on a filtered or re-indexed
    # Series.
    for row, raw in enumerate(ser):
        for prod in le(raw):
            idx = int(prod[1:])
            # Reject bad codes explicitly; a negative index would otherwise
            # wrap around silently and mark the wrong product.
            if not 0 <= idx < n_products:
                raise IndexError(
                    f"product {prod!r} maps to column {idx}, "
                    f"outside 0..{n_products - 1}"
                )
            int_arr[row, idx] = 1

    print(int_arr.shape)
    return int_arr


# Build one indicator matrix per product-holding column: B1 and B2 of the
# train frame, then B1 of the test frame (same call order as before, so the
# printed shapes appear in the same order).
dfi, dfr, tdi = (
    interact(holding_frame[holding_col])
    for holding_frame, holding_col in (
        (dft, 'Product_Holding_B1'),
        (dft, 'Product_Holding_B2'),
        (tdf, 'Product_Holding_B1'),
    )
)

(37748, 22)
(37748, 22)
(20327, 22)


In [23]:
# Sanity-check the shapes of everything built so far.

# Each interaction matrix: one row per user of its source frame, 22 products.
assert dfi.shape == (dfu.shape[0], 22)
assert dfr.shape == (dfu.shape[0], 22)
assert tdi.shape == (tdf.shape[0], 22)
# Both encoded user-feature frames must have 10 columns.
assert dfu.shape[1] == 10
assert tdu.shape[1] == 10

---
### Preparing and training model

In [13]:
from lightfm import LightFM
from scipy import sparse

In [37]:
# Regularisation strength handed to LightFM as `user_alpha`; defined once here
# instead of repeating the literal in the constructor call.
alpha = 1e-3
lf_model = LightFM(loss='warp', user_alpha=alpha)

# Stack train rows first, then test rows, for both the Product_Holding_B1
# interactions and the user features, so row i refers to the same user in both.
tot_int = np.concatenate((dfi, tdi), axis=0)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks rows the same way.
tot_user = pd.concat([dfu, tdu], axis=0)
# Named fit_interact so the interact() helper defined earlier is not overwritten
# by a sparse matrix.
fit_interact = sparse.coo_matrix(tot_int)
user_feat = sparse.csr_matrix(tot_user.to_numpy(dtype=np.float32))
lf_model.fit_partial(fit_interact, user_features=user_feat, epochs=10, num_threads=2, verbose=True)

Epoch: 100%|██████████| 10/10 [00:02<00:00,  3.58it/s]


<lightfm.lightfm.LightFM at 0x7f186c2c3e20>

---
### Testing the model on the train set's `Product_Holding_B2` values, then re-training on them

In [26]:
from lightfm.evaluation import precision_at_k as MAP

In [43]:
# Evaluate precision@5 on the train users: their Product_Holding_B2 matrix is
# scored, and their Product_Holding_B1 matrix is passed as the known
# interactions. `MAP` is this notebook's import alias for precision_at_k.
# NOTE(review): the execution counts suggest this cell was last run after the
# re-training cell below; if so, the printed score comes from a model that has
# already been fitted on Product_Holding_B2. Re-run in order to confirm.

train_interact = sparse.csr_matrix(dfi)
test_interact = sparse.csr_matrix(dfr)
test_user_feat = sparse.csr_matrix(dfu.to_numpy(dtype=np.float32))

test_prec = MAP(
    model=lf_model,
    test_interactions=test_interact,
    train_interactions=train_interact,
    k=5,
    user_features=test_user_feat,
    num_threads=2,
    check_intersections=False,
)

print(test_prec.mean())

0.20912898


In [42]:
# Continue fitting the existing model on the train users' Product_Holding_B2
# interactions, reusing the user-feature matrix built for evaluation.

ret_interact = sparse.coo_matrix(dfr)
lf_model.fit_partial(
    interactions=ret_interact,
    user_features=test_user_feat,
    epochs=10,
    num_threads=2,
    verbose=True,
)

Epoch: 100%|██████████| 10/10 [00:01<00:00,  6.07it/s]


<lightfm.lightfm.LightFM at 0x7f186c2c3e20>

---
### Preparing submissions

In [60]:
# Candidate mask for the test users: 1 where a product is absent from their
# Product_Holding_B1 matrix, 0 where it is present.
pred_array = (tdi == 0).astype(np.float32)
pred_train_interact = sparse.csr_matrix(tdi)
pred_interact = sparse.csr_matrix(pred_array)
pred_user_feat = sparse.csr_matrix(tdu.to_numpy(dtype=np.float32))
# Ask the model for a rank on every candidate entry of the mask.
pred = lf_model.predict_rank(
    test_interactions=pred_interact,
    user_features=pred_user_feat,
    num_threads=2,
)

In [68]:
# Densify the sparse rank matrix and cast to integers. `np.int` was removed in
# NumPy 1.24; the builtin `int` is the type it aliased.
# NOTE(review): entries missing from the sparse result densify to 0, which
# looks the same as a rank of 0 — mask with pred_array if the two must be
# told apart.
ranks = pred.toarray().astype(int)

In [69]:
# Show the dense rank rows of the first five test users.
ranks[:5]

array([[ 2, 10, 12,  8,  7, 11,  4,  5,  0,  9,  6, 15,  0,  0, 14, 13,
        18, 20, 19, 21, 17, 16],
       [ 3,  1, 14,  4,  8, 11,  6,  5,  0, 10,  9, 17,  0,  0, 13, 12,
        21, 20, 15, 19, 16, 18],
       [ 4, 13, 14, 11,  6, 10,  3,  7,  0,  8,  5, 12,  2,  9, 19, 16,
         1, 15, 21, 18,  0, 17],
       [ 3, 12, 14, 13,  7, 10,  4,  8,  1,  6,  5, 11,  2,  0, 19, 16,
         0, 15, 21, 18, 20,  0],
       [ 2,  3, 14,  5,  9, 11,  6,  7,  0,  8, 10, 17,  1,  0, 13, 12,
        21, 20, 15, 19, 18, 16]])

In [57]:
# Show the candidate mask (logical NOT of tdi) for the first five test users.
pred_array[:5]

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 0., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 1., 1.]], dtype=float32)

In [59]:
# Show the first five rows of the test users' Product_Holding_B1 matrix (tdi), densified.
pred_train_interact.todense()[:5]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0., 0., 0.]], dtype=float32)