In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sps
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

In [3]:
## load the transaction history; later cells read its customerID, transactionDate and product_code columns
products = pd.read_csv(
    '/home/ashwin/Downloads/hackerearth/h1/recommend/products.csv'
)

In [4]:
## load the sample submission; its customerID column lists the customers we must predict for
sample = pd.read_csv(
    '/home/ashwin/Downloads/hackerearth/h1/recommend/sampleSubmission.csv'
)

In [5]:
## peek at the first five rows of the sample submission
sample.head(n=5)

Unnamed: 0,customerID,products
0,BBID_204221,"300663432,1000099534,1000475598,None,None,None..."
1,BBID_204254,"300663432,1000099534,1000475598,None,None,None..."
2,BBID_204830,"300663432,1000099534,1000475598,None,None,None..."
3,BBID_204880,"300663432,1000099534,1000475598,None,None,None..."
4,BBID_204910,"300663432,1000099534,1000475598,None,None,None..."


In [6]:
## is this customer part of the sample submission?
## NOTE: `value in series` tests the index labels, not the values, so compare against `.values`
'BBID_20410043' in sample['customerID'].values

False

In [7]:
## number of distinct (non-missing) customers in the transaction history
products['customerID'].nunique(dropna=True)

165055

In [8]:
## parse the timestamps, then keep only transactions dated 2016-12-01 or later
products['transactionDate'] = pd.to_datetime(products['transactionDate'])
mask = products['transactionDate'] >= '2016-12-01'
products = products[mask]

In [9]:
## order the transactions chronologically and renumber the rows.
## - build a new frame instead of sorting in place: `products` may be a filtered slice
##   of the raw frame, and in-place mutation of a slice triggers SettingWithCopy behaviour
## - mergesort is stable, so rows sharing a timestamp keep their original relative order
##   (later cells rely on row order when picking the "last" products per customer)
products = (
    products
    .sort_values('transactionDate', kind='mergesort')
    .reset_index(drop=True)
)

In [10]:
## take only those customers which are in sample submission file
products_2 = products[products['customerID'].isin(sample['customerID'])]

## remove rows whose product_code is missing (original note: "# 4", presumably the number of such rows)
## .copy() yields an independent frame, so the column assignment below does not write into a slice of `products`
products_2 = products_2[products_2['product_code'].notnull()].copy()

## convert type of product code
products_2['product_code'] = products_2['product_code'].astype(np.int64)

In [11]:
## keep just the two columns the recommender needs and renumber the rows
products_2 = products_2[['customerID', 'product_code']].reset_index(drop=True)

In [12]:
## customers listed in the sample submission that have no rows in products_2;
## a later cell fills their prediction with 'None' placeholders
misfit_customers = list(
    set(sample['customerID']).difference(products_2['customerID'])
)

In [13]:
## collapse to one row per customer holding the list of that customer's product codes
## (the lists follow the row order of products_2)
products_2 = (
    products_2
    .groupby('customerID')['product_code']
    .apply(lambda codes: codes.tolist())
    .reset_index()
)

In [14]:
## remove duplicate products WITHOUT disturbing the order of each list.
## (a set would scramble the order, which makes any later "take the last N" slice arbitrary)
## when a product appears several times we keep its last occurrence
products_2['product_code'] = products_2['product_code'].map(
    lambda codes: pd.Series(codes).drop_duplicates(keep='last').tolist()
)

In [15]:
## cap every customer's list at 20 products by keeping the trailing 20 entries
## (these are the most recent ones only if the list is in chronological order)
products_2['product_code'] = products_2['product_code'].map(
    lambda codes: codes[-20:]
)

In [16]:
## flatten the per-customer lists into two parallel lists:
## one entry per (customer, product) pair
customerIDs = []
product_codes = []

for customer, codes in zip(products_2['customerID'], products_2['product_code']):
    customerIDs.extend(np.repeat(customer, len(codes)))
    product_codes.extend(codes)

In [17]:
from sklearn.preprocessing import LabelEncoder

## one encoder per id space; later cells call inverse_transform on both to recover the original ids
lbl = LabelEncoder() ## for customers
lbl2 = LabelEncoder() ## for products

## replace the raw ids by consecutive integer codes
customerIDs = lbl.fit_transform(customerIDs)
product_codes = lbl2.fit_transform(product_codes)

In [18]:
## matrix dimensions: number of distinct encoded customers / products
n_unique_users = len(np.unique(customerIDs))
n_unique_products = len(np.unique(product_codes))

## coordinates of the non-zero cells: (encoded customer, encoded product)
row = customerIDs
col = product_codes

## every (customer, product) pair contributes a 1
vals = np.repeat(1, len(row))

## customers x products: which products each customer bought
user_product_matrix = sps.csr_matrix(
    (vals, (row, col)),
    shape=(n_unique_users, n_unique_products),
)

## products x products: for each pair of products, how many customers bought both
## (assuming each (customer, product) pair occurs once in the input lists)
product_cooccurence_matrix = user_product_matrix.T * user_product_matrix

## zero the diagonal: the count of a product with itself is of no interest
product_cooccurence_matrix.setdiag(
    np.repeat(0, product_cooccurence_matrix.shape[0])
)

In [19]:
## one row per encoded customer with the list of encoded products in column 'product_collection'
product_summary = pd.DataFrame({'customerID': customerIDs, 'product_code': product_codes})
product_summary = (
    product_summary
    .groupby('customerID')['product_code']
    .agg(lambda codes: codes.tolist())
    ## name the aggregated series explicitly: depending on the pandas version it comes back
    ## either unnamed (column 0 after reset_index) or named 'product_code', so renaming
    ## column 0 afterwards silently does nothing on newer versions
    .rename('product_collection')
    .reset_index()
)

In [20]:
def take_top_(x, max_len=20):
    """Return how many recommendations to draw per product for a customer owning ``x`` products.

    The aim is roughly ``max_len`` recommendations in total: a customer with
    ``max_len`` or more products gets 1 per product, a customer with fewer
    products gets ``round(max_len / x)`` per product.

    Parameters
    ----------
    x : int
        Number of products in the customer's collection.
    max_len : int, optional
        Target size of the recommendation list (default 20).

    Returns
    -------
    int
        Recommendations to take per product; 0 when ``x`` is not positive
        (nothing to base a recommendation on).
    """
    ## an empty collection used to raise ZeroDivisionError; report "take nothing" instead
    if x <= 0:
        return 0
    if x >= max_len:
        return 1
    ## float() keeps this a true division on Python 2 as well
    return int(np.round(float(max_len) / x))
        
## number of products recorded for each customer
product_summary['len_collection'] = product_summary['product_collection'].apply(len)

## recommendations to draw per owned product (see take_top_),
## e.g. a customer with 20 products gets the single top product for each of them
product_summary['take_top'] = product_summary['len_collection'].map(take_top_)

In [21]:
def recommend_affinity(user):
    """Recommend products that were most often bought together with the user's products.

    Reads the module-level ``product_summary`` frame (columns ``customerID``,
    ``product_collection``, ``take_top``) and ``product_cooccurence_matrix``.

    For every product in the user's collection the ``take_top`` products with
    the highest positive co-occurrence count are selected (ties go to the
    smaller product index). The per-product rankings are then interleaved
    round-robin: every product's best companion first, then every second-best,
    and so on.

    Parameters
    ----------
    user : int
        Encoded customer id as stored in ``product_summary['customerID']``.

    Returns
    -------
    list of int
        Encoded product ids; may contain duplicates. Empty when the user is
        unknown, has no products, or ``take_top`` is 0.
    """
    ## look the customer up once instead of filtering the whole frame for every field
    user_rows = product_summary[product_summary['customerID'] == user]
    if user_rows.empty:
        return []

    products_ = user_rows['product_collection'].iloc[0]  # list of encoded product ids
    n_top = int(user_rows['take_top'].iloc[0])

    if n_top <= 0 or len(products_) == 0:
        return []

    ranked_per_product = []
    for tt in products_:
        ## getrow() returns a 1 x n_products sparse row; working on its stored entries
        ## avoids densifying a full row for every product
        cooc_row = product_cooccurence_matrix.getrow(tt).tocsr()
        counts = cooc_row.data
        candidates = cooc_row.indices

        ## only products that were actually bought together qualify; this also skips
        ## explicitly stored zeros such as the zeroed diagonal
        bought_together = counts > 0
        counts = counts[bought_together]
        candidates = candidates[bought_together]

        if counts.size == 0:
            ranked_per_product.append([])
            continue

        ## highest count first; ties broken by product index so results are reproducible
        order = np.lexsort((candidates, -counts))[:n_top]
        ranked_per_product.append(candidates[order].tolist())

    ## round-robin over the per-product rankings (lists may differ in length)
    recs = []
    for rank in range(n_top):
        for ranked in ranked_per_product:
            if rank < len(ranked):
                recs.append(ranked[rank])

    return recs

In [22]:
## customers from the sample submission that do have purchase history, in sample order
## (a set makes each membership test O(1); scanning the misfit list was quadratic overall)
misfit_lookup = set(misfit_customers)
unique_customers = [
    customer for customer in sample['customerID']
    if customer not in misfit_lookup
]

## encode with the ALREADY fitted customer encoder: transform() reuses the mapping the
## matrices were built with and fails loudly on an unseen id, whereas fit_transform()
## would silently refit `lbl` and could shift every code
unique_customers = lbl.transform(unique_customers)

In [23]:
from collections import defaultdict

## encoded customer id -> list of recommended encoded product ids
out_dict = defaultdict(list)

## encoded customers for which no recommendation could be produced
nulls = []

for user in tqdm(unique_customers):
    rec = recommend_affinity(user)
    out_dict[user] = rec
    if not rec:
        nulls.append(user)

  0%|          | 0/25754 [00:00<?, ?it/s]

  0%|          | 1/25754 [00:00<1:22:11,  5.22it/s]

  0%|          | 2/25754 [00:00<3:00:12,  2.38it/s]

  0%|          | 3/25754 [00:01<3:22:08,  2.12it/s]

  0%|          | 4/25754 [00:01<3:30:04,  2.04it/s]

  0%|          | 5/25754 [00:02<3:30:36,  2.04it/s]

  0%|          | 6/25754 [00:03<3:35:14,  1.99it/s]

  0%|          | 7/25754 [00:03<3:35:33,  1.99it/s]

  0%|          | 8/25754 [00:03<3:17:22,  2.17it/s]

  0%|          | 9/25754 [00:04<3:18:23,  2.16it/s]

  0%|          | 10/25754 [00:04<3:10:14,  2.26it/s]

  0%|          | 11/25754 [00:04<3:12:05,  2.23it/s]

  0%|          | 12/25754 [00:05<3:00:03,  2.38it/s]

  0%|          | 13/25754 [00:05<3:03:24,  2.34it/s]

  0%|          | 14/25754 [00:06<3:06:39,  2.30it/s]

  0%|          | 15/25754 [00:06<3:00:22,  2.38it/s]

  0%|          | 16/25754 [00:06<3:05:10,  2.32it/s]

  0%|          | 17/25754 [00:07<3:04:53,  2.32it/s]

  0%|          | 18/25754 [00:07<3:06:20,  2.30it/s]

  0%|          | 19/25754 [00:08<3:08:24,  2.28it/s]

  0%|          | 20/25754 [00:08<3:09:35,  2.26it/s]

  0%|          | 21/25754 [00:09<3:10:53,  2.25it/s]

  0%|          | 22/25754 [00:09<3:08:28,  2.28it/s]

  0%|          | 23/25754 [00:10<3:10:32,  2.25it/s]

  0%|          | 24/25754 [00:10<3:11:48,  2.24it/s]

  0%|          | 25/25754 [00:11<3:13:38,  2.21it/s]

  0%|          | 26/25754 [00:11<3:11:57,  2.23it/s]

  0%|          | 27/25754 [00:12<3:12:20,  2.23it/s]

  0%|          | 28/25754 [00:12<3:11:35,  2.24it/s]

  0%|          | 29/25754 [00:13<3:13:07,  2.22it/s]

  0%|          | 30/25754 [00:13<3:13:46,  2.21it/s]

  0%|          | 32/25754 [00:14<3:09:38,  2.26it/s]

  0%|          | 33/25754 [00:14<3:11:17,  2.24it/s]

  0%|          | 34/25754 [00:15<3:11:46,  2.24it/s]

  0%|          | 35/25754 [00:15<3:07:39,  2.28it/s]

  0%|          | 36/25754 [00:15<3:08:24,  2.28it/s]

  0%|          | 37/25754 [00:16<3:05:37,  2.31it/s]

  0%|          | 38/25754 [00:16<3:06:21,  2.30it/s]

  0%|          | 39/25754 [00:16<3:04:45,  2.32it/s]

  0%|          | 40/25754 [00:17<3:05:50,  2.31it/s]

  0%|          | 41/25754 [00:17<3:06:41,  2.30it/s]

  0%|          | 43/25754 [00:18<3:04:39,  2.32it/s]

  0%|          | 44/25754 [00:18<3:02:04,  2.35it/s]

  0%|          | 45/25754 [00:19<3:03:27,  2.34it/s]

  0%|          | 46/25754 [00:19<3:04:13,  2.33it/s]

  0%|          | 47/25754 [00:20<3:04:41,  2.32it/s]

  0%|          | 48/25754 [00:20<3:05:17,  2.31it/s]

  0%|          | 49/25754 [00:21<3:06:22,  2.30it/s]

  0%|          | 50/25754 [00:21<3:07:03,  2.29it/s]

  0%|          | 51/25754 [00:22<3:05:18,  2.31it/s]

  0%|          | 52/25754 [00:22<3:06:29,  2.30it/s]

  0%|          | 53/25754 [00:23<3:08:13,  2.28it/s]

  0%|          | 54/25754 [00:23<3:07:14,  2.29it/s]

  0%|          | 55/25754 [00:24<3:09:04,  2.27it/s]

  0%|          | 57/25754 [00:24<3:06:58,  2.29it/s]

  0%|          | 58/25754 [00:25<3:07:39,  2.28it/s]

  0%|          | 59/25754 [00:25<3:08:20,  2.27it/s]

  0%|          | 60/25754 [00:26<3:09:32,  2.26it/s]

  0%|          | 61/25754 [00:27<3:10:47,  2.24it/s]

  0%|          | 62/25754 [00:27<3:11:27,  2.24it/s]

  0%|          | 63/25754 [00:28<3:11:44,  2.23it/s]

  0%|          | 64/25754 [00:28<3:11:57,  2.23it/s]

  0%|          | 65/25754 [00:29<3:12:28,  2.22it/s]

  0%|          | 66/25754 [00:29<3:12:29,  2.22it/s]

  0%|          | 67/25754 [00:30<3:12:59,  2.22it/s]

  0%|          | 68/25754 [00:30<3:13:41,  2.21it/s]

  0%|          | 69/25754 [00:31<3:13:42,  2.21it/s]

  0%|          | 70/25754 [00:31<3:13:59,  2.21it/s]

  0%|          | 71/25754 [00:32<3:13:44,  2.21it/s]

  0%|          | 72/25754 [00:32<3:13:21,  2.21it/s]

  0%|          | 73/25754 [00:32<3:12:05,  2.23it/s]

  0%|          | 74/25754 [00:32<3:10:48,  2.24it/s]

  0%|          | 75/25754 [00:33<3:10:44,  2.24it/s]

  0%|          | 76/25754 [00:33<3:09:04,  2.26it/s]

  0%|          | 77/25754 [00:33<3:08:46,  2.27it/s]

  0%|          | 78/25754 [00:34<3:09:07,  2.26it/s]

  0%|          | 79/25754 [00:35<3:09:50,  2.25it/s]

  0%|          | 81/25754 [00:35<3:06:02,  2.30it/s]

  0%|          | 82/25754 [00:35<3:04:35,  2.32it/s]

  0%|          | 83/25754 [00:35<3:04:55,  2.31it/s]

  0%|          | 84/25754 [00:36<3:03:39,  2.33it/s]

  0%|          | 85/25754 [00:36<3:04:06,  2.32it/s]

  0%|          | 86/25754 [00:37<3:04:14,  2.32it/s]

  0%|          | 87/25754 [00:37<3:04:51,  2.31it/s]

  0%|          | 88/25754 [00:38<3:05:29,  2.31it/s]

  0%|          | 89/25754 [00:38<3:05:51,  2.30it/s]

  0%|          | 90/25754 [00:39<3:06:22,  2.30it/s]

  0%|          | 91/25754 [00:39<3:06:35,  2.29it/s]

  0%|          | 93/25754 [00:40<3:05:12,  2.31it/s]

  0%|          | 94/25754 [00:40<3:05:33,  2.30it/s]

  0%|          | 95/25754 [00:41<3:05:58,  2.30it/s]

  0%|          | 96/25754 [00:41<3:06:18,  2.30it/s]

  0%|          | 97/25754 [00:42<3:06:53,  2.29it/s]

  0%|          | 98/25754 [00:42<3:06:51,  2.29it/s]

  0%|          | 99/25754 [00:43<3:07:01,  2.29it/s]

  0%|          | 101/25754 [00:43<3:06:09,  2.30it/s]

  0%|          | 102/25754 [00:44<3:06:00,  2.30it/s]

  0%|          | 103/25754 [00:44<3:06:18,  2.29it/s]

  0%|          | 104/25754 [00:45<3:06:26,  2.29it/s]

  0%|          | 105/25754 [00:45<3:06:49,  2.29it/s]

  0%|          | 106/25754 [00:46<3:06:04,  2.30it/s]

  0%|          | 107/25754 [00:46<3:05:32,  2.30it/s]

  0%|          | 108/25754 [00:46<3:04:52,  2.31it/s]

  0%|          | 109/25754 [00:47<3:05:25,  2.30it/s]

  0%|          | 110/25754 [00:47<3:05:11,  2.31it/s]

  0%|          | 111/25754 [00:48<3:05:20,  2.31it/s]

  0%|          | 112/25754 [00:48<3:05:53,  2.30it/s]

  0%|          | 113/25754 [00:49<3:05:20,  2.31it/s]

  0%|          | 114/25754 [00:49<3:04:25,  2.32it/s]

  0%|          | 115/25754 [00:49<3:04:45,  2.31it/s]

  0%|          | 116/25754 [00:50<3:04:57,  2.31it/s]

  0%|          | 117/25754 [00:50<3:05:26,  2.30it/s]

  0%|          | 118/25754 [00:51<3:05:40,  2.30it/s]

  0%|          | 119/25754 [00:51<3:05:27,  2.30it/s]

  0%|          | 120/25754 [00:52<3:05:45,  2.30it/s]

  0%|          | 121/25754 [00:52<3:06:18,  2.29it/s]

  0%|          | 122/25754 [00:53<3:06:40,  2.29it/s]

  0%|          | 123/25754 [00:53<3:07:02,  2.28it/s]

  0%|          | 124/25754 [00:54<3:07:05,  2.28it/s]

  0%|          | 125/25754 [00:54<3:07:25,  2.28it/s]

  0%|          | 126/25754 [00:54<3:06:25,  2.29it/s]

  0%|          | 127/25754 [00:55<3:06:52,  2.29it/s]

  0%|          | 128/25754 [00:55<3:06:03,  2.30it/s]

  1%|          | 129/25754 [00:56<3:05:51,  2.30it/s]

  1%|          | 131/25754 [00:56<3:04:34,  2.31it/s]

  1%|          | 132/25754 [00:57<3:04:56,  2.31it/s]

  1%|          | 133/25754 [00:57<3:05:15,  2.30it/s]

  1%|          | 134/25754 [00:58<3:05:37,  2.30it/s]

  1%|          | 135/25754 [00:58<3:05:42,  2.30it/s]

  1%|          | 136/25754 [00:59<3:06:02,  2.30it/s]

  1%|          | 137/25754 [00:59<3:06:20,  2.29it/s]

  1%|          | 138/25754 [01:00<3:06:28,  2.29it/s]

  1%|          | 139/25754 [01:00<3:06:28,  2.29it/s]

  1%|          | 140/25754 [01:01<3:06:46,  2.29it/s]

  1%|          | 141/25754 [01:01<3:05:52,  2.30it/s]

  1%|          | 142/25754 [01:01<3:04:56,  2.31it/s]

  1%|          | 143/25754 [01:02<3:05:09,  2.31it/s]

  1%|          | 144/25754 [01:02<3:05:11,  2.30it/s]

  1%|          | 145/25754 [01:03<3:05:32,  2.30it/s]

  1%|          | 146/25754 [01:03<3:05:58,  2.30it/s]

  1%|          | 147/25754 [01:04<3:06:15,  2.29it/s]

  1%|          | 148/25754 [01:04<3:06:22,  2.29it/s]

  1%|          | 149/25754 [01:04<3:05:30,  2.30it/s]

  1%|          | 150/25754 [01:05<3:05:43,  2.30it/s]

  1%|          | 151/25754 [01:05<3:05:58,  2.29it/s]

  1%|          | 152/25754 [01:06<3:05:30,  2.30it/s]

  1%|          | 153/25754 [01:06<3:05:41,  2.30it/s]

  1%|          | 154/25754 [01:06<3:05:24,  2.30it/s]

  1%|          | 155/25754 [01:07<3:05:44,  2.30it/s]

  1%|          | 156/25754 [01:08<3:06:06,  2.29it/s]

  1%|          | 157/25754 [01:08<3:06:19,  2.29it/s]

  1%|          | 158/25754 [01:09<3:06:33,  2.29it/s]

  1%|          | 159/25754 [01:09<3:06:53,  2.28it/s]

KeyboardInterrupt: 

In [None]:
## get unique items per customer, keeping the ranking order of the recommendations
## (a set would shuffle them, so the later cut to 20 items would keep arbitrary ones)
from collections import OrderedDict, defaultdict

out_dict_2 = defaultdict(list)

for k, v in out_dict.items():
    ## fromkeys keeps the first occurrence of every product, in order
    out_dict_2[k] = list(OrderedDict.fromkeys(v))

In [None]:
## lookup table: encoded customer id -> original customerID string
customer_codes = list(lbl.inverse_transform(customerIDs))
customer_maps = {
    encoded: original
    for encoded, original in zip(list(customerIDs), list(customer_codes))
}

In [None]:
## same recommendations as out_dict_2, keyed by the original customerID instead of its code
out_dict_3 = defaultdict(list)

for k in out_dict_2.keys():
    original_id = customer_maps[k]
    out_dict_3[original_id] = out_dict_2[k]

In [None]:
## customers without purchase history get twenty 'None' placeholders
mis_dict = defaultdict(list)

for i in misfit_customers:
    placeholder = np.repeat('None', 20)
    mis_dict[i] = list(placeholder)

In [None]:
## add the placeholder entries of the history-less customers to the real recommendations
for misfit_customer, placeholder_list in mis_dict.items():
    out_dict_3[misfit_customer] = placeholder_list

In [None]:
## one row per customer: the customerID and its list of recommendations
submission = pd.DataFrame(
    [(customer, recs) for customer, recs in out_dict_3.items()],
    columns=['customerID', 'products'],
)

In [None]:
## lookup table: encoded product id -> original product code
product_codes_inv = list(lbl2.inverse_transform(product_codes))
product_maps = {
    encoded: original
    for encoded, original in zip(list(product_codes), list(product_codes_inv))
}

In [None]:
## translate every encoded product id back to its original code;
## entries without a mapping (e.g. the 'None' placeholders) become None
submission['products'] = submission['products'].map(
    lambda encoded_items: list(map(product_maps.get, encoded_items))
)

In [None]:
## force every recommendation list to exactly `max_len` entries:
## longer lists are cut, shorter ones are padded with the string 'None'.
## Done with a plain function + map because assigning a list to a single cell through
## .loc is not reliable across pandas versions (the list may be treated as several values).
max_len = 20


def _pad_or_truncate(items, size=max_len):
    """Return a new list holding the first `size` items, padded with 'None' up to `size`."""
    trimmed = list(items)[:size]
    return trimmed + ['None'] * (size - len(trimmed))


submission['products'] = submission['products'].map(_pad_or_truncate)

In [None]:
## serialise every list as one comma-separated string
submission['products'] = submission['products'].map(
    lambda items: ','.join(map(str, items))
)

In [None]:
## write the submission to the current working directory, without the DataFrame index
submission.to_csv(path_or_buf='sub_0001.csv', index=False)