In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm_notebook as tqdm

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

In [2]:
# Root folder that holds every input CSV used by this notebook.
files_dir = 'data/'


def load_table(relative_path):
    """Read one CSV whose path is given relative to ``files_dir``."""
    return pd.read_csv(files_dir + relative_path)


# Raw competition tables (read in the same order as before).
orders = load_table('kaggle_tab_1345/tab_1_orders.csv')
users = load_table('kaggle_tab_1345/tab_4_user_profiles.csv')
product_properties = load_table('kaggle_tab_1345/tab_5_product_properties.csv')
categories = load_table('kaggle_tab_1345/tab_3_categories.csv')
submission = load_table('sample_submission.csv')
cities = load_table('tab_6_city.csv')

In [4]:
# Order lines table; later cells read its order_id, user_id and product_id columns.
merged_data_path = files_dir + 'merged_data.csv'
data = pd.read_csv(merged_data_path)

In [5]:
# Lookup table: order_id -> retailer, built from the orders table.
# If an order_id appears more than once, the last occurrence wins.
order_id2retailer = {
    order_key: retailer_name
    for order_key, retailer_name in zip(orders.order_id.values, orders.retailer.values)
}

In [6]:
# The 13 retailers with the most orders; every structure below is keyed by them.
top_retailers = set(orders.retailer.value_counts().head(13).index)

# Per-retailer bidirectional index maps (raw id <-> dense matrix index).
# They start empty and are filled while scanning the order lines.
retailer2item_to_id = {name: {} for name in top_retailers}
retailer2id_to_item = {name: {} for name in top_retailers}
retailer2user_to_id = {name: {} for name in top_retailers}
retailer2id_to_user = {name: {} for name in top_retailers}

# Per-retailer dense user x item matrix, allocated at the largest possible
# size (all users x all distinct products) and filled with zeros.
cf_matrix_shape = (len(users), product_properties.product_id.nunique())
retailer2cf_matrix = {name: np.zeros(cf_matrix_shape) for name in top_retailers}

In [9]:
# Fill the per-retailer user x item interaction matrices.
#
# Every order line whose order is known (present in order_id2retailer) and
# belongs to a top retailer marks matrix[uid, iid] = 1, where uid / iid are
# dense indices assigned in order of first appearance for that retailer.
#
# Only the three needed columns are iterated, and with itertuples() rather
# than iterrows(): iterrows() builds a Series per row and does not preserve
# dtypes across the row, so integer ids could be upcast to float and later
# be written out as e.g. "123.0".
known_order_ids = list(order_id2retailer.keys())
order_lines = data.loc[
    np.isin(data.order_id.values, known_order_ids),
    ['order_id', 'user_id', 'product_id'],
]

for order_id, user_id, item_id in tqdm(
    order_lines.itertuples(index=False, name=None), total=len(order_lines)
):
    retailer = order_id2retailer[order_id]

    if retailer not in top_retailers:
        continue

    # Assign the next free dense index the first time a user is seen.
    user_to_id = retailer2user_to_id[retailer]
    if user_id not in user_to_id:
        new_uid = len(user_to_id)
        user_to_id[user_id] = new_uid
        retailer2id_to_user[retailer][new_uid] = user_id
    uid = user_to_id[user_id]

    # Same for items.
    item_to_id = retailer2item_to_id[retailer]
    if item_id not in item_to_id:
        new_iid = len(item_to_id)
        item_to_id[item_id] = new_iid
        retailer2id_to_item[retailer][new_iid] = item_id
    iid = item_to_id[item_id]

    # Binary implicit feedback: the user bought the item at this retailer.
    retailer2cf_matrix[retailer][uid, iid] = 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [35]:
# Shrink every matrix to the block that was actually indexed: rows up to the
# number of users seen and columns up to the number of items seen.
for retailer in top_retailers:
    seen_users = len(retailer2user_to_id[retailer])
    seen_items = len(retailer2item_to_id[retailer])
    full_matrix = retailer2cf_matrix[retailer]
    retailer2cf_matrix[retailer] = full_matrix[:seen_users, :seen_items]

In [28]:
# Per-retailer SVD results, keyed by retailer; starts empty.
retailer2svd = dict()

In [36]:
# Truncated SVD of each retailer's interaction matrix.
#
# The matrix is converted to CSR and stored back under the same key (a later
# cell calls .toarray() on it). The number of factors is one per 30 users,
# clamped to the range svds accepts with its default solver:
# 1 <= k <= min(n_users, n_items) - 1. Without the clamp, a retailer with
# fewer than 30 users (k == 0) or with few items relative to users
# (k >= n_items) makes svds raise.
for retailer in tqdm(top_retailers):
    sparse_matrix = csr_matrix(retailer2cf_matrix[retailer])
    retailer2cf_matrix[retailer] = sparse_matrix

    n_users, n_items = sparse_matrix.shape
    max_factors = min(n_users, n_items) - 1
    n_factors = max(1, min(n_users // 30, max_factors))

    U, sigma, Vt = svds(sparse_matrix, k=n_factors)
    retailer2svd[retailer] = (U, sigma, Vt)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [119]:
# Replace each stored sparse matrix with its dense array form so that rows
# can be indexed directly later on.
for retailer in tqdm(top_retailers):
    dense_matrix = retailer2cf_matrix[retailer].toarray()
    retailer2cf_matrix[retailer] = dense_matrix

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [62]:
# Replace each stored (U, sigma, Vt) triple with the reconstructed matrix
# U . diag(sigma) . Vt, i.e. the low-rank score for every (user, item) pair.
# NOTE: this overwrites the triple, so the cell cannot be run twice in a row.
for retailer in tqdm(top_retailers):
    U, sigma, Vt = retailer2svd[retailer]
    scaled_user_factors = U.dot(np.diag(sigma))
    retailer2svd[retailer] = scaled_user_factors.dot(Vt)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [123]:
# Template submission (provides the list of user ids to predict for).
sample_submission_path = 'data/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

# Baseline predictions that the collaborative-filtering picks are blended into.
baseline_submission_path = 'subm_most_popular_per_user_plus_fill.csv'
emils_submission = pd.read_csv(baseline_submission_path)

# Independent copy of the template; predictions are written into this frame.
my_submission = submission.copy(deep=True)

In [137]:
# Blend the baseline submission with collaborative-filtering (CF) picks.
#
# For every user in the template submission:
#   * by default the baseline prediction is written unchanged;
#   * if the user's most frequent retailer is CF_RETAILER, it accounts for
#     more than MIN_RETAILER_SHARE of the user's orders, and the user has a
#     row in that retailer's matrix, the baseline items after the first
#     N_BASELINE_KEPT are replaced by up to N_CF_ITEMS highest-scored items
#     the user has not bought yet.
#
# Fixes compared with the previous version of this cell:
#   * users without any orders now really receive the baseline prediction
#     (a stray `continue` used to make that assignment unreachable);
#   * the CF lookup uses `top_retailer` for the interaction matrix and the
#     id -> item map instead of a `retailer` variable left over from an
#     earlier loop, which pointed at an arbitrary retailer;
#   * the search for unseen items is bounded by the number of items, so it
#     can no longer run past the end of the ranking; missing CF slots are
#     topped up with the next baseline items.
CF_RETAILER = 'METRO'      # only users who mostly shop here get CF picks
N_BASELINE_KEPT = 40       # leading baseline items that are always kept
N_CF_ITEMS = 10            # number of CF picks appended after them
MIN_RETAILER_SHARE = 0.5   # required share of the user's orders at the retailer

for i, user_id in enumerate(tqdm(submission.Id)):
    baseline_row = emils_submission[emils_submission.id == user_id]
    emils_prediction = list(map(int, baseline_row['Predicted'].values[0].split(' ')))

    # Default outcome for every path that does not reach the CF branch.
    my_submission.iloc[i, 1] = ' '.join(map(str, emils_prediction))

    orders_by_user = orders[orders.user_id == user_id]
    if len(orders_by_user) == 0:
        continue

    # value_counts() is sorted by count, so the first entry is the top retailer.
    retailer_counts = orders_by_user.retailer.value_counts()
    if retailer_counts.empty:
        # Every retailer value was missing for this user.
        continue
    top_retailer = retailer_counts.index[0]
    top_retailer_count = retailer_counts.iloc[0]

    if top_retailer != CF_RETAILER or top_retailer not in top_retailers:
        continue
    if top_retailer_count / len(orders_by_user) <= MIN_RETAILER_SHARE:
        continue
    if user_id not in retailer2user_to_id[top_retailer]:
        continue

    uid = retailer2user_to_id[top_retailer][user_id]
    items_ratings = retailer2svd[top_retailer][uid]
    users_items = retailer2cf_matrix[top_retailer][uid]
    id_to_item = retailer2id_to_item[top_retailer]

    # Walk the items from highest to lowest reconstructed score and keep the
    # first N_CF_ITEMS that the user has no recorded purchase of.
    cf_items = []
    for iid in np.argsort(items_ratings)[::-1]:
        if len(cf_items) == N_CF_ITEMS:
            break
        if users_items[iid] == 0:
            cf_items.append(id_to_item[iid])

    preds = emils_prediction[:N_BASELINE_KEPT]
    preds.extend(cf_items)
    # If fewer than N_CF_ITEMS unseen items exist, fill the gap from the baseline.
    missing = N_CF_ITEMS - len(cf_items)
    preds.extend(emils_prediction[N_BASELINE_KEPT:N_BASELINE_KEPT + missing])

    my_submission.iloc[i, 1] = ' '.join(map(str, preds))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [138]:
# Persist the blended predictions without the DataFrame index column.
output_path = 'emils_with_10cf.csv'
my_submission.to_csv(output_path, index=False)