In [2]:
!pip install similaripy



In [3]:
import os
import numpy as np
import pandas as pd
import similaripy as sim
from scipy import sparse
from pathlib import Path


def list_files(path):
    return [file for file in os.listdir(path) if not file.startswith('.')]


HOME_DIR = Path('/home/jovyan')
OUTPUT_DIR = HOME_DIR / 'output'
INPUT_PATH = HOME_DIR / 'input'

In [None]:
# User-Wise Popularity Method
def uwPopMat(df_train, n_items, recency=0):
    """
    Calculate the user popularity matrix with the given recency window
    In:
        df_train: train Dataframe
        n_items: #items
    Return :
        User-wise Popularity matrix in csr sparse format
    """
    n_users = df_train.UID.unique().shape[0]
    if recency > 0:
        # Get the number of user baskets Bu
        BUCount = df_train.groupby(['UID'])['order'].max().reset_index(name='Bu')
        # Calculate the denominator which equal to Min(recency,Bu) for each user
        BUCount['denominator'] = np.minimum(BUCount['Bu'], 5)
        # Calculater the order index, form where we start counting item appearance in recent orders
        BUCount['startindex'] = np.maximum(BUCount['Bu'] - 5, 0)
        # Calcualte item appearance in recent orders
        tmp = pd.merge(BUCount, df_train, on='UID')[
            ['UID', 'PID', 'order', 'startindex']]
        tmp = tmp.loc[(tmp['order'] >= tmp['startindex']) == True].groupby(
            ['UID', 'PID'])['order'].count().reset_index(name='numerator')
        tmp = pd.merge(BUCount[['UID', 'denominator']], tmp, on='UID')
        # finally calculate the recency aware user-wise popularity
        tmp['Score'] = tmp['numerator'] / tmp['denominator']
    else:
        # Calculate user-wise popularity for each item
        BUCount = df_train.groupby(['UID'])['order'].max().reset_index(
            name='Bu')
        BUICount = df_train.groupby(['UID', 'PID'])['BID'].count().reset_index(
            name='Bui')
        tmp = pd.merge(BUICount, BUCount, on='UID')
        del BUICount
        tmp['Score'] = tmp['Bui'] / tmp['Bu']
        del BUCount
        # get the 3 columns needed to construct our user-wise Popularity matrix
    df_UWpop = tmp[['UID', 'PID', 'Score']]
    del tmp
    # Generate user-wise popularity matrix in COOrdinate format
    UWP_mat = sparse.coo_matrix(
        (df_UWpop.Score.values, (df_UWpop.UID.values, df_UWpop.PID.values)),
        shape=(n_users, n_items))
    del df_UWpop
    return sparse.csr_matrix(UWP_mat)


print(f'Files in input folder: {list_files(os.path.join(HOME_DIR, "input"))}')

train = pd.read_parquet(INPUT_PATH / 'train.parquet')

test_ids = pd.read_csv(INPUT_PATH / 'test-ids.csv')

train_data = train[train['id'].isin(test_ids['id'])]
products_count = train_data['cluster_id'].value_counts()
item_threshold = 1
df_order_products = train_data.loc[train_data['cluster_id'].isin(
        products_count[products_count >= item_threshold].index
)]
user_count = df_order_products['id'].value_counts()
user_dict = dict(zip(df_order_products['id'].unique(),
                     range(len(df_order_products['id'].unique()))))
df_order_products['UID'] = df_order_products['id'].map(user_dict)
product_dict = dict(zip(df_order_products['cluster_id'].unique(),
                        range(len(df_order_products['cluster_id'].unique()))))
df_order_products['PID'] = df_order_products['cluster_id'].map(product_dict)
df_order_products['BID'] = df_order_products['order_id']
n_items = df_order_products['PID'].unique().shape[0]
n_users = df_order_products.UID.unique().shape[0]
df_order_products = df_order_products.sort_values(
    by=['UID', 'completed_at']
).reset_index(drop=True)
TRAIN = df_order_products[['UID', 'BID', 'PID']]
s = TRAIN.groupby(['UID', 'BID']).agg(
    {'PID': lambda x: len(x.tolist())}).reset_index()

a = []
count = 1
for i in range(len(s) - 1):
    a += [count] * s.loc[i]['PID']
    if s.loc[i]['UID'] != s.loc[i + 1]['UID']:
        count = 1
    else:
        count += 1

a += [1] * 24

TRAIN['order'] = a

UWP_mat = uwPopMat(TRAIN, n_items, 25)

# 2- Popularity-based Collaborative Filtering

# 2.1- User Popularity-based CF (UP-CF)


def upcf(df_train, UWP_sparse, n_items, alpha=0.25, q=5, k=10):
    n_users = df_train['UID'].unique().shape[0]
    df_user_item = \
        df_train.groupby(['UID', 'PID']).size().reset_index(name="bool")[
            ['UID', 'PID']]
    # Generate the User_item matrix using the parse matrix COOrdinate format.
    userItem_mat = sparse.coo_matrix((np.ones((df_user_item.shape[0])), (
        df_user_item.UID.values, df_user_item.PID.values)),
                                     shape=(n_users, n_items))
    # Calculate the asymmetric similarity cosine matrix
    userSim = sim.asymmetric_cosine(sparse.csr_matrix(userItem_mat), alpha=alpha,
                                    k=k)
    # recommend k items to users
    user_recommendations = sim.dot_product(userSim.power(q), UWP_sparse, k=k)
    return user_recommendations


# 2.2- Item popularity-based Collaborative Filtring

def ipcf(df_train, UWP_sparse, n_items, alpha=0.25, q=5, k=10):
    # Construct the item-basket sparse matrix
    idMax_basket = df_train.BID.max() + 1
    item_basket_mat = sparse.coo_matrix((
        np.ones((df_train.shape[0]), dtype=int),
        (df_train.PID.values,
         df_train.BID.values)),
        shape=(n_items, idMax_basket))
    # Convert it to Compressed Sparse Row format to exploit its efficiency in arithmetic operations
    sparse_mat = sparse.csr_matrix(item_basket_mat)
    # Caculate the Asymetric Cosine Similarity matrix
    itemSimMat = sim.asymmetric_cosine(sparse_mat, None, alpha, k)
    # recommend k items to users
    user_recommendations = sim.dot_product(UWP_sparse, itemSimMat.power(q), k)
    return user_recommendations


user_recommendation = upcf(TRAIN, UWP_mat, n_items)


dict_user = dict(zip(user_dict.values(), user_dict.keys()))
dict_product = dict(zip(product_dict.values(), product_dict.keys()))

sepe = user_recommendation.toarray()

d_results = {}

for idx, row in enumerate(sepe):
    #     if idx % 100 == 0:
    #         print(f'Loaded {idx} values from {sepe.shape[0]}')
    indices = np.where(row != 0)[0]
    values = row[indices]
    mapper = dict(zip(range(len(indices)), indices))
    indices = values.argsort()[-20:][::-1]
    indices = [dict_product[mapper[x]] for x in indices]
    # ind = [dict_product[x] for x in indices]
    if len(indices) < 20:
        d_results[dict_user[idx]] = indices
    else:
        d_results[dict_user[idx]] = indices
        print(idx)

test_ids['target'] = test_ids['id'].apply(lambda x: ';'.join(map(str, d_results[x])))

test_ids.to_csv(OUTPUT_DIR / 'submission.csv', index=False)

Files in input folder: ['cluster-weights.parquet', 'train.parquet', 'clusters.parquet', 'sample-submission.csv', 'metric.py', 'history.parquet', 'test-ids.csv', 'centroids.parquet']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Computing:  61%|██████    | 197601/325088 [08:22<05:24, 393.38it/s]    

In [None]:
d_results