In [20]:
#!pip install --upgrade implicit
import numpy as np
import pandas as pd
import pickle
import os; os.environ['OPENBLAS_NUM_THREADS']='1'
from implicit.als import AlternatingLeastSquares as ALS
from implicit.evaluation import mean_average_precision_at_k
from scipy.sparse import coo_matrix

In [2]:
# Loading Data frames
base_path = '/Users/saipavan/Downloads/Datazoids/'
#base_path = '/Users/bhadrashah/Downloads/'
train_csv = f'{base_path}transactions_train.csv.zip'
users_path = f'{base_path}customers.csv.zip'
articles_path = f'{base_path}articles.csv.zip'
df = pd.read_csv(train_csv, dtype={'article_id': str}, parse_dates=['t_dat'])
users = pd.read_csv(users_path)
articles = pd.read_csv(articles_path, dtype={'article_id': str})


In [3]:
# autoincrementing ids starting from 0 to both users and items
all_users = users['customer_id'].unique().tolist()
all_items = articles['article_id'].unique().tolist()
user_ids_dict = dict(list(enumerate(all_users)))
item_ids_dict = dict(list(enumerate(all_items)))
users_map = {user_uid: user_idx for user_idx, user_uid in user_ids_dict.items()}
items_map = {item_uid: item_idx for item_idx, item_uid in item_ids_dict.items()}
df['user_id'] = df['customer_id'].map(users_map)
df['item_id'] = df['article_id'].map(items_map)

del users, articles, user_ids_dict, item_ids_dict

In [4]:
# creating a sparse matrix using coo_matrix function in users x item format.
rows = df['user_id'].values
columns = df['item_id'].values
data = np.ones(df.shape[0])
coo_train_matrix = coo_matrix((data, (rows, columns)), shape=(len(all_users), len(all_items)))
coo_train_matrix

<1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
	with 31788324 stored elements in COOrdinate format>

In [5]:
# basic check for model and data compatability
model = ALS(iterations=2,factors=10)
model.fit(coo_train_matrix)



  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Functions 
def split_dataset(df, validation_days=7):
    # Split a pandas dataframe into training and validation data, based on validation_days
    data_split_time = df['t_dat'].max() - pd.Timedelta(validation_days)
    train_df = df[df['t_dat'] < data_split_time]
    val_df = df[df['t_dat'] >= data_split_time]
    return train_df, val_df

def user_item_coo_matrix(df):
    # Turn a dataframe with transactions into a COO sparse matrix of items x users format
    rows = df['user_id'].values
    columns = df['item_id'].values
    data = np.ones(df.shape[0])
    coo_m = coo_matrix((data, (rows, columns)), shape=(len(all_users), len(all_items)))
    return coo_m


def csr_matrices(df, validation_days=7):
   
    train_df, val_df = split_dataset(df, validation_days=validation_days)
    coo_train_matrix = user_item_coo_matrix(train_df)
    coo_value = user_item_coo_matrix(val_df)

    csr_train_matrix = coo_train_matrix.tocsr()
    csr_value = coo_value.tocsr()
    
    return {'coo_train_matrix': coo_train_matrix, 'csr_train_matrix': csr_train_matrix, 'csr_value': csr_value}


def validation(matrices_temp, iterations=20, factors=200, regularization=0.01):
    csr_train_matrix=matrices_temp['csr_train_matrix']
    csr_value =  matrices_temp['csr_value']
    coo_train_matrix=matrices_temp['coo_train_matrix']
    model = ALS(factors=factors,  regularization=regularization, random_state=42,iterations=iterations)
    model.fit(coo_train_matrix)
    map_12 = mean_average_precision_at_k(model, csr_train_matrix, csr_value, K=12, num_threads=4)
    print(f"Factors: {factors:>3}")
    print(f"Iterations: {iterations:>2}" )
    print(f"Regularization: {regularization:4.3f} ")
    print(f"MAP 12: {map12:6.9f}")
    return map_12




In [7]:
new_df = df[df['t_dat'] > '2020-08-21']
matrices =  csr_matrices(new_df)

In [8]:
%%time
# alternating the parameters to get the best model parameters
best_map = 0
for factors in [40, 50, 60, 100, 200, 500]:
    for iterations in [3, 12, 15, 20]:
        map12 = validation(matrices, factors, iterations, 0.01, show_progress=False)
        if map12 > best_map:
            best_map = map12
            best_params = {'factors': factors, 'iterations': iterations, 'regularization': 0.01}
            print(f"Best MAP found. The new best parameters are: {best_params}")



Factors:  40 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00392
Best MAP found. The new best parameters are: {'factors': 40, 'iterations': 3, 'regularization': 0.01}




Factors:  40 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00533
Best MAP found. The new best parameters are: {'factors': 40, 'iterations': 12, 'regularization': 0.01}




Factors:  40 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00528




Factors:  40 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00532




Factors:  50 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00461




Factors:  50 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00543
Best MAP found. The new best parameters are: {'factors': 50, 'iterations': 12, 'regularization': 0.01}




Factors:  50 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00534




Factors:  50 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00528




Factors:  60 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00455




Factors:  60 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00575
Best MAP found. The new best parameters are: {'factors': 60, 'iterations': 12, 'regularization': 0.01}




Factors:  60 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00576
Best MAP found. The new best parameters are: {'factors': 60, 'iterations': 15, 'regularization': 0.01}




Factors:  60 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00578
Best MAP found. The new best parameters are: {'factors': 60, 'iterations': 20, 'regularization': 0.01}




Factors: 100 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00536




Factors: 100 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00621
Best MAP found. The new best parameters are: {'factors': 100, 'iterations': 12, 'regularization': 0.01}




Factors: 100 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00621




Factors: 100 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00621
Best MAP found. The new best parameters are: {'factors': 100, 'iterations': 20, 'regularization': 0.01}




Factors: 200 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00653
Best MAP found. The new best parameters are: {'factors': 200, 'iterations': 3, 'regularization': 0.01}




Factors: 200 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00649




Factors: 200 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00641




Factors: 200 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00639




Factors: 500 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00698
Best MAP found. The new best parameters are: {'factors': 500, 'iterations': 3, 'regularization': 0.01}




Factors: 500 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00609




Factors: 500 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00590




Factors: 500 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00583
CPU times: user 4h 57min 50s, sys: 45min 32s, total: 5h 43min 23s
Wall time: 47min 57s


In [9]:
map12 = validation(matrices, factors=200, iterations=3, regularization=0.01, show_progress=True)
print(map12)



  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10528 [00:00<?, ?it/s]

Factors: 200 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00653
0.006525713737900545


In [10]:
del matrices

In [11]:
# Training over the full dataset
coo_train_matrix = user_item_coo_matrix(df)
csr_train_matrix = coo_train_matrix.tocsr()

In [12]:
best_params = {"factors":200, "iterations":15, "regularization":0.01, "show_progress":True}

In [14]:
model = ALS(factors=200, regularization=0.01,iterations=15, random_state=50)
model.fit(coo_train_matrix, show_progress=True)
def output(model, csr_train_matrix, file_name):
    predicted_value = []
    batch_size = 2000
    users_indices = np.arange(len(all_users))
    for start_idx in range(0, len(users_indices), batch_size):
        batch = users_indices[start_idx : start_idx + batch_size]
        ids, scores = model.recommend(batch, csr_train_matrix[batch], N=6, filter_already_liked_items=False)
        for i, userid in enumerate(batch):
            customer_id = all_users[userid]
            user_items = ids[i]
            article_ids = [all_items[item_id] for item_id in user_items]
            preddicted_value.append((customer_id, ' '.join(article_ids)))

    data_predicted = pd.DataFrame(predicted_value, columns=['customer_id', 'prediction'])
    data_predicted.to_csv(file_name, index=False)
    
    display(data_predicted.head())
    print(data_predicted.shape)
    
    return data_predicted



In [15]:
data_pred = submit(model, csr_train, "submissions-Implicit-ALS.csv");

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0795440001 0795440003 0568597006 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0351484002 0759871002 0599580055 0673677002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0351484002 0723529001 0609719001 0458543001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0730683001 0564786001 0678687001 0708021001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0590928013 0712924003 0698286003 0692721005 07...


(1371980, 2)
CPU times: user 1h 4min 46s, sys: 1min 19s, total: 1h 6min 5s
Wall time: 8min 31s


In [16]:
new_df = df.drop(['t_dat', 'sales_channel_id', 'price', 'user_id', 'item_id'],  axis=1)
new_df

Unnamed: 0,customer_id,article_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541518023
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,0505221004
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,0685687003
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,0685687004
...,...,...
31788319,fff2282977442e327b45d8c89afde25617d00124d0f999...,0929511001
31788320,fff2282977442e327b45d8c89afde25617d00124d0f999...,0891322004
31788321,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,0918325001
31788322,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,0833459002


In [17]:
new_df = new_df.groupby('customer_id')['article_id'].apply(list).reset_index(name="article_id")
new_df

Unnamed: 0,customer_id,article_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0625548001, 0176209023, 0627759010, 069713800..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0583558001, 0639677008, 0640244003, 052126900..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0663713001, 0541518023, 0663713001, 057802000..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0742079001, 0732413001]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0634249005, 0677049001, 0698286003, 070770400..."
...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,"[0698276003, 0699075005, 0694182002, 072243600..."
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,"[0671695001, 0562245015, 0562245018, 056224500..."
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,"[0568597019, 0484398001, 0484398001, 070108300..."
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,"[0821395003, 0806241002, 0714790020, 086675500..."


In [21]:
# This data will be used in app.py (streamlit app) to know the customers list and the items that they have bought.
pickle.dump(new_df, open('customer_transactions_embeddings.pkl','wb'))


In [23]:
np.random.choice(new_df['customer_id'])


'1f8655f56e47c76038be8699291a7fe8a3aefb4e475ac7a30c6f5c45c0ef196a'

In [24]:
np.random.choice(new_df['customer_id'])


'012e38a268e9bbe4cb8f6e4caf02bbf60b46a2da79df7887f26759aa7adb57b7'