# BPR - FACTORIZATION MACHINES

In this notebook we train the BPR (Bayesian Personalized Ranking) algorithm using Factorization Machines as the underlying model.

In [1]:
import os

import numpy as np
import pandas as pd
from BPR_FM import BPR

In [2]:
# read the combined trips table, then discard the 'Unnamed: 0' column
# (presumably a stale index column from an earlier export — confirm)
trips_csv = '../../categorized_offers/trips_combined_final.csv'
df = pd.read_csv(trips_csv)
df = df.drop(columns=['Unnamed: 0'])

Now that the data has been loaded, we need to filter the mobility requests based on their number of offers. More specifically, we study two different scenarios: requests with 100 offers or fewer, and requests with 10 offers or fewer. The scenario is selected with the variable ``max_offers`` below.

In [3]:
# upper bound (inclusive) on the number of offers a request may have
max_offers = 10

# non-null `offer_id` rows per `request_id`, sorted by that count (ascending)
counts = (
    df.groupby('request_id')['offer_id']
    .count()
    .sort_values()
    .reset_index()
)

# requests within the limit, and the rows of `df` that belong to them
user_ids_to_save = counts.loc[counts['offer_id'] <= max_offers]
cleaned_df = df.merge(user_ids_to_save[['request_id']], on='request_id')

We intend to analyze how the performance of the ranking algorithm depends on the number of trips registered by each user. To that end, we perform three different experiments:
- Training with all the users
- Training with users who have registered at least 20 trips
- Training with users who have registered at least 40 trips

### Users with at least 40 trips

In [4]:
# minimum number of registered trips a user needs in order to be kept
trips_users = 40

# rows with Response == 1 (presumably the offers actually taken — confirm)
trips_df = cleaned_df.loc[cleaned_df['Response'] == 1]

# non-null `request_id` rows per user, sorted by that count (ascending)
counts = (
    trips_df.groupby('user_id')['request_id']
    .count()
    .sort_values()
    .reset_index()
)

# inclusive threshold: users with exactly `trips_users` rows are kept as well
trips_ids_to_save = counts.loc[counts['request_id'] >= trips_users]

In [5]:
# report the size of the selected cohort; the filter that built `trips_ids_to_save`
# is inclusive (>=), so the message says "at least" rather than "more than"
print('Number of users who registered at least {} trips: {}'.format(trips_users, len(trips_ids_to_save)))

Number of users who registered more than 40: 46


In [6]:
# training set: every row of `cleaned_df` whose user is in the selected cohort
selected_users = trips_ids_to_save[['user_id']]
df_to_train = cleaned_df.merge(selected_users, how='inner', on='user_id')

Now that the dataset is ready, we can train the algorithm. In this case, we train for 150k iterations (``single_iterations=150000`` below). Several runs have been performed, and after around this number of iterations the performance starts to decrease. The learning rate and the regularizer have also been chosen based on previous runs.

In [9]:
# hyperparameters handed to BPR.fit, gathered in one place
fit_params = {
    'single_iterations': 150000,
    'learning_rate': 0.0005,
    'lmbda': 0.01,
}

# build the recommender on the selected cohort and train it
reco = BPR(df_to_train)
reco.fit(**fit_params)

1/10
Computing metrics...
Metrics for train completed
Metrics for test completed
2/10
Computing metrics...
Metrics for train completed
Metrics for test completed
3/10
Computing metrics...
Metrics for train completed
Metrics for test completed
4/10
Computing metrics...
Metrics for train completed
Metrics for test completed
5/10
Computing metrics...
Metrics for train completed
Metrics for test completed
6/10
Computing metrics...
Metrics for train completed
Metrics for test completed
7/10
Computing metrics...
Metrics for train completed
Metrics for test completed
8/10
Computing metrics...
Metrics for train completed
Metrics for test completed
9/10
Computing metrics...
Metrics for train completed
Metrics for test completed
10/10
Computing metrics...
Metrics for train completed
Metrics for test completed


Finally, we save the metrics for further examination.

In [12]:
# recall at k test
# For every metrics checkpoint, average the values of reco.recall_at_k_test[i][k]
# (a dict per cutoff k) and pair the average with the checkpoint's iteration number.
ks = (1, 5, 10)

# iteration number of each checkpoint, as a column vector
iterations_col = np.array(reco.metrics_iterations).reshape(-1, 1)

recall_test_avg = {}
for k in ks:
    per_checkpoint = [sum(snapshot[k].values()) / len(snapshot[k])
                      for snapshot in reco.recall_at_k_test]
    # two columns: iteration number, mean recall@k
    recall_test_avg[k] = np.concatenate(
        [iterations_col, np.array(per_checkpoint).reshape(-1, 1)], axis=1)

# keep the per-cutoff variable names available to later cells
recall_at_1_test_avg = recall_test_avg[1]
recall_at_5_test_avg = recall_test_avg[5]
recall_at_10_test_avg = recall_test_avg[10]

# save the averages
# The top-level folder follows `max_offers`, so re-running the notebook with another
# offer limit does not overwrite these files; the folder is created if it is missing.
out_dir = os.path.join('results_lt_{}'.format(max_offers), 'results_users_gt_40', 'recall_testing')
os.makedirs(out_dir, exist_ok=True)
for k in ks:
    np.savetxt(os.path.join(out_dir, 'recall_at_{}_test_u_gt_40.txt'.format(k)), recall_test_avg[k])

In [13]:
# MAP test
# For every metrics checkpoint, average the values of the dict reco.MAP_test[i].
MAP_test_avg = [sum(snapshot.values()) / len(snapshot) for snapshot in reco.MAP_test]

# two columns: iteration number of the checkpoint, mean MAP
MAP_test_avg = np.concatenate([np.array(reco.metrics_iterations).reshape(-1, 1),
                               np.array(MAP_test_avg).reshape(-1, 1)], axis=1)

# save the averages
# The top-level folder follows `max_offers`, so re-running the notebook with another
# offer limit does not overwrite this file; the folder is created if it is missing.
out_dir = os.path.join('results_lt_{}'.format(max_offers), 'results_users_gt_40', 'MAP_testing')
os.makedirs(out_dir, exist_ok=True)
np.savetxt(os.path.join(out_dir, 'MAP_test_u_gt_40.txt'), MAP_test_avg)

### Users with at least 20 trips

In [15]:
# minimum number of registered trips a user needs in order to be kept
trips_users = 20

# rows with Response == 1 (presumably the offers actually taken — confirm)
trips_df = cleaned_df.loc[cleaned_df['Response'] == 1]

# non-null `request_id` rows per user, sorted by that count (ascending)
counts = (
    trips_df.groupby('user_id')['request_id']
    .count()
    .sort_values()
    .reset_index()
)

# inclusive threshold: users with exactly `trips_users` rows are kept as well
trips_ids_to_save = counts.loc[counts['request_id'] >= trips_users]

In [16]:
# report the size of the selected cohort; the filter that built `trips_ids_to_save`
# is inclusive (>=), so the message says "at least" rather than "more than"
print('Number of users who registered at least {} trips: {}'.format(trips_users, len(trips_ids_to_save)))

Number of users who registered more than 20: 188


In [17]:
# training set: every row of `cleaned_df` whose user is in the selected cohort
selected_users = trips_ids_to_save[['user_id']]
df_to_train = cleaned_df.merge(selected_users, how='inner', on='user_id')

In [18]:
# hyperparameters handed to BPR.fit, gathered in one place
fit_params = {
    'single_iterations': 150000,
    'learning_rate': 0.0005,
    'lmbda': 0.01,
}

# build the recommender on the selected cohort and train it
reco = BPR(df_to_train)
reco.fit(**fit_params)

1/10
Computing metrics...
Metrics for train completed
Metrics for test completed
2/10
Computing metrics...
Metrics for train completed
Metrics for test completed
3/10
Computing metrics...
Metrics for train completed
Metrics for test completed
4/10
Computing metrics...
Metrics for train completed
Metrics for test completed
5/10
Computing metrics...
Metrics for train completed
Metrics for test completed
6/10
Computing metrics...
Metrics for train completed
Metrics for test completed
7/10
Computing metrics...
Metrics for train completed
Metrics for test completed
8/10
Computing metrics...
Metrics for train completed
Metrics for test completed
9/10
Computing metrics...
Metrics for train completed
Metrics for test completed
10/10
Computing metrics...
Metrics for train completed
Metrics for test completed


In [20]:
# recall at k test
# For every metrics checkpoint, average the values of reco.recall_at_k_test[i][k]
# (a dict per cutoff k) and pair the average with the checkpoint's iteration number.
ks = (1, 5, 10)

# iteration number of each checkpoint, as a column vector
iterations_col = np.array(reco.metrics_iterations).reshape(-1, 1)

recall_test_avg = {}
for k in ks:
    per_checkpoint = [sum(snapshot[k].values()) / len(snapshot[k])
                      for snapshot in reco.recall_at_k_test]
    # two columns: iteration number, mean recall@k
    recall_test_avg[k] = np.concatenate(
        [iterations_col, np.array(per_checkpoint).reshape(-1, 1)], axis=1)

# keep the per-cutoff variable names available to later cells
recall_at_1_test_avg = recall_test_avg[1]
recall_at_5_test_avg = recall_test_avg[5]
recall_at_10_test_avg = recall_test_avg[10]

# save the averages
# The top-level folder follows `max_offers`, so re-running the notebook with another
# offer limit does not overwrite these files; the folder is created if it is missing.
out_dir = os.path.join('results_lt_{}'.format(max_offers), 'results_users_gt_20', 'recall_testing')
os.makedirs(out_dir, exist_ok=True)
for k in ks:
    np.savetxt(os.path.join(out_dir, 'recall_at_{}_test_u_gt_20.txt'.format(k)), recall_test_avg[k])

In [21]:
# MAP test
# For every metrics checkpoint, average the values of the dict reco.MAP_test[i].
MAP_test_avg = [sum(snapshot.values()) / len(snapshot) for snapshot in reco.MAP_test]

# two columns: iteration number of the checkpoint, mean MAP
MAP_test_avg = np.concatenate([np.array(reco.metrics_iterations).reshape(-1, 1),
                               np.array(MAP_test_avg).reshape(-1, 1)], axis=1)

# save the averages
# The top-level folder follows `max_offers`, so re-running the notebook with another
# offer limit does not overwrite this file; the folder is created if it is missing.
out_dir = os.path.join('results_lt_{}'.format(max_offers), 'results_users_gt_20', 'MAP_testing')
os.makedirs(out_dir, exist_ok=True)
np.savetxt(os.path.join(out_dir, 'MAP_test_u_gt_20.txt'), MAP_test_avg)

### All users

In [22]:
# minimum number of registered trips a user needs in order to be kept
# (0 keeps every user that appears in `trips_df`)
trips_users = 0

# rows with Response == 1 (presumably the offers actually taken — confirm)
trips_df = cleaned_df.loc[cleaned_df['Response'] == 1]

# non-null `request_id` rows per user, sorted by that count (ascending)
counts = (
    trips_df.groupby('user_id')['request_id']
    .count()
    .sort_values()
    .reset_index()
)

# inclusive threshold: users with exactly `trips_users` rows are kept as well
trips_ids_to_save = counts.loc[counts['request_id'] >= trips_users]

In [23]:
# report the size of the selected cohort; the filter that built `trips_ids_to_save`
# is inclusive (>=), so the message says "at least" rather than "more than"
print('Number of users who registered at least {} trips: {}'.format(trips_users, len(trips_ids_to_save)))

Number of users who registered more than 0: 1517


In [24]:
# training set: every row of `cleaned_df` whose user is in the selected cohort
selected_users = trips_ids_to_save[['user_id']]
df_to_train = cleaned_df.merge(selected_users, how='inner', on='user_id')

In [25]:
# hyperparameters handed to BPR.fit, gathered in one place
fit_params = {
    'single_iterations': 150000,
    'learning_rate': 0.0005,
    'lmbda': 0.01,
}

# build the recommender on the selected cohort and train it
reco = BPR(df_to_train)
reco.fit(**fit_params)

1/10
Computing metrics...
Metrics for train completed
Metrics for test completed
2/10
Computing metrics...
Metrics for train completed
Metrics for test completed
3/10
Computing metrics...
Metrics for train completed
Metrics for test completed
4/10
Computing metrics...
Metrics for train completed
Metrics for test completed
5/10
Computing metrics...
Metrics for train completed
Metrics for test completed
6/10
Computing metrics...
Metrics for train completed
Metrics for test completed
7/10
Computing metrics...
Metrics for train completed
Metrics for test completed
8/10
Computing metrics...
Metrics for train completed
Metrics for test completed
9/10
Computing metrics...
Metrics for train completed
Metrics for test completed
10/10
Computing metrics...
Metrics for train completed
Metrics for test completed


In [26]:
# recall at k test
# For every metrics checkpoint, average the values of reco.recall_at_k_test[i][k]
# (a dict per cutoff k) and pair the average with the checkpoint's iteration number.
ks = (1, 5, 10)

# iteration number of each checkpoint, as a column vector
iterations_col = np.array(reco.metrics_iterations).reshape(-1, 1)

recall_test_avg = {}
for k in ks:
    per_checkpoint = [sum(snapshot[k].values()) / len(snapshot[k])
                      for snapshot in reco.recall_at_k_test]
    # two columns: iteration number, mean recall@k
    recall_test_avg[k] = np.concatenate(
        [iterations_col, np.array(per_checkpoint).reshape(-1, 1)], axis=1)

# keep the per-cutoff variable names available to later cells
recall_at_1_test_avg = recall_test_avg[1]
recall_at_5_test_avg = recall_test_avg[5]
recall_at_10_test_avg = recall_test_avg[10]

# save the averages
# The top-level folder follows `max_offers`, so re-running the notebook with another
# offer limit does not overwrite these files; the folder is created if it is missing.
out_dir = os.path.join('results_lt_{}'.format(max_offers), 'results_users', 'recall_testing')
os.makedirs(out_dir, exist_ok=True)
for k in ks:
    np.savetxt(os.path.join(out_dir, 'recall_at_{}_test_u.txt'.format(k)), recall_test_avg[k])

In [27]:
# MAP test
# For every metrics checkpoint, average the values of the dict reco.MAP_test[i].
MAP_test_avg = [sum(snapshot.values()) / len(snapshot) for snapshot in reco.MAP_test]

# two columns: iteration number of the checkpoint, mean MAP
MAP_test_avg = np.concatenate([np.array(reco.metrics_iterations).reshape(-1, 1),
                               np.array(MAP_test_avg).reshape(-1, 1)], axis=1)

# save the averages
# The top-level folder follows `max_offers`, so re-running the notebook with another
# offer limit does not overwrite this file; the folder is created if it is missing.
out_dir = os.path.join('results_lt_{}'.format(max_offers), 'results_users', 'MAP_testing')
os.makedirs(out_dir, exist_ok=True)
np.savetxt(os.path.join(out_dir, 'MAP_test_u.txt'), MAP_test_avg)

To generate the results for the other case, set the variable ``max_offers`` to 100 and re-run the notebook.