In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
from sklearn.model_selection import train_test_split
import random
import implicit
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import ndcg
import MAP
from functools import reduce
import matplotlib as plt
from ast import literal_eval
from multiprocessing import Pool
%matplotlib inline

In [2]:
# dataset nuevo 
path = './datasets_recsys/favorite_stories_books_sample.csv'
ff_data = pd.read_csv(path, sep=';', index_col=None, encoding='latin') 
ff_data = ff_data[['user_id','story_id','franchise']]
ff_data.head()

Unnamed: 0,user_id,story_id,franchise
0,102390,7295117,Harry Potter
1,102390,10629488,Harry Potter
2,102390,10089646,Harry Potter
3,102390,8945573,Harry Potter
4,102390,8037808,Harry Potter


In [3]:
cleaned_data = ff_data.loc[pd.isnull(ff_data.user_id) == False]
cleaned_data.head()

Unnamed: 0,user_id,story_id,franchise
0,102390,7295117,Harry Potter
1,102390,10629488,Harry Potter
2,102390,10089646,Harry Potter
3,102390,8945573,Harry Potter
4,102390,8037808,Harry Potter


In [4]:
item_lookup = cleaned_data[['story_id', 'franchise']].drop_duplicates() 
item_lookup['story_id'] = item_lookup.story_id.astype(str) 

In [5]:
cleaned_data['user_id'] = cleaned_data.user_id.astype(int) 
cleaned_data['Quantity'] = 1
cleaned_data = cleaned_data[['story_id', 'Quantity', 'user_id']] 
grouped_cleaned = cleaned_data.groupby(['user_id', 'story_id']).sum().reset_index() 
grouped_cleaned.Quantity.loc[grouped_cleaned.Quantity == 0] = 1 
grouped_stories = grouped_cleaned.query('Quantity > 0')

In [6]:
grouped_stories.head()

Unnamed: 0,user_id,story_id,Quantity
0,8139,282139,1
1,8139,288735,1
2,8139,1277839,1
3,8139,2700556,1
4,8139,2800923,1


### Creamos la sparse matrix

In [7]:
users = list(np.sort(grouped_stories.user_id.unique())) 
stories = list(grouped_stories.story_id.unique()) 
quantity = list(grouped_stories.Quantity) 
rows = grouped_stories.user_id.astype('category', categories = users).cat.codes 
cols = grouped_stories.story_id.astype('category', categories = stories).cat.codes 
favorite_stories_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(stories)))
favorite_stories_sparse

<15000x302043 sparse matrix of type '<class 'numpy.int64'>'
	with 937110 stored elements in Compressed Sparse Row format>

### Vemos cuan "sparsed" estan los datos:

In [8]:
matrix_size = favorite_stories_sparse.shape[0]*favorite_stories_sparse.shape[1] 
num_fav_stories = len(favorite_stories_sparse.nonzero()[0]) 
sparsity = 100*(1 - (num_fav_stories/matrix_size))
sparsity

99.97931619007889

In [9]:
'''recibe matriz sparsed y la divide 80/20'''

def make_train(sparse_matrix, pct_test = 0.2):
    total_users = 15000 # users
    training_set = sparse_matrix[0: (total_users - (total_users *pct_test))] 
    test_set = sparse_matrix[(total_users - (total_users *pct_test))::]
    return training_set, test_set  # retornamos una matriz sparse de train y de testing 

In [10]:
favorite_stories_sparse.nonzero()

(array([    0,     0,     0, ..., 14999, 14999, 14999], dtype=int32),
 array([     0,      1,      2, ..., 244472, 268254, 270641], dtype=int32))

In [11]:
# MATRIZ SPARSED (TRAIN Y TEST)
train_dataset, test_dataset = make_train(favorite_stories_sparse, pct_test = 0.2)

# SPLIT DATAFRAME 
train, test = train_test_split(grouped_stories, test_size=0.172)


### ocupando libreria implicit: 

In [12]:
#averiguar que significa el alpha... 
alpha = 15
#alpha - parametro discutido en el paper de Koren & Volinsky asociado a la matriz de confianza, 
#donde Cui = 1 + alpha*Rui. 
# Rui --> preferencia del usuario "u" por el item "i" , si Rui > 0 hay una preferencia (interaccion)
#En el paper obtienen que el default de alpha debe ser 40 (el mas efectivo) . 

user_vecs, item_vecs = implicit.alternating_least_squares((train_dataset*alpha).astype('double'), 
                                                          factors=300, 
                                                          regularization = 0.1, 
                                                         iterations = 1)

This method is deprecated. Please use the AlternatingLeastSquares class instead


In [18]:
user_vecs[0,:]

NameError: name 'user_vecs' is not defined

In [12]:
users_arr = np.array(users) 
stories_arr = np.array(stories) 

In [13]:
def get_favorite_stories(user_id):
    return cleaned_data.loc[cleaned_data['user_id'] == user_id][['story_id']]

def get_favorite_stories_test(user_id):
    return test.loc[test['user_id'] == user_id][['story_id']]

In [14]:
users_arr[:10]

array([ 8139,  8334,  8945, 10907, 16056, 17420, 19705, 21565, 22848, 23131])

### Favorite stories user: 8334

In [15]:
get_favorite_stories_test(8334)

Unnamed: 0,story_id
24,1297048


In [16]:
# ver bien como recomienda items, que hace esta funcion
def rec_items(user_id, mf_train, user_vecs, item_vecs, user_list, item_list, item_lookup, num_items = 10):    
    cust_ind = np.where(user_list == user_id)[0][0] # obtener el index de users id
    pref_vec = mf_train[cust_ind,:].toarray() 
    pref_vec = pref_vec.reshape(-1) + 1 # asignamos 1 a todo, asi las historias no favoriteadas tendrán valor 1. 
    pref_vec[pref_vec > 1] = 0 # reemplazamos todo lo que ya fue favorito en zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # producto punto users e items transpuesto
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # todos los items ya recomendados se multiplican por cero 
    recommend_vector = pref_vec*rec_vector_scaled 
    stories_idx = np.argsort(recommend_vector)[::-1][:num_items] # ordenamos los items y 
    # arriba quedan las mejores recomendaciones 
    rec_list = [] 
    for index in stories_idx:
        code = item_list[index]
        rec_list.append(code) 
    return rec_list # devolvemos una lista con los top 5 (num_items)

In [17]:
rec_items(8334, test_dataset, user_vecs, item_vecs, users_arr, stories_arr, item_lookup,
                       num_items = 5)

NameError: name 'user_vecs' is not defined

In [25]:
#user_ids = list(users_arr)

#data = []

#for x in user_ids:
    
 #   try:
    
  
  #      recommendation = rec_items(x,test_dataset, user_vecs, item_vecs, users_arr, stories_arr, item_lookup,
                        #   num_items = 5)
   #     data.append([x,recommendation])
    
   # except:
    #    continue

    

#recsys = pd.DataFrame(data, columns = ['user', 'recommendation'])

#recsys.to_csv('recommendation_results.csv', index = False)



In [18]:
# funcion para obtener la franquicia 
def get_franchise(item_id): 
    try:
        result = list(set(ff_data.loc[ff_data['story_id'] == item_id]['franchise']))
        return result
    except:
        pass

### Recomendacion implicit feedback: entrena con train, recomienda con test, ground truth son historias favoritas de testing. (como recomienda a users del set de testing tienen que ser 3.000 que es el 20% del set de training 15.000) 

In [48]:
import time 

# probar con nDCG@5, precision@5
user_ids = list(users_arr)
alpha = 60 

#alpha - parametro discutido en el paper de Koren & Volinsky sobre la matriz de confianza, 
#donde Cui = 1 + alpha*Rui. 
# Rui --> preferencia del usuario "u" por el item "i" , si Rui > 0 hay una preferencia (interaccion)
#En el paper obtienen que el default de alpha debe ser 40 (el mas efectivo) . 
# Al disminuir alpha disminuye la variabilidad en el nivel de confianza entre varios items.


data = []

# entrena con dataset de training 
user_vecs, item_vecs = implicit.alternating_least_squares((train_dataset*alpha).astype('double'), 
                                                              factors=300, 
                                                              regularization = 0.1, 
                                                             iterations = 2)
t0 = time.time()
for i in user_ids[0:1000]:
      
    # recomienda con el dataset de test recibe los vectores user/item del entrenamiento de modelo 
    recommendation = rec_items(i, test_dataset, user_vecs, item_vecs, users_arr, stories_arr, item_lookup,
                           num_items = 5)
    
    # obtiene historias favoritas del set de testing
    fav_stories = list(get_favorite_stories_test(i)['story_id'])

    # si la historia recomendada esta dentro de sus favoritas del set de test, relevantes append 1  
    relevantes_implicit = []  
    
    for x in recommendation:
        
        if x in fav_stories:
            relevantes_implicit.append(1)
            
        else:
            relevantes_implicit.append(0)
    
    
    #print(recommendation)
    #print(fav_stories)
    
    relevantes_implicit = relevantes_implicit[0:5]

    # calculamos nDCG@10 y precision@10
    ndcg5 = ndcg.ndcg(relevantes_implicit,5)
    precision5 = MAP.precision_at_k(relevantes_implicit,5)
    lista_MAP = [MAP.precision_at_k(relevantes_implicit,x) for x in range(1, len(relevantes_implicit))]
    mean_average_precision = reduce(lambda x, y: x + y, lista_MAP) / len(lista_MAP)

    data.append(['{}'.format(i), ndcg5, precision5, mean_average_precision])
    df_iter = pd.DataFrame(data, columns = ('user', 'ndcg@5', 'precision@5', 'MAP'))

t1 = time.time()
    
print('Recomendacion total: {}\n Recomendacion por usuario: {}'.format((t1-t0), (t1-t0)/1000))
    
df_iter.to_csv('implicit_ndcg_story_results_books.csv', index=False)
     

This method is deprecated. Please use the AlternatingLeastSquares class instead


Recomendacion total: 75.96728491783142
 Recomendacion por usuario: 0.07596728491783142


In [49]:
results = pd.read_csv('implicit_ndcg_story_results_books.csv', sep=',')
results.describe()

Unnamed: 0,user,ndcg@5,precision@5,MAP
count,1000.0,1000.0,1000.0,1000.0
mean,340388.028,0.298838,0.1182,0.131354
std,139722.018092,0.361842,0.149436,0.209502
min,8139.0,0.0,0.0,0.0
25%,248368.5,0.0,0.0,0.0
50%,358484.5,0.0,0.0,0.0
75%,450241.75,0.582553,0.2,0.208333
max,551704.0,1.0,0.8,0.9375


### Recomendacion aleatoria: 

In [19]:
franchises = list(set(ff_data['franchise']))

In [22]:
import time 

# probar con nDCG@5, precision@5

# muestra aleatoria de 1000 user ids (probar con todos). 
user_ids = list(users_arr)

data = []


t0 = time.time()
for i in user_ids[0:3000]: 
    
    # ingresar id usuario al que le vamos a recomendar 
    user_id = i

    recommendation_random = random.sample(list(test['story_id'].unique()),5)     
    
    fav_stories = list(get_favorite_stories_test(user_id)['story_id'])
    
    
    # si la franquicia recomendada --> favoritas testingle agregamos 
    
    relevantes_implicit = []  
    
    for x in recommendation_random:
        if x in fav_stories:
            relevantes_implicit.append(1)
        else:
            relevantes_implicit.append(0)
      
    # calculamos nDCG@5 y precision@5
    ndcg5 = ndcg.ndcg(relevantes_implicit,5)
    precision5 = MAP.precision_at_k(relevantes_implicit,5)
    lista_MAP = [MAP.precision_at_k(relevantes_implicit,x) for x in range(1, len(relevantes_implicit))]
    mean_average_precision = reduce(lambda x, y: x + y, lista_MAP) / len(lista_MAP)
    
    data.append(['{}'.format(i), ndcg5, precision5, mean_average_precision])
    df_random = pd.DataFrame(data, columns = ('user', 'ndcg@5', 'precision@5', 'MAP'))

t1 = time.time()
    
print('Recomendacion total: {}\n Recomendacion por usuario: {}'.format((t1-t0), (t1-t0)/1000))
#df_random

df_random.to_csv('ndcg_results__stories_random_books.csv',index=False)

Recomendacion total: 29.41788911819458
 Recomendacion por usuario: 0.029417889118194582


In [23]:
results_random = pd.read_csv('ndcg_results__stories_random_books.csv', sep=',')
results_random.describe()

Unnamed: 0,user,ndcg@5,precision@5,MAP
count,3000.0,3000.0,3000.0,3000.0
mean,727941.4,0.000144,6.7e-05,2.1e-05
std,334620.6,0.007863,0.003651,0.001141
min,8139.0,0.0,0.0,0.0
25%,450309.2,0.0,0.0,0.0
50%,737589.5,0.0,0.0,0.0
75%,1009976.0,0.0,0.0,0.0
max,1297258.0,0.430677,0.2,0.0625


### Recomendacion most popular:

In [24]:
most_popular = ff_data['story_id'].value_counts().index.tolist()[0:5]
most_popular

[2636963, 5100876, 5782108, 6291747, 5319052]

In [25]:
# probar con nDCG@5, precision@5

# muestra aleatoria de 1000 user ids. 
user_ids = list(users_arr)

data = []


t0 = time.time()

for i in user_ids[0:3000]: 
    
    # ingresar id usuario al que le vamos a recomendar 
    user_id = i
    
    fav_stories = list(get_favorite_stories_test(user_id)['story_id'])
    
    relevantes_implicit = []  
       
    for x in most_popular:
        if x in fav_stories:
            relevantes_implicit.append(1)
        
        else:
            relevantes_implicit.append(0)
    
    relevantes_implicit = relevantes_implicit[0:5]
      
    # calculamos nDCG@5 y precision@10
    ndcg5 = ndcg.ndcg(relevantes_implicit,5)
    precision5 = MAP.precision_at_k(relevantes_implicit,5)
    lista_MAP = [MAP.precision_at_k(relevantes_implicit,x) for x in range(1, len(relevantes_implicit))]
    mean_average_precision = reduce(lambda x, y: x + y, lista_MAP) / len(lista_MAP)
    
    data.append(['{}'.format(i), ndcg5, precision5, mean_average_precision])
    df_most_popular = pd.DataFrame(data, columns = ('user', 'ndcg@5', 'precision@5', 'MAP'))

#df_most_popular
t1 = time.time()
    
print('Recomendacion total: {}\n Recomendacion por usuario: {}'.format((t1-t0), (t1-t0)/1000))

df_most_popular.to_csv('ndcg_stories_results_popular_books.csv',index=False)


Recomendacion total: 6.75273585319519
 Recomendacion por usuario: 0.00675273585319519


In [26]:
results_popular = pd.read_csv('ndcg__stories_results_popular_books.csv', sep=',')
results_popular.describe()

FileNotFoundError: File b'ndcg__stories_results_popular_books.csv' does not exist

In [41]:
results_rec_FM = pd.read_csv('FM_results_story_features_PICKLE_10_iteraciones.csv', sep=';')
results_rec_FM.head()

Unnamed: 0,user_id,rec_stories
0,102390,"[9060508, 6560750, 11298223, 3157478, 10562882]"
1,4767346,"[3157478, 6560750, 2493456, 4536005, 2477819]"
2,5695337,"[9060508, 11298223, 6704012, 9681062, 10096096]"
3,5778823,"[4946293, 7965930, 5716145, 6550419, 5882420]"
4,6861153,"[9060508, 11298223, 6704012, 9681062, 10096096]"


In [42]:
# convertimos cada elemento de la df a lista 
results_rec_FM['rec_stories'] = results_rec_FM['rec_stories'].apply(lambda x: literal_eval(x))

In [43]:
results_rec_FM = results_rec_FM.drop_duplicates(subset='user_id', keep="last")

In [44]:
list(results_rec_FM.loc[results_rec_FM['user_id'] == 102390]['rec_stories'])[0]

[3157478, 6560750, 2400483, 9060508, 8509020]

### calculamos los resultados de Factorization Machines con el mismo train y test de implicit feedback

In [45]:
# probar con nDCG@5, precision@5

# muestra aleatoria de 1000 user ids. 
user_ids = list(results_rec_FM['user_id'])

data = []

for i in user_ids: 
    
    # ingresar id usuario al que le vamos a recomendar 
    user_id = i
    
    fav_stories = list(get_favorite_stories_test(user_id)['story_id'])
    
    relevantes_FM = []  
    
    FM_recommendation = list(results_rec_FM.loc[results_rec_FM['user_id'] == i]['rec_stories'])[0]
       
    for x in FM_recommendation:
        if x in fav_stories:
            relevantes_FM.append(1)
        
        else:
            relevantes_FM.append(0)
    
    relevantes_FM = relevantes_FM[0:5]
      
    # calculamos nDCG@5 y precision@10
    ndcg5 = ndcg.ndcg(relevantes_FM,5)
    precision5 = MAP.precision_at_k(relevantes_FM,5)
    lista_MAP = [MAP.precision_at_k(relevantes_FM,x) for x in range(1, len(relevantes_FM))]
    mean_average_precision = reduce(lambda x, y: x + y, lista_MAP) / len(lista_MAP)
    
    data.append(['{}'.format(i), ndcg5, precision5, mean_average_precision])
    df_FM = pd.DataFrame(data, columns = ('user', 'ndcg@5', 'precision@5', 'MAP'))

#df_most_popular

df_FM.to_csv('ndcg_stories_results_FM.csv',index=False)



In [46]:
result_FM = pd.read_csv('ndcg_stories_results_FM.csv')

In [47]:
result_FM.describe()

Unnamed: 0,user,ndcg@5,precision@5,MAP
count,861.0,861.0,861.0,861.0
mean,3702570.0,0.017355,0.005807,0.00663
std,2223395.0,0.109806,0.034958,0.053535
min,10907.0,0.0,0.0,0.0
25%,1805672.0,0.0,0.0,0.0
50%,3445249.0,0.0,0.0,0.0
75%,5360133.0,0.0,0.0,0.0
max,9227721.0,1.0,0.4,0.791667
