Info
https://notebooks.gesis.org/binder/jupyter/user/ipython-ipython-in-depth-p1yfe52k/notebooks/binder/Index.ipynb

Project
https://towardsdatascience.com/how-to-build-a-movie-recommendation-system-67e321339109

Other resource
https://github.com/nishantml/NETFLIX-MOVIE-RECOMMENDATION-SYSTEM/blob/master/Netflix_Movie.ipynb

In [1]:
import pandas as pd
data = pd.read_csv("ml-25m/ratings.csv")
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [2]:
data = data.drop('timestamp', axis=1)
data.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


# Splitting the data

In [3]:
data.shape

(25000095, 3)

In [4]:
train_data = data.iloc[:int(data.shape[0]*0.80)]
test_data = data.iloc[int(data.shape[0]*0.80):]

In [5]:
train_data.shape

(20000076, 3)

In [6]:
test_data.shape

(5000019, 3)

# Matrix Factorization

In [7]:
from surprise import SVD
import numpy as np
import surprise
from surprise import Reader, Dataset
from scipy import sparse

In [8]:
# It is to specify how to read the dataframe.
# for our dataframe, we don't have to specify anything extra..
reader = Reader(rating_scale=(1,5))

# create the traindata from the dataframe...
train_data_mf = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

# build the trainset from traindata.., It is of dataset format from surprise library..
trainset = train_data_mf.build_full_trainset() 

In [9]:
# create the testata from the dataframe...
test_data_mf = Dataset.load_from_df(test_data[['userId', 'movieId', 'rating']], reader)

# build the testset from traindata.., It is of dataset format from surprise library..
testset = test_data_mf.build_full_trainset() 

In [10]:
svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b016d9aa60>

In [11]:
# Guradando las predicciones de entrenamiento y obteniendolas del trainset
train_preds = svd.test(trainset.build_testset())

train_pred_mf = np.array([pred.est for pred in train_preds])

In [12]:
train_pred_mf

array([4.49664335, 4.02944301, 3.84726905, ..., 3.80010959, 1.64490751,
       3.02036228])

In [13]:
# Guardando las predicciones de prueba y obteniendolas del testset
test_preds = svd.test(testset.build_testset())

test_pred_mf = np.array([pred.est for pred in test_preds])

In [14]:
test_pred_mf

array([3.59513273, 2.87269608, 2.81276874, ..., 2.37310274, 4.11912542,
       3.72471638])

# Preparando el data frame de entrenamiento

In [15]:
# Creando una matriz dispersa (sparse matrix)

train_sparse_matrix = sparse.csr_matrix((train_data.rating.values, (train_data.userId.values, train_data.movieId.values)))

In [16]:
#Promedio global de las peliculas por los usuarios
train_averages = dict()

#Obtener el promedio glob al the raitings in nuestro set de entrenamiento
train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average
train_averages

{'global': 3.5336391971710506}

In [17]:
# Obtener el promedio de los usuarios en un diccionario (key: user_id / movie_id, value: abg raiting)

def get_average_raitings(sparse_matrix, of_users):
    
    # Promedio de ratings de usuarios/axes
    ax = 1 if of_users else 0 # 1 - User axes, 0 - Movie axes
    
    # ".A1" es para convertir Column_Matrix a 1-D numpy array
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    # Matriz booleana de matrices (Si un usuario calificó esa película o no)
    is_rated = sparse_matrix != 0
    # No de raitings de cada usuario o película
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    # max_user y max_movie ids en una matriz dispersa
    u,m = sparse_matrix.shape
    # Crear un diccionario de usuarios y sus ratings promedio
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i] for i in range(u if of_users else m) if no_of_ratings[i] != 0}
    
    # Devovler el diccionario the ratings promedio
    return average_ratings

In [18]:
# Raiting promedio obtenido por un usario
train_averages['user'] = get_average_raitings(train_sparse_matrix, of_users=True)
print('\nRating promedio del usuario 25: ', train_averages['user'][25])


Rating promedio del usuario 25:  3.528735632183908


In [19]:
# Raiting promedio para una película
train_averages['movie'] = get_average_raitings(train_sparse_matrix, of_users=False)
print('\nRating promedio de la película 40: ', train_averages['movie'][40])


Rating promedio de la película 40:  3.6320907617504052


In [20]:
# Obtener usuarios, películas and ratings de nuestra matriz dispersa de meustras de entrenamiento
train_users, train_movies, train_ratings = sparse.find(train_sparse_matrix)

In [35]:
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity

final_data = pd.DataFrame()
count = 0
for (user, movie, rating)  in zip(train_users, train_movies, train_ratings):
            start = datetime.now()
            
            #print(user, movie)    
            
            # Calificacion de "movie" por similaridad de usuario de "user"
            # Calcular la similaridad de usariores de "user"        
            user_sim = cosine_similarity(train_sparse_matrix[user], train_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # Obtener los ratings de los usuarios más similares para esta película
            top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
            # Vamos a hacer una longitud de "5" agregando el promedio de las peliculas
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_users_ratings.extend([train_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
            # print(top_sim_users_ratings, end=" ")    


            # Ratings de "user" para peliculas similares de "movie"
            # Calcular las peliculas similares de "movie"        
            movie_sim = cosine_similarity(train_sparse_matrix[:,movie].T, train_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # Obtener los ratings de las películas más similares calificados por el usuario
            top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
            # Vamos a hacer una longitud de "5" agregando el promedio de los usuarios
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([train_averages['user'][user]]*(5-len(top_sim_movies_ratings))) 
            # print(top_sim_movies_ratings, end=" : -- ")

            # Preparar la fila para guardarlo en un archivo
            row = list()
            row.append(user)
            row.append(movie)
            # Now add the other features to this data...
            row.append(train_averages['global']) # first feature
            # next 5 features are similar_users "movie" ratings
            row.extend(top_sim_users_ratings)
            # next 5 features are "user" ratings for similar_movies
            row.extend(top_sim_movies_ratings)
            # Avg_user rating
            row.append(train_averages['user'][user])
            # Avg_movie rating
            row.append(train_averages['movie'][movie])

            # finalley, The actual Rating of this user-movie pair...
            row.append(rating)
            count = count + 1
            final_data = pd.concat([final_data, pd.DataFrame([row])], ignore_index=True)
        
            if (count)%10000 == 0:
                # print(','.join(map(str, row)))
                print("Done for {} rows----- {}".format(count, datetime.now() - start))
                
print(datetime.now() - start)

Done for 10000 rows----- 0:00:01.087627
Done for 20000 rows----- 0:00:01.090104
Done for 30000 rows----- 0:00:01.089482
Done for 40000 rows----- 0:00:01.080652
Done for 50000 rows----- 0:00:01.091237
Done for 60000 rows----- 0:00:01.114781
Done for 70000 rows----- 0:00:01.078519
Done for 80000 rows----- 0:00:01.083465
Done for 90000 rows----- 0:00:01.100253
Done for 100000 rows----- 0:00:01.089602
Done for 110000 rows----- 0:00:01.098609
Done for 120000 rows----- 0:00:01.106828
Done for 130000 rows----- 0:00:01.079515
Done for 140000 rows----- 0:00:01.107912
Done for 150000 rows----- 0:00:01.076762
Done for 160000 rows----- 0:00:01.069529
Done for 170000 rows----- 0:00:01.086905
Done for 180000 rows----- 0:00:01.068193
Done for 190000 rows----- 0:00:01.085950
Done for 200000 rows----- 0:00:01.062238
Done for 210000 rows----- 0:00:01.084103
Done for 220000 rows----- 0:00:01.068498
Done for 230000 rows----- 0:00:01.116157
Done for 240000 rows----- 0:00:01.092231


KeyboardInterrupt: 

# Tiempo estimado mas del esperado

Una alternativa es reducir el tamaño de los datasets que se estan tomando para entrenar el modelo por lo que procederé a evaluar los datasets y tomar decisiones para efectos prácticos de aprendizaje

### Tiempo de entrenamiento demasiado largo por las dimensiones de los datasets, se procederá a reducir el train_data y el test_data a alrededor de un 0.3% del tamaño original dejando así una longitud cercana a 80000 en train_data y 15000 en test_data

In [39]:
final_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,2,1,3.533639,4.5,4.0,5.0,5.0,4.5,5.0,4.5,4.0,4.5,2.0,3.630435,3.895505,3.5
1,3,1,3.533639,4.0,4.0,3.5,4.0,2.5,4.0,3.5,4.0,2.0,4.0,3.697409,3.895505,4.0
2,4,1,3.533639,5.0,5.0,4.5,4.5,4.0,3.5,3.0,2.5,3.0,2.0,3.378099,3.895505,3.0
3,5,1,3.533639,4.0,4.0,5.0,4.0,5.0,5.0,3.0,4.0,5.0,4.0,3.752475,3.895505,4.0
4,8,1,3.533639,4.0,2.0,4.0,3.0,4.0,3.0,4.0,5.0,3.0,3.0,3.612903,3.895505,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242401,48651,22,3.533639,3.0,4.0,5.0,3.0,3.0,4.0,5.0,4.0,5.0,3.0,3.800000,3.318989,4.0
242402,48657,22,3.533639,4.0,4.0,4.0,3.0,5.0,4.0,3.0,3.0,3.0,1.0,3.729730,3.318989,3.0
242403,48670,22,3.533639,5.0,4.0,5.0,3.0,3.0,4.0,5.0,5.0,5.0,5.0,4.234234,3.318989,5.0
242404,48671,22,3.533639,3.0,3.0,4.0,3.0,4.0,5.0,4.0,4.0,4.0,5.0,3.820225,3.318989,5.0


In [None]:
train_user.

In [None]:
#Guardar al terminar
final_data.to_csv('final_data.csv', index=False)

In [None]:
len(train_users)

In [None]:
len(train_movies)

In [None]:
len(train_ratings)

In [None]:
len(train_users)*0.001

In [None]:
final_data.columns = ['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5', 'smr1', 'smr2', 'smr3', 'smr4',
                      'smr5', 'UAvg', 'MAvg', 'rating']

In [None]:
final_data.head()

In [None]:
final_data['mf_svd'] = train_pred_mf
final_data.head()

# Preparando la Información de Prueba

In [None]:
# Creando la matriz dispersa
test_sparse_matrix = sparse.csr_matrix((test_data.rating.values, (test_data.userId.values, test_data.movieId.values)))

In [None]:
# Promedio global de todas las peliculas por todos los usuarios

test_Averages = dict()

# Obtener el promedio global de los ratings en nuestro set de entrenamiento
test_global_average = test_sparse_matrix.sum()/test_sparse_matrix.count_nonzero()
test_averages['global'] = test_global_average
test_averages

## Esta repetido!!
#### Obtener el promedio de los usuarios en diccionario (key: user_id/movie_id, value: avg rating)

def get_average_ratings(sparse_matrix, of_users):
    
    # Promedio de ratings de usuarios/axes
    ax = 1 if of_users else 0 # 1 - User axes, 0 - Movie axes
    
    # ".A1" es para convertir Column_Matrix a 1-D numpy array
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    # Matriz booleana de matrices (Si un usuario calificó esa película o no)
    is_rated = sparse_matrix != 0
    # No de raitings de cada usuario o película
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    # max_user y max_movie ids en una matriz dispersa
    u,m = sparse_matrix.shape
    # Crear un diccionario de usuarios y sus ratings promedio
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i] for i in range(u if of_users else m) if no_of_ratings[i] != 0}
    
    # Devovler el diccionario the ratings promedio
    return average_ratings

In [None]:
# Ratings promedio por usuario
test_averages['user'] = get_average_raitings(test_sparse_matrix, od_users=True)
print('\n Rating promedio del usuario 27:', test_averages['user'][27])

In [None]:
# Rating promedio por pelicula
test_averages['movie'] = get_average_raitings(test_sparse_matrix, of_users=False)
print('\n Rating promedio de la pelicula 52:', test_averages['user'][52])

In [None]:
# Obtener usuarios, peliculas y ratings de la meustra de la matriz de entramiento dispersa
test_users, test_movies, test_ratings = sparse.find(test_sparse_matrix)

In [None]:
final_test_data = pd.DataFrame()
count = 0

for (user, movie, rating) in zip(test_users, test_movies, test_ratings):
    start = datetime.now()
    #print(user,movie)
    
    # Ratings de peliculas de "movie" por similaridad de usuarios de "user"
    # Calcular la similaridad de los usuarios de "user"
    user_sim = cosine_similarity(test_sparse_matrix[user], test_sparse_matrix). ravel()
    top_sim_users = user_sim.argsort()[::-1][1:]
    # Obtener los ratings de los usuarios mas similares para la pelicula
    top_ratings = test_sparse_matrix[top_sim_users, movie].toarray().ravel()
    # Vamos a hacer una longitud de "5" añadiendo el promedio de las peliculas
    top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_users_ratings.extend([test_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
    #print(top_sim_users_ratings, end=" ")
    
    # Ratings de usuarios "user" para peliculas similares de "movie"
    # Calcular las peliculas similares de "movie"
    movie_sim = cosine_similarity(test_sparse_matrix[:, movie].T, test_sparse_matrix.T).ravel()
    top_sim_movies = movie_sim.argsort()[::-1][1:]
    # Obtener el rating de las peliculas mas similares calificada por el usuario
    top_ratings = test_sparse_matrix[user, top_sim_movies].toarray().ravel()
    # Vamos a hacerlo una longitud de "5" añadiendo el promedio de los usuarios
    top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_movies_ratings.extend([test_averages['user'][user]]*(5 - len(top_sim_movies_ratings)))
    #print(top_sim_movies_ratings, end=" ")
    
    # Preparar la fila para guardarlo en un archivo
    row = list()
    row.append(user)
    row.append(movie)
    row.append(train_averages['global'])
    row.extend(top_sim_users_ratings)
    row.extend(top_sim_movies_ratings)
    row.append(train_averages['user'][user])
    row.append(train_averages['movie'][movie])

    row.append(rating)
    count = count + 1
    final_data = pd.concat([final_data, pd.DataFrame([row])], ignore_index=True)

    if (count)%10000 == 0:
        # print(','.join(map(str, row)))
        print("Done for {} rows----- {}".format(count, datetime.now() - start))
                
print(datetime.now() - start)

In [None]:
final_test_data.columns = ['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5', 'smr1', 'smr2', 'smr3', 'smr4',
                      'smr5', 'UAvg', 'MAvg', 'rating']