Info
https://notebooks.gesis.org/binder/jupyter/user/ipython-ipython-in-depth-p1yfe52k/notebooks/binder/Index.ipynb

Project
https://towardsdatascience.com/how-to-build-a-movie-recommendation-system-67e321339109

Other resource
https://github.com/nishantml/NETFLIX-MOVIE-RECOMMENDATION-SYSTEM/blob/master/Netflix_Movie.ipynb

In [3]:
import pandas as pd
data = pd.read_csv("ml-latest-small/ratings.csv")
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
data = data.drop('timestamp', axis=1)
data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# Splitting the data

In [5]:
data.shape

(100836, 3)

In [6]:
train_data = data.iloc[:int(data.shape[0]*0.80)]
test_data = data.iloc[int(data.shape[0]*0.80):]

In [7]:
train_data.shape

(80668, 3)

In [8]:
test_data.shape

(20168, 3)

# Matrix Factorization

In [9]:
from surprise import SVD
import numpy as np
import surprise
from surprise import Reader, Dataset
from scipy import sparse

In [10]:
# Es para especificar como leer el dataframe
# Para nuestro dataframe no necesitamos especificar algo extra...
reader = Reader(rating_scale=(1,5))

# Crear información de entrenamiento del dataframe...
train_data_mf = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

# Construir el trainset de traindara.., It is of dataset format from surprise library..
trainset = train_data_mf.build_full_trainset() 

In [11]:
# Crear un testdata para el dataframe...
test_data_mf = Dataset.load_from_df(test_data[['userId', 'movieId', 'rating']], reader)

# Construir un testset del traindata.., It is of dataset format from surprise library..
testset = test_data_mf.build_full_trainset() 

In [12]:
svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2338cdf5f10>

In [13]:
# Guradando las predicciones de entrenamiento y obteniendolas del trainset
train_preds = svd.test(trainset.build_testset())

train_pred_mf = np.array([pred.est for pred in train_preds])

In [14]:
train_pred_mf

array([4.30619518, 3.94173484, 4.54355566, ..., 3.06422925, 3.25430315,
       2.69500677])

In [15]:
# Guardando las predicciones de prueba y obteniendolas del testset
test_preds = svd.test(testset.build_testset())

test_pred_mf = np.array([pred.est for pred in test_preds])

In [16]:
test_pred_mf

array([3.42586544, 3.22903645, 3.02600702, ..., 3.56157074, 3.92178227,
       3.32054949])

# Preparando el data frame de entrenamiento

In [17]:
# Creando una matriz dispersa (sparse matrix)

train_sparse_matrix = sparse.csr_matrix((train_data.rating.values, (train_data.userId.values, train_data.movieId.values)))

In [18]:
#Promedio global de las peliculas por los usuarios
train_averages = dict()

#Obtener el promedio glob al the raitings in nuestro set de entrenamiento
train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average
train_averages

{'global': 3.5199769425298757}

In [19]:
# Obtener el promedio de los usuarios en un diccionario (key: user_id / movie_id, value: abg raiting)

def get_average_raitings(sparse_matrix, of_users):
    
    # Promedio de ratings de usuarios/axes
    ax = 1 if of_users else 0 # 1 - User axes, 0 - Movie axes
    
    # ".A1" es para convertir Column_Matrix a 1-D numpy array
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    # Matriz booleana de matrices (Si un usuario calificó esa película o no)
    is_rated = sparse_matrix != 0
    # No de raitings de cada usuario o película
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    # max_user y max_movie ids en una matriz dispersa
    u,m = sparse_matrix.shape
    # Crear un diccionario de usuarios y sus ratings promedio
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i] for i in range(u if of_users else m) if no_of_ratings[i] != 0}
    
    # Devovler el diccionario the ratings promedio
    return average_ratings

In [20]:
# Raiting promedio obtenido por un usario
train_averages['user'] = get_average_raitings(train_sparse_matrix, of_users=True)
print('\nRating promedio del usuario 25: ', train_averages['user'][25])


Rating promedio del usuario 25:  4.8076923076923075


In [21]:
# Raiting promedio para una película
train_averages['movie'] = get_average_raitings(train_sparse_matrix, of_users=False)
print('\nRating promedio de la película 40: ', train_averages['movie'][40])


Rating promedio de la película 40:  3.5


In [22]:
# Obtener usuarios, películas and ratings de nuestra matriz dispersa de meustras de entrenamiento
train_users, train_movies, train_ratings = sparse.find(train_sparse_matrix)

In [23]:
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity

final_data = pd.DataFrame()
count = 0
start = datetime.now()

for (user, movie, rating)  in zip(train_users, train_movies, train_ratings):
            #print(user, movie)    
            
            # Calificacion de "movie" por similaridad de usuario de "user"
            # Calcular la similaridad de usariores de "user"        
            user_sim = cosine_similarity(train_sparse_matrix[user], train_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # Obtener los ratings de los usuarios más similares para esta película
            top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
            # Vamos a hacer una longitud de "5" agregando el promedio de las peliculas
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_users_ratings.extend([train_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
            # print(top_sim_users_ratings, end=" ")    


            # Ratings de "user" para peliculas similares de "movie"
            # Calcular las peliculas similares de "movie"        
            movie_sim = cosine_similarity(train_sparse_matrix[:,movie].T, train_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # Obtener los ratings de las películas más similares calificados por el usuario
            top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
            # Vamos a hacer una longitud de "5" agregando el promedio de los usuarios
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([train_averages['user'][user]]*(5-len(top_sim_movies_ratings))) 
            # print(top_sim_movies_ratings, end=" : -- ")

            # Preparar la fila para guardarlo en un archivo
            row = list()
            row.append(user)
            row.append(movie)
            # Now add the other features to this data...
            row.append(train_averages['global']) # first feature
            # next 5 features are similar_users "movie" ratings
            row.extend(top_sim_users_ratings)
            # next 5 features are "user" ratings for similar_movies
            row.extend(top_sim_movies_ratings)
            # Avg_user rating
            row.append(train_averages['user'][user])
            # Avg_movie rating
            row.append(train_averages['movie'][movie])

            # finalley, The actual Rating of this user-movie pair...
            row.append(rating)
            count = count + 1
            final_data = pd.concat([final_data, pd.DataFrame([row])], ignore_index=True)
        
            if (count)%10000 == 0:
                # print(','.join(map(str, row)))
                print("Done for {} rows----- {}".format(count, datetime.now() - start))
                
print("Tiempo total tomado: ",datetime.now() - start)

Done for 10000 rows----- 0:02:29.580437
Done for 20000 rows----- 0:04:57.404314
Done for 30000 rows----- 0:07:47.246384
Done for 40000 rows----- 0:10:40.921480
Done for 50000 rows----- 0:13:50.270658
Done for 60000 rows----- 0:16:57.331961
Done for 70000 rows----- 0:19:54.456105
Done for 80000 rows----- 0:22:48.785604
Tiempo total tomado:  0:22:59.357380


# Tiempo estimado mas del esperado

Una alternativa es reducir el tamaño de los datasets que se estan tomando para entrenar el modelo por lo que procederé a evaluar los datasets y tomar decisiones para efectos prácticos de aprendizaje

### Tiempo de entrenamiento demasiado largo por las dimensiones de los datasets, se procederá a reducir el train_data y el test_data a alrededor de un 0.3% del tamaño original dejando así una longitud cercana a 80000 en train_data y 15000 en test_data

In [25]:
final_data.columns = ['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5', 'smr1', 'smr2', 'smr3', 'smr4',
                      'smr5', 'UAvg', 'MAvg', 'rating']

In [26]:
final_data

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating
0,1,1,3.519977,2.0,5.0,4.0,4.0,4.5,3.0,4.0,3.0,5.0,5.0,4.366379,3.954545,4.0
1,5,1,3.519977,4.0,5.0,4.0,4.0,5.0,4.0,3.0,3.0,5.0,5.0,3.636364,3.954545,4.0
2,7,1,3.519977,4.0,4.0,5.0,4.5,4.0,4.5,4.5,5.0,4.0,3.0,3.230263,3.954545,4.5
3,15,1,3.519977,5.0,3.0,4.0,4.0,4.0,3.5,3.0,5.0,3.0,3.0,3.448148,3.954545,2.5
4,17,1,3.519977,4.0,5.0,4.0,4.0,4.5,4.0,4.5,5.0,5.0,5.0,4.209524,3.954545,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80663,184,193581,3.519977,4.0,4.0,4.0,4.0,4.0,5.0,3.5,4.0,3.5,4.5,3.705224,4.000000,4.0
80664,184,193583,3.519977,3.5,3.5,3.5,3.5,3.5,5.0,3.5,4.0,3.5,4.5,3.705224,3.500000,3.5
80665,184,193585,3.519977,3.5,3.5,3.5,3.5,3.5,5.0,3.5,4.0,3.5,4.5,3.705224,3.500000,3.5
80666,184,193587,3.519977,3.5,3.5,3.5,3.5,3.5,5.0,3.5,4.0,3.5,4.5,3.705224,3.500000,3.5


In [27]:
#Guardar al terminar
final_data.to_csv('final_data.csv', index=False)

In [28]:
final_data['mf_svd'] = train_pred_mf
final_data.head()

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating,mf_svd
0,1,1,3.519977,2.0,5.0,4.0,4.0,4.5,3.0,4.0,3.0,5.0,5.0,4.366379,3.954545,4.0,4.306195
1,5,1,3.519977,4.0,5.0,4.0,4.0,5.0,4.0,3.0,3.0,5.0,5.0,3.636364,3.954545,4.0,3.941735
2,7,1,3.519977,4.0,4.0,5.0,4.5,4.0,4.5,4.5,5.0,4.0,3.0,3.230263,3.954545,4.5,4.543556
3,15,1,3.519977,5.0,3.0,4.0,4.0,4.0,3.5,3.0,5.0,3.0,3.0,3.448148,3.954545,2.5,4.67738
4,17,1,3.519977,4.0,5.0,4.0,4.0,4.5,4.0,4.5,5.0,5.0,5.0,4.209524,3.954545,4.5,4.810359


# Preparando la Información de Prueba

In [None]:
# Creando la matriz dispersa
test_sparse_matrix = sparse.csr_matrix((test_data.rating.values, (test_data.userId.values, test_data.movieId.values)))

In [None]:
# Promedio global de todas las peliculas por todos los usuarios

test_Averages = dict()

# Obtener el promedio global de los ratings en nuestro set de entrenamiento
test_global_average = test_sparse_matrix.sum()/test_sparse_matrix.count_nonzero()
test_averages['global'] = test_global_average
test_averages

## Esta repetido!!
#### Obtener el promedio de los usuarios en diccionario (key: user_id/movie_id, value: avg rating)

def get_average_ratings(sparse_matrix, of_users):
    
    # Promedio de ratings de usuarios/axes
    ax = 1 if of_users else 0 # 1 - User axes, 0 - Movie axes
    
    # ".A1" es para convertir Column_Matrix a 1-D numpy array
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    # Matriz booleana de matrices (Si un usuario calificó esa película o no)
    is_rated = sparse_matrix != 0
    # No de raitings de cada usuario o película
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    # max_user y max_movie ids en una matriz dispersa
    u,m = sparse_matrix.shape
    # Crear un diccionario de usuarios y sus ratings promedio
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i] for i in range(u if of_users else m) if no_of_ratings[i] != 0}
    
    # Devovler el diccionario the ratings promedio
    return average_ratings

In [None]:
# Ratings promedio por usuario
test_averages['user'] = get_average_raitings(test_sparse_matrix, od_users=True)
print('\n Rating promedio del usuario 27:', test_averages['user'][27])

In [None]:
# Rating promedio por pelicula
test_averages['movie'] = get_average_raitings(test_sparse_matrix, of_users=False)
print('\n Rating promedio de la pelicula 52:', test_averages['user'][52])

In [None]:
# Obtener usuarios, peliculas y ratings de la meustra de la matriz de entramiento dispersa
test_users, test_movies, test_ratings = sparse.find(test_sparse_matrix)

In [None]:
final_test_data = pd.DataFrame()
count = 0

for (user, movie, rating) in zip(test_users, test_movies, test_ratings):
    start = datetime.now()
    #print(user,movie)
    
    # Ratings de peliculas de "movie" por similaridad de usuarios de "user"
    # Calcular la similaridad de los usuarios de "user"
    user_sim = cosine_similarity(test_sparse_matrix[user], test_sparse_matrix). ravel()
    top_sim_users = user_sim.argsort()[::-1][1:]
    # Obtener los ratings de los usuarios mas similares para la pelicula
    top_ratings = test_sparse_matrix[top_sim_users, movie].toarray().ravel()
    # Vamos a hacer una longitud de "5" añadiendo el promedio de las peliculas
    top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_users_ratings.extend([test_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
    #print(top_sim_users_ratings, end=" ")
    
    # Ratings de usuarios "user" para peliculas similares de "movie"
    # Calcular las peliculas similares de "movie"
    movie_sim = cosine_similarity(test_sparse_matrix[:, movie].T, test_sparse_matrix.T).ravel()
    top_sim_movies = movie_sim.argsort()[::-1][1:]
    # Obtener el rating de las peliculas mas similares calificada por el usuario
    top_ratings = test_sparse_matrix[user, top_sim_movies].toarray().ravel()
    # Vamos a hacerlo una longitud de "5" añadiendo el promedio de los usuarios
    top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_movies_ratings.extend([test_averages['user'][user]]*(5 - len(top_sim_movies_ratings)))
    #print(top_sim_movies_ratings, end=" ")
    
    # Preparar la fila para guardarlo en un archivo
    row = list()
    row.append(user)
    row.append(movie)
    row.append(train_averages['global'])
    row.extend(top_sim_users_ratings)
    row.extend(top_sim_movies_ratings)
    row.append(train_averages['user'][user])
    row.append(train_averages['movie'][movie])

    row.append(rating)
    count = count + 1
    final_data = pd.concat([final_data, pd.DataFrame([row])], ignore_index=True)

    if (count)%10000 == 0:
        # print(','.join(map(str, row)))
        print("Done for {} rows----- {}".format(count, datetime.now() - start))
                
print(datetime.now() - start)

In [None]:
final_test_data.columns = ['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5', 'smr1', 'smr2', 'smr3', 'smr4',
                      'smr5', 'UAvg', 'MAvg', 'rating']