In [1]:
import pandas as pd
data = pd.read_csv("ml-latest-small/ratings.csv")
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [2]:
data = data.drop('timestamp', axis=1)
data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# Splitting the data

In [3]:
data.shape

(100836, 3)

In [4]:
train_data = data.iloc[:int(data.shape[0]*0.80)]
test_data = data.iloc[int(data.shape[0]*0.80):]

In [5]:
train_data.shape

(80668, 3)

In [6]:
test_data.shape

(20168, 3)

# Matrix Factorization

In [7]:
from surprise import SVD
import numpy as np
import surprise
from surprise import Reader, Dataset
from scipy import sparse

In [8]:
# Es para especificar como leer el dataframe
# Para nuestro dataframe no necesitamos especificar algo extra...
reader = Reader(rating_scale=(1,5))

# Crear información de entrenamiento del dataframe...
train_data_mf = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

# Construir el trainset de traindara.., It is of dataset format from surprise library..
trainset = train_data_mf.build_full_trainset() 

In [9]:
# Crear un testdata para el dataframe...
test_data_mf = Dataset.load_from_df(test_data[['userId', 'movieId', 'rating']], reader)

# Construir un testset del traindata.., It is of dataset format from surprise library..
testset = test_data_mf.build_full_trainset() 

In [10]:
svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f9628b11c0>

In [11]:
# Guradando las predicciones de entrenamiento y obteniendolas del trainset
train_preds = svd.test(trainset.build_testset())

train_pred_mf = np.array([pred.est for pred in train_preds])

In [12]:
train_pred_mf

array([4.30619518, 3.94173484, 4.54355566, ..., 3.06422925, 3.25430315,
       2.69500677])

In [13]:
# Guardando las predicciones de prueba y obteniendolas del testset
test_preds = svd.test(testset.build_testset())

test_pred_mf = np.array([pred.est for pred in test_preds])

In [14]:
test_pred_mf

array([3.42586544, 3.22903645, 3.02600702, ..., 3.56157074, 3.92178227,
       3.32054949])

# Preparando el data frame de entrenamiento

In [15]:
# Creando una matriz dispersa (sparse matrix)

train_sparse_matrix = sparse.csr_matrix((train_data.rating.values, (train_data.userId.values, train_data.movieId.values)))

In [16]:
#Promedio global de las peliculas por los usuarios
train_averages = dict()

#Obtener el promedio glob al the raitings in nuestro set de entrenamiento
train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average
train_averages

{'global': 3.5199769425298757}

In [17]:
# Obtener el promedio de los usuarios en un diccionario (key: user_id / movie_id, value: abg raiting)

def get_average_raitings(sparse_matrix, of_users):
    
    # Promedio de ratings de usuarios/axes
    ax = 1 if of_users else 0 # 1 - User axes, 0 - Movie axes
    
    # ".A1" es para convertir Column_Matrix a 1-D numpy array
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    # Matriz booleana de matrices (Si un usuario calificó esa película o no)
    is_rated = sparse_matrix != 0
    # No de raitings de cada usuario o película
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    # max_user y max_movie ids en una matriz dispersa
    u,m = sparse_matrix.shape
    # Crear un diccionario de usuarios y sus ratings promedio
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i] for i in range(u if of_users else m) if no_of_ratings[i] != 0}
    
    # Devovler el diccionario the ratings promedio
    return average_ratings

In [18]:
# Raiting promedio obtenido por un usario
train_averages['user'] = get_average_raitings(train_sparse_matrix, of_users=True)
print('\nRating promedio del usuario 25: ', train_averages['user'][25])


Rating promedio del usuario 25:  4.8076923076923075


In [19]:
# Raiting promedio para una película
train_averages['movie'] = get_average_raitings(train_sparse_matrix, of_users=False)
print('\nRating promedio de la película 40: ', train_averages['movie'][40])


Rating promedio de la película 40:  3.5


In [20]:
# Obtener usuarios, películas and ratings de nuestra matriz dispersa de meustras de entrenamiento
train_users, train_movies, train_ratings = sparse.find(train_sparse_matrix)

In [21]:
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity

final_data = pd.DataFrame()
count = 0
start = datetime.now()

for (user, movie, rating)  in zip(train_users, train_movies, train_ratings):
            #print(user, movie)    
            
            # Calificacion de "movie" por similaridad de usuario de "user"
            # Calcular la similaridad de usariores de "user"        
            user_sim = cosine_similarity(train_sparse_matrix[user], train_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # Obtener los ratings de los usuarios más similares para esta película
            top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
            # Vamos a hacer una longitud de "5" agregando el promedio de las peliculas
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_users_ratings.extend([train_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
            # print(top_sim_users_ratings, end=" ")    


            # Ratings de "user" para peliculas similares de "movie"
            # Calcular las peliculas similares de "movie"        
            movie_sim = cosine_similarity(train_sparse_matrix[:,movie].T, train_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # Obtener los ratings de las películas más similares calificados por el usuario
            top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
            # Vamos a hacer una longitud de "5" agregando el promedio de los usuarios
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([train_averages['user'][user]]*(5-len(top_sim_movies_ratings))) 
            # print(top_sim_movies_ratings, end=" : -- ")

            # Preparar la fila para guardarlo en un archivo
            row = list()
            row.append(user)
            row.append(movie)
            # Now add the other features to this data...
            row.append(train_averages['global']) # first feature
            # next 5 features are similar_users "movie" ratings
            row.extend(top_sim_users_ratings)
            # next 5 features are "user" ratings for similar_movies
            row.extend(top_sim_movies_ratings)
            # Avg_user rating
            row.append(train_averages['user'][user])
            # Avg_movie rating
            row.append(train_averages['movie'][movie])

            # finalley, The actual Rating of this user-movie pair...
            row.append(rating)
            count = count + 1
            final_data = pd.concat([final_data, pd.DataFrame([row])], ignore_index=True)
        
            if (count)%10000 == 0:
                # print(','.join(map(str, row)))
                print("Done for {} rows----- {}".format(count, datetime.now() - start))
                
print("Tiempo total tomado: ",datetime.now() - start)

Done for 10000 rows----- 0:02:26.960641
Done for 20000 rows----- 0:04:55.033775
Done for 30000 rows----- 0:07:26.806999
Done for 40000 rows----- 0:09:59.756585
Done for 50000 rows----- 0:12:35.584303
Done for 60000 rows----- 0:15:13.650237
Done for 70000 rows----- 0:17:55.820979
Done for 80000 rows----- 0:20:39.258870
Tiempo total tomado:  0:20:49.842813


In [22]:
final_data.columns = ['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5', 'smr1', 'smr2', 'smr3', 'smr4',
                      'smr5', 'UAvg', 'MAvg', 'rating']

In [23]:
final_data

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating
0,1,1,3.519977,2.0,5.0,4.0,4.0,4.5,3.0,4.0,3.0,5.0,5.0,4.366379,3.954545,4.0
1,5,1,3.519977,4.0,5.0,4.0,4.0,5.0,4.0,3.0,3.0,5.0,5.0,3.636364,3.954545,4.0
2,7,1,3.519977,4.0,4.0,5.0,4.5,4.0,4.5,4.5,5.0,4.0,3.0,3.230263,3.954545,4.5
3,15,1,3.519977,5.0,3.0,4.0,4.0,4.0,3.5,3.0,5.0,3.0,3.0,3.448148,3.954545,2.5
4,17,1,3.519977,4.0,5.0,4.0,4.0,4.5,4.0,4.5,5.0,5.0,5.0,4.209524,3.954545,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80663,184,193581,3.519977,4.0,4.0,4.0,4.0,4.0,5.0,3.5,4.0,3.5,4.5,3.705224,4.000000,4.0
80664,184,193583,3.519977,3.5,3.5,3.5,3.5,3.5,5.0,3.5,4.0,3.5,4.5,3.705224,3.500000,3.5
80665,184,193585,3.519977,3.5,3.5,3.5,3.5,3.5,5.0,3.5,4.0,3.5,4.5,3.705224,3.500000,3.5
80666,184,193587,3.519977,3.5,3.5,3.5,3.5,3.5,5.0,3.5,4.0,3.5,4.5,3.705224,3.500000,3.5


In [24]:
final_data['mf_svd'] = train_pred_mf
final_data.head()

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating,mf_svd
0,1,1,3.519977,2.0,5.0,4.0,4.0,4.5,3.0,4.0,3.0,5.0,5.0,4.366379,3.954545,4.0,4.306195
1,5,1,3.519977,4.0,5.0,4.0,4.0,5.0,4.0,3.0,3.0,5.0,5.0,3.636364,3.954545,4.0,3.941735
2,7,1,3.519977,4.0,4.0,5.0,4.5,4.0,4.5,4.5,5.0,4.0,3.0,3.230263,3.954545,4.5,4.543556
3,15,1,3.519977,5.0,3.0,4.0,4.0,4.0,3.5,3.0,5.0,3.0,3.0,3.448148,3.954545,2.5,4.67738
4,17,1,3.519977,4.0,5.0,4.0,4.0,4.5,4.0,4.5,5.0,5.0,5.0,4.209524,3.954545,4.5,4.810359


In [25]:
#Guardar al terminar
final_data.to_csv('final_data.csv', index=False)

# Preparando la Información de Prueba

In [26]:
# Creando la matriz dispersa
test_sparse_matrix = sparse.csr_matrix((test_data.rating.values, (test_data.userId.values, test_data.movieId.values)))

In [27]:
# Promedio global de todas las peliculas por todos los usuarios

test_averages = dict()

# Obtener el promedio global de los ratings en nuestro set de entrenamiento
test_global_average = test_sparse_matrix.sum()/test_sparse_matrix.count_nonzero()
test_averages['global'] = test_global_average
test_averages

{'global': 3.4278808012693376}

In [28]:
# Ratings promedio por usuario
test_averages['user'] = get_average_raitings(test_sparse_matrix, of_users=True)
print('\n Rating promedio del usuario 524:', test_averages['user'][524])


 Rating promedio del usuario 524: 3.4580152671755724


In [29]:
# Rating promedio por pelicula
test_averages['movie'] = get_average_raitings(test_sparse_matrix, of_users=False)
print('\n Rating promedio de la pelicula 528:', test_averages['user'][528])


 Rating promedio de la pelicula 528: 3.4726027397260273


In [30]:
# Obtener usuarios, peliculas y ratings de la meustra de la matriz de entramiento dispersa
test_users, test_movies, test_ratings = sparse.find(test_sparse_matrix)

In [35]:
final_test_data = pd.DataFrame()
count = 0
start = datetime.now()

for (user, movie, rating) in zip(test_users, test_movies, test_ratings):
    #print(user,movie)
    
    # Ratings de peliculas de "movie" por similaridad de usuarios de "user"
    # Calcular la similaridad de los usuarios de "user"
    user_sim = cosine_similarity(test_sparse_matrix[user], test_sparse_matrix). ravel()
    top_sim_users = user_sim.argsort()[::-1][1:]
    # Obtener los ratings de los usuarios mas similares para la pelicula
    top_ratings = test_sparse_matrix[top_sim_users, movie].toarray().ravel()
    # Vamos a hacer una longitud de "5" añadiendo el promedio de las peliculas
    top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_users_ratings.extend([test_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
    #print(top_sim_users_ratings, end=" ")
    
    # Ratings de usuarios "user" para peliculas similares de "movie"
    # Calcular las peliculas similares de "movie"
    movie_sim = cosine_similarity(test_sparse_matrix[:, movie].T, test_sparse_matrix.T).ravel()
    top_sim_movies = movie_sim.argsort()[::-1][1:]
    # Obtener el rating de las peliculas mas similares calificada por el usuario
    top_ratings = test_sparse_matrix[user, top_sim_movies].toarray().ravel()
    # Vamos a hacerlo una longitud de "5" añadiendo el promedio de los usuarios
    top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_movies_ratings.extend([test_averages['user'][user]]*(5 - len(top_sim_movies_ratings)))
    #print(top_sim_movies_ratings, end=" ")
    
    # Preparar la fila para guardarlo en un archivo
    row = list()
    row.append(user)
    row.append(movie)
    row.append(train_averages['global'])
    row.extend(top_sim_users_ratings)
    row.extend(top_sim_movies_ratings)
    row.append(test_averages['user'][user])
    row.append(test_averages['movie'][movie])

    row.append(rating)
    count = count + 1
    final_test_data = pd.concat([final_test_data, pd.DataFrame([row])], ignore_index=True)

    if (count)%10000 == 0:
        # print(','.join(map(str, row)))
        print("Done for {} rows----- {}".format(count, datetime.now() - start))
                
print(datetime.now() - start)

Done for 10000 rows----- 0:02:03.667843
Done for 20000 rows----- 0:04:09.741937
0:04:11.751508


In [36]:
final_test_data.columns = ['user', 'movie', 'GAvg', 'sur1', 'sur2', 'sur3', 'sur4', 'sur5', 'smr1', 'smr2', 'smr3', 'smr4',
                      'smr5', 'UAvg', 'MAvg', 'rating']

In [37]:
final_test_data

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating
0,514,1,3.519977,4.0,4.00,4.00,4.00,4.00,4.0,4.0,4.0,2.5,5.0,3.311083,3.769231,4.0
1,517,1,3.519977,4.0,4.00,4.00,2.50,4.00,3.5,2.0,3.0,3.5,5.0,2.386250,3.769231,4.0
2,522,1,3.519977,5.0,4.00,4.00,4.00,3.00,3.5,4.0,5.0,5.0,5.0,3.830000,3.769231,3.0
3,524,1,3.519977,5.0,4.00,5.00,3.00,4.00,3.0,3.0,3.0,5.0,5.0,3.458015,3.769231,4.0
4,525,1,3.519977,4.0,3.00,5.00,2.50,4.00,4.0,4.0,4.0,4.5,4.0,3.542000,3.769231,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20163,586,187593,3.519977,3.5,3.75,3.75,3.75,3.75,5.0,3.5,3.5,4.0,5.0,4.365385,3.750000,4.0
20164,514,187595,3.519977,5.0,4.00,4.00,4.00,4.00,3.5,2.0,4.0,2.0,3.0,3.311083,4.000000,3.0
20165,586,187595,3.519977,3.0,4.00,4.00,4.00,4.00,4.0,4.5,4.0,4.0,4.0,4.365385,4.000000,5.0
20166,596,188301,3.519977,4.0,4.00,4.00,4.00,4.00,4.0,4.5,2.5,4.0,4.0,3.495134,4.000000,4.0


In [39]:
final_test_data['mf_svd'] = test_pred_mf

In [42]:
final_test_data.to_csv('final_test_data.csv', index=False)

# Creando el XGBoost
### Optimized distributed gradient  boosting library

In [55]:
def get_error_metrics(y_true, y_pred):
    rmse = np.sqrt(np.mean([(y_true[i] - y_pred[i])**2 for i in range(len(y_pred))]))
    mape = np.mean(np.abs((y_true - y_pred)/y_true)) * 100
    return rmse, mape

In [44]:
# Preparando los datos de entrenamiento
x_train = final_data.drop(['user', 'movie', 'rating'], axis = 1)
y_train = final_data['rating']

In [45]:
# Preparando los datos de prueba
x_test = final_test_data.drop(['user', 'movie', 'rating'], axis = 1)
y_test = final_test_data['rating']

In [49]:
import xgboost as xgb

In [51]:
# Inicializando el modelo XGBoost
xgb_model = xgb.XGBRegressor( n_jobs = 13, random_state = 15, n_estimators = 100)

# Diccionarios para guardar los resultados de entrenamiento y prueba
train_results = dict()
test_results = dict()

# Ajustar el modelo
print('Entrenando el modelo...')
start = datetime.now()
xgb_model.fit(x_train, y_train, eval_metric = 'rmse')
print('Hecho. Tiempo tomado: {}\n'.format(datetime.now() - start))
print('Terminado')

Entrenando el modelo...
Hecho. Tiempo tomado: 0:00:00.178554

Terminado


In [56]:
# Obtener las predicciones del modelo entrenado
print('Evaluando el modelo con train_data')
start = datetime.now()
y_train_pred = xgb_model.predict(x_train)

# Obtener el rmse y mape de la informacion de entrenamiento
rmse_train, mape_train = get_error_metrics(y_train.values, y_train_pred)

# Guardar los resultados en el diccionario train_results
train_results = {'rmse': rmse_train, 'mape': mape_train, 'predictions': y_train_pred}

Evaluando el modelo con train_data


In [57]:
train_results

{'rmse': 0.6749454808339831,
 'mape': 20.440257050215205,
 'predictions': array([4.061627 , 4.1507144, 4.003096 , ..., 3.7413933, 3.7257297,
        4.1561704], dtype=float32)}

In [59]:
# Obtener la data de prediccion de prueba y calcular rl rmse y mape
print('Evaluando Test Data')
y_test_pred = xgb_model.predict(x_test)
rmse_test, mape_test = get_error_metrics(y_true = y_test.values, y_pred = y_test_pred)

# Guardar los resultados en el diccionario test_results
test_results = {'rmse': rmse_test, 'mape': mape_test, 'predictions': y_test_pred}

Evaluando Test Data


In [60]:
test_results

{'rmse': 0.6684609078597152,
 'mape': 19.76102533371047,
 'predictions': array([3.7305498, 3.083081 , 4.153682 , ..., 4.3296375, 4.243002 ,
        3.0668445], dtype=float32)}