In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'movie-recommender-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4268467%2F7350299%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240913%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240913T202507Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D42fef65701c972cd857d804685082618e133e0371119c8f78950b632b28f366975da87ad15d2fea948b86cbe6240ee63ccfe87d0a562a9a2d4854704cdad9108d8691ac245b6208e4957c167ddc1fdd9c0ad7641c52d284742757654b9f138bc2a5a01a7eb2ae159580d09d681377b3367904dd828adb7d30db97b32723f0375431b2ff261dce787acb7a96fad1fcb4a58dcb78e0bf5f6801fe6ea1193459ac274e02e2d184e76f75057a341375fc41f1a6cabee193254c3fa700bf20b8fd15d16dc3bae7f74c044b8fbf982469e71fcccdfa6a4991c99f46dea97733cf802718e4b4872489f8f3bbdee579df80577862cc1c5c690c41c409bcdaef1eef1e6d9'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded download URL>" entry of
# DATA_SOURCE_MAPPING and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' later in the entry cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent (e.g. chunked transfer); then the bar stays empty
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by path.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; `as tarfile` would rebind the module name.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Movie Recommender system (2023)
Predict user ratings based on the MovieTweetings dataset.

### Assignment
Implement a collaborative filtering algorithm for predicting movie ratings.

#### Evaluation
## Goal
Predict the movie ratings given by Twitter users. Each predicted rating must be a value between 0 and 10.
## Metric
The evaluation metric for the competition (Movie recommender system (2023)) is the Root-Mean-Square Error (RMSE).

#### Competition
* Competition name: Movie recommender system (2023)
* Competition host: Nacho
* Competitor name: Andreeo Gonzalez

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the input directory.
for folder, _subfolders, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movie-recommender-dataset/movies.csv
/kaggle/input/movie-recommender-dataset/raitings_training.csv
/kaggle/input/movie-recommender-dataset/users.csv
/kaggle/input/movie-recommender-dataset/test.csv


In [None]:
# Load the raw competition tables.
DATASET_DIR = '/kaggle/input/movie-recommender-dataset'
movies = pd.read_csv(f'{DATASET_DIR}/movies.csv')
ratings_training = pd.read_csv(f'{DATASET_DIR}/raitings_training.csv')
users = pd.read_csv(f'{DATASET_DIR}/users.csv')
# The test file separates its columns with '-'.
test_data = pd.read_csv(f'{DATASET_DIR}/test.csv', sep='-')

# Give every test row a placeholder rating of 5 (neither "like" nor "dislike").
# The preprocessing step may later overwrite it; rows for movies with no training
# data at all are meant to keep this default. The author notes that this reduced
# the bias these rows were introducing.
test_data['rating'] = 5

# De-duplicate movies on (itemid, title). Sorting by genre with missing values last
# means that, among duplicates, a row that has a genre is the one kept.
# Rows with an empty genre are NOT removed.
movies = (
    movies
    .sort_values('genre', na_position='last')
    .drop_duplicates(subset=['itemid', 'title'], keep='first')
)

In [None]:
def preprocess_and_concat(ratings_training_data, test_data):
    """Fill placeholder ratings for unseen users and append the test rows to the training rows.

    A test row whose ``userid`` never appears in ``ratings_training_data`` gets the
    movie's mean training rating (rounded to 0 decimals), or 5 when the movie has no
    training ratings either. Rows of users already present in the training data keep
    whatever ``rating`` they arrive with.

    Parameters
    ----------
    ratings_training_data : pandas.DataFrame
        Must contain ``userid``, ``itemid`` and ``rating`` columns.
    test_data : pandas.DataFrame
        Must contain ``userid``, ``itemid`` and ``rating`` columns. It is not modified;
        the adjusted ratings are written to a copy.

    Returns
    -------
    pandas.DataFrame
        Training rows followed by the adjusted test rows. Original index labels are
        kept, so labels may repeat.
    """
    # New users are test *users* missing from training (the id sets must both be user ids).
    is_new_user = ~test_data['userid'].isin(ratings_training_data['userid'])

    # Mean rating per movie, looked up for every test row in one vectorised pass.
    mean_by_item = ratings_training_data.groupby('itemid')['rating'].mean()
    # Movies without any training rating map to NaN and fall back to the default of 5.
    new_user_rating = test_data['itemid'].map(mean_by_item).round(0).fillna(5)

    test_rated = test_data.copy()
    # Keep the incoming rating for known users, substitute the fallback for new ones.
    test_rated['rating'] = test_data['rating'].where(~is_new_user, new_user_rating)

    return pd.concat([ratings_training_data, test_rated])

In [None]:
# Combined frame: the training ratings followed by the test rows with their filled-in ratings.
train = preprocess_and_concat(ratings_training, test_data)

In [None]:
# Show the first rows of the combined frame as plain text.
print(train.head())

   userid   itemid  rating
0   20432  2265171     9.0
1   59416  2119532     8.0
2   23025   434409     7.0
3   37262  3064298     7.0
4   65696   462229     9.0


In [None]:
# Distribution of rating values in the combined frame (rating value -> number of rows).
# This does not compute the number of ratings per user.
Counter(train.rating)

Counter({8.0: 198829,
         7.0: 184317,
         5.0: 153787,
         9.0: 116569,
         6.0: 107583,
         10.0: 96991,
         4.0: 25283,
         3.0: 13874,
         1.0: 9672,
         2.0: 8223,
         0.0: 242})

In [None]:
# Give every raw id a dense, ordered index so each user/movie can be stored at,
# and looked up from, its own row of the factor matrices.
def set_ids_by_order(column):
    """Map the distinct values of ``column`` to consecutive integers.

    Indices follow the order of first appearance as reported by ``column.unique()``.

    Returns
    -------
    tuple
        ``(index_by_id, dense_values, n_ids)``: the raw-id -> index dict, a NumPy
        array holding the index of every entry of ``column``, and the number of
        distinct ids.
    """
    distinct_ids = column.unique()
    n_ids = len(distinct_ids)
    # raw id -> position of its first appearance
    index_by_id = dict(zip(distinct_ids, range(n_ids)))
    # Translate the whole column, one entry at a time, into dense indices.
    dense_values = np.array([index_by_id[raw_id] for raw_id in column])
    return index_by_id, dense_values, n_ids

def convert_dataset(dataset):
    """Replace raw user and item ids in ``dataset`` with dense indices, in place.

    The ``userid`` and ``itemid`` columns of the passed frame are overwritten, so the
    caller's frame is modified as well.

    Returns
    -------
    tuple
        ``(dataset, user_ids, n_users, item_ids, n_items)`` where ``user_ids`` and
        ``item_ids`` are the raw-id -> index dicts and ``n_users`` / ``n_items`` the
        number of distinct ids.
    """
    # Users first, then items; each column is read before it is overwritten.
    user_ids, user_index, n_users = set_ids_by_order(dataset['userid'])
    dataset['userid'] = user_index

    item_ids, item_index, n_items = set_ids_by_order(dataset['itemid'])
    dataset['itemid'] = item_index

    return dataset, user_ids, n_users, item_ids, n_items

In [None]:
# Re-index the combined frame; `dataset` is the same object as `train`, now holding dense ids.
dataset, user_ids, n_users, item_ids, n_items = convert_dataset(train)

print(f"{n_users} users and {n_items} items (movies)")
dataset.head()

71259 users and 37704 items (movies)


Unnamed: 0,userid,itemid,rating
0,0,0,9.0
1,1,1,8.0
2,2,2,7.0
3,3,3,7.0
4,4,4,9.0


In [None]:
# Create the factor/embedding matrices, which live in a lower-dimensional space.
def create_vector(n_features, n_factors, seed=None, scale=11):
    """Randomly initialise a factor matrix.

    Parameters
    ----------
    n_features : int
        Number of rows, i.e. number of users or items (movies).
    n_factors : int
        Number of latent factors per row.
    seed : int or None, optional
        When given, values come from a dedicated generator seeded with it, so the
        result is reproducible. When None (default) NumPy's global random state is
        used, exactly as before.
    scale : float, optional
        Upper bound multiplier applied before dividing by ``n_factors``. Defaults to
        11 — presumably chosen relative to the 0-10 rating scale; TODO confirm.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_features, n_factors)`` with values in
        ``[0, scale / n_factors)``.
    """
    shape = (n_features, n_factors)
    if seed is None:
        uniform = np.random.random(shape)
    else:
        uniform = np.random.default_rng(seed).random(shape)
    return scale * uniform / n_factors

# Build a sparse matrix, e.g. [[0,0...2.,0][0,3,0...1,3,0],...]
def generate_matrix(d, rows, cols, by_column="rating"):
    """Build a ``rows`` x ``cols`` CSC sparse matrix from the frame ``d``.

    Entry ``(userid, itemid)`` holds the value of column ``by_column`` for that row
    of ``d``; positions without a row stay empty (zero).
    """
    cell_values = d[by_column].values
    row_index = d['userid'].values
    col_index = d['itemid'].values
    return sparse.csc_matrix((cell_values, (row_index, col_index)), shape=(rows, cols))

In [None]:
# Sparse user x item rating matrix.
Y = generate_matrix(dataset, n_users, n_items)
# Preview only the top-left corner: densifying the full n_users x n_items matrix
# would allocate one float per cell and can exhaust memory.
Y[:5, :5].todense()

matrix([[9., 0., 0., ..., 0., 0., 0.],
        [0., 8., 0., ..., 0., 0., 0.],
        [0., 0., 7., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Prediction step: r_xi = p_x * q_i
def estimate(dataset, P, Q):
    """Store the predicted rating of every row of ``dataset`` in an ``estimation`` column.

    ``P`` holds one factor row per user and ``Q`` one per item. The prediction for a
    row is the sum of the element-wise product of its user and item factor rows.
    The frame is modified in place and also returned.
    """
    item_factors = Q[dataset['itemid']]
    user_factors = P[dataset['userid']]
    dataset['estimation'] = (item_factors * user_factors).sum(axis=1)
    return dataset

# Cost function
def rmse(dataset, P, Q):
    """Root-mean-square error between the stored ratings and the predictions from ``P`` and ``Q``.

    ``P`` holds the user factors and ``Q`` the item factors. Calling this refreshes
    the ``estimation`` column of ``dataset`` as a side effect. The squared error is
    summed over the sparse rating matrix and divided by the number of rows in
    ``dataset``.
    """
    n_users, n_items = P.shape[0], Q.shape[0]
    actual = generate_matrix(dataset, n_users, n_items)
    scored = estimate(dataset, P, Q)
    predicted = generate_matrix(scored, n_users, n_items, 'estimation')
    squared_error = (actual - predicted).power(2).sum()
    return np.sqrt(squared_error / dataset.shape[0])

In [None]:
# Gradient used by gradient descent
def gradient(dataset, P, Q):
    """Gradients of the regularised squared error with respect to ``P`` and ``Q``.

    ``P`` holds the user factors and ``Q`` the item factors. The regularisation
    strength is read from the module-level name ``lmbda``, which must be defined
    before this is called. Refreshes the ``estimation`` column of ``dataset`` as a
    side effect.

    Returns
    -------
    tuple
        ``(U, V)``: the gradient for the user factors and for the item factors.
    """
    n_users, n_items = P.shape[0], Q.shape[0]
    actual = generate_matrix(dataset, n_users, n_items)
    predicted = generate_matrix(estimate(dataset, P, Q), n_users, n_items, 'estimation')

    residual = actual - predicted
    factor = -2 / dataset.shape[0]

    # Gradient for the user factors, with regularisation.
    grad_users = factor * (residual @ Q) + (2 * lmbda * P)
    # Gradient for the item (movie) factors, with regularisation.
    grad_items = factor * (residual.T @ P) + (2 * lmbda * Q)
    return grad_users, grad_items

# Gradient descent driver
def GD(dataset, P, Q, epochs=500, learning_rate=0.01, beta=0.99, log_every=10):
    """Run gradient descent with momentum on the user and item factors.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rating rows, passed through to ``gradient`` and ``rmse``.
    P, Q : numpy.ndarray
        Initial user and item factor matrices. They are not modified in place;
        updated arrays are returned.
    epochs : int
        Number of update steps.
    learning_rate : float
        Step size applied to the smoothed gradients.
    beta : float
        Momentum coefficient: weight of the previous smoothed gradient.
    log_every : int
        Print the training RMSE every this many epochs; 0 or None disables logging.

    Returns
    -------
    tuple
        ``(P, Q)`` after the last update.
    """
    # Momentum state starts at the initial gradient. Momentum keeps an exponential
    # moving average of the gradients to smooth the updates.
    # https://ml-cheatsheet.readthedocs.io/en/latest/optimizers.html#momentum
    grad_value_user, grad_value_item = gradient(dataset, P, Q)

    for epoch in range(1, epochs + 1):
        U, V = gradient(dataset, P, Q)

        grad_value_user = beta * grad_value_user + (1 - beta) * U
        grad_value_item = beta * grad_value_item + (1 - beta) * V

        P = P - learning_rate * grad_value_user
        Q = Q - learning_rate * grad_value_item

        if log_every and epoch % log_every == 0:
            print("Epoch",epoch,":")
            print(" Train - rmse:",rmse(dataset,P,Q))
    return P,Q

In [None]:
# Regularisation strength; read as a module-level name by gradient().
lmbda = 0.00001

# Seed NumPy's global generator so the random factor initialisation, and therefore
# the whole training run, is reproducible.
SEED = 42
np.random.seed(SEED)

# Create the factor/embedding matrices for users and for items.
P = create_vector(n_users, n_factors=6)
Q = create_vector(n_items, n_factors=6)

P, Q = GD(dataset, P, Q, epochs=4000, learning_rate=0.8)

Epoch 10 :
 Train - rmse: 3.2242551137645514
Epoch 20 :
 Train - rmse: 3.1247832759827903
Epoch 30 :
 Train - rmse: 3.03876665922399
Epoch 40 :
 Train - rmse: 2.966500521081557
Epoch 50 :
 Train - rmse: 2.907384222495011
Epoch 60 :
 Train - rmse: 2.860081340073345
Epoch 70 :
 Train - rmse: 2.822742539263396
Epoch 80 :
 Train - rmse: 2.7932535033247943
Epoch 90 :
 Train - rmse: 2.76946953702207
Epoch 100 :
 Train - rmse: 2.7494077224066706
Epoch 110 :
 Train - rmse: 2.7313814193234935
Epoch 120 :
 Train - rmse: 2.714074989234975
Epoch 130 :
 Train - rmse: 2.6965657035068284
Epoch 140 :
 Train - rmse: 2.6783043391003405
Epoch 150 :
 Train - rmse: 2.6590670218980885
Epoch 160 :
 Train - rmse: 2.6388899147028515
Epoch 170 :
 Train - rmse: 2.6179965352946786
Epoch 180 :
 Train - rmse: 2.5967255128910023
Epoch 190 :
 Train - rmse: 2.575464727165933
Epoch 200 :
 Train - rmse: 2.5545960639882193
Epoch 210 :
 Train - rmse: 2.534453420697951
Epoch 220 :
 Train - rmse: 2.5152950677611328
Epoch 23

In [None]:
# Re-index a new data set, keeping only rows that are valid, i.e. whose user and
# item (movie) were both seen during training.
def process_new_data(dataset, user_ids, items_ids):
    """Return the rows of ``dataset`` with known ids, re-indexed to dense indices.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``userid`` and ``itemid`` columns holding raw ids. Not modified.
    user_ids, items_ids : dict
        Raw-id -> dense-index mappings.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose ``userid`` and ``itemid`` are both keys
        of the mappings, with those two columns replaced by the mapped indices.
    """
    known = dataset['userid'].isin(user_ids.keys()) & dataset['itemid'].isin(items_ids.keys())
    # Copy the selection so the assignments below write to an independent frame
    # rather than to a slice of the caller's frame.
    selected = dataset.loc[known].copy()
    selected['userid'] = selected['userid'].map(user_ids).to_numpy()
    selected['itemid'] = selected['itemid'].map(items_ids).to_numpy()
    return selected

In [None]:
# RMSE of the trained factors on the combined frame. `train` already holds dense
# ids because convert_dataset re-indexed it in place.
train_rmse = rmse(train, P, Q)
# NOTE(review): no validation split is evaluated here; a held-out RMSE would need
# a separate validation frame, which this notebook does not build.
print(train_rmse)

1.784410977924797


In [None]:
# Compare stored ratings with the `estimation` column written by the last rmse() call.
train.head(20)

Unnamed: 0,userid,itemid,rating,estimation
0,0,0,9.0,7.73323
1,1,1,8.0,8.331486
2,2,2,7.0,8.626974
3,3,3,7.0,7.007077
4,4,4,9.0,3.996475
5,5,5,7.0,6.887957
6,6,6,6.0,6.259116
7,7,7,5.0,6.416419
8,8,8,10.0,7.599702
9,9,9,6.0,6.537161


In [None]:
# Look up the row for raw user id 8660 and raw item id 2543472 through their dense indices.
# NOTE(review): "27609 361748" looks like another (user, item) pair tried earlier — confirm or remove.
train.loc[(train['userid']==user_ids.get(8660)) & (train['itemid']==item_ids.get(2543472))]

Unnamed: 0,userid,itemid,rating,estimation
1,11335,1210,5.0,7.416649


In [None]:
def estimate_single(user, movie, P, Q):
    """Predicted rating for one user/movie pair.

    ``user`` and ``movie`` are row positions in ``P`` and ``Q`` respectively; the
    result is the dot product of the two selected factor rows.
    """
    movie_factors = Q[movie]
    user_factors = P[user]
    return np.dot(movie_factors, user_factors)

In [None]:
# Build the submission file: one "<userid>-<itemid>" id and its predicted rating per test row.
tests = pd.read_csv('/kaggle/input/movie-recommender-dataset/test.csv',sep='-')
tests['Id'] = tests['userid'].astype(str) + '-' + tests['itemid'].astype(str)

# Translate raw ids to dense indices in one pass; ids missing from the mappings become NaN.
user_index = tests['userid'].map(user_ids)
item_index = tests['itemid'].map(item_ids)
has_factors = (user_index.notna() & item_index.notna()).to_numpy()

# Rows without learned factors get the notebook's neutral default of 5 instead of
# indexing the factor matrices with a missing position.
expected = np.full(len(tests), 5.0)
user_rows = P[user_index[has_factors].to_numpy(dtype=int)]
item_rows = Q[item_index[has_factors].to_numpy(dtype=int)]
# Row-wise dot product of user and item factors, rounded to 4 decimals as before.
expected[has_factors] = np.round(np.sum(user_rows * item_rows, axis=1), 4)
tests['Expected'] = expected

tests[['Id', 'Expected']].to_csv('resultados_estimados.csv', index=False)