### Importar base de dados e instalar framework de recomendação

### Importar bibliotecas

In [None]:
# Fetch the ml-20m-compact archive and unpack it into the working directory;
# later cells read their CSV files from the ./dataset/ folder.
import wget
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
!tar -xvzf ml-20m-compact.tar.gz

In [None]:
import sys
# Install through the interpreter that runs this kernel (sys.executable), so the
# package lands in the same environment the notebook imports from.
!{sys.executable} -m pip install caserecommender

In [None]:
import pandas as pd
import numpy as np
import math

### Explorar Dados

In [None]:
# Load the movies sample from the extracted dataset folder and preview its last rows.
movies = pd.read_csv('./dataset/movies_sample.csv')
movies.tail()

In [None]:
# Load the ratings sample (the userId, movieId and rating columns are used below)
# and preview its first rows.
ratings = pd.read_csv('./dataset/ratings_sample.csv')
ratings.head()

In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many times each rating value occurs, most frequent first
# (value_counts order); the trailing ';' suppresses the Axes repr in the output.
ratings.rating.value_counts().plot(kind='bar', color=['r', 'g', 'y', 'c', 'b']);
plt.show()

In [None]:
# Keep only the three columns used as (user, item, feedback) in the rest of the notebook.
df = ratings[['userId', 'movieId', 'rating']]
df.tail()

In [None]:
# Attach each movie's title. With no `on=` argument, merge joins on the columns the
# two frames share (here only movieId) using pandas' default inner join.
df = df.merge(movies[['movieId', 'title']])
df.head()

### Número de usuários e número de itens

In [None]:
# Gather the three headline counts first, then fill them into the report template:
# distinct users, distinct movies, and total number of rating rows.
summary_counts = (
    df['userId'].nunique(),
    df['movieId'].nunique(),
    len(df),
)
print("""
Número de usuários: {}
Número de itens: {}
Número de interações: {}
""".format(*summary_counts))

### Mapeamento em idx

In [None]:
# Build lookup tables from the dataset's original (external) ids to consecutive
# 0-based internal indices, numbered in order of first appearance in df.
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}

In [None]:
# Replace the external ids in df with the internal indices built above.
# NOTE: this overwrites the columns in place, so the cell is not idempotent:
# running it a second time would look up the already-mapped internal ids in the
# external-id tables. Re-run the notebook from the df construction if needed.
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)
df.head()

In [None]:
# Lookup table from internal movie id to title.
# Built in one pass over the two columns instead of a row-by-row iterrows() loop,
# which constructs a Series object for every rating row. As with the loop, when a
# movie id appears on several rows the last row's title is the one kept.
map_title = dict(zip(df['movieId'], df['title']))

# Preview a handful of entries rather than dumping the whole dictionary.
dict(list(map_title.items())[:5])

In [None]:
# Sanity check: number of titles in the lookup and the title stored for internal id 100.
print(len(map_title))
print(map_title[100])

In [None]:
# Number of interactions per user: count() reports, for each userId group,
# the non-null entries in every remaining column.
df.groupby('userId').count()

### Divisão do dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the
# split is reproducible.
train, test = train_test_split(df, test_size=.2, random_state=2)
# Write both splits as tab-separated files with no header and no index column.
# Every column of df is written, in order: userId, movieId, rating, title.
train.to_csv('train.txt', index=False, header=False, sep='\t')
test.to_csv('test.txt', index=False, header=False, sep='\t')

In [None]:
# List the working directory to confirm that train.txt and test.txt were written.
! ls -l

In [None]:
# Distinct movies and users that ended up in the training split.
print('No. items: ' + str(train['movieId'].nunique()))
print('No. users: ' + str(train['userId'].nunique()))

### Recomendadores

## Prever notas

In [None]:
from caserec.recommenders.rating_prediction.most_popular import MostPopular

In [None]:
# Run the MostPopular rating-prediction baseline on the two split files; the third
# argument names the output file that the next cell reads.
# NOTE(review): argument meaning taken from the call site - confirm against the caserec docs.
MostPopular('train.txt', 'test.txt', 'out_mp_pred.txt').compute()

In [None]:
# Load the predictions file as a tab-separated table without a header row,
# naming its three columns userId, movieId and pred.
df_pred = pd.read_csv('out_mp_pred.txt', sep='\t', names=['userId', 'movieId', 'pred'])
df_pred.head(10)

In [None]:
# Test-set rows of the user with internal id 1, for comparison with the predictions.
test[test.userId == 1]

In [None]:
# Join predictions with the held-out rows on their shared columns (userId, movieId),
# so each prediction sits next to the true rating and the title.
df_pred = df_pred.merge(test)

In [None]:
# Distribution of the true ratings among the rows that received a prediction.
df_pred.rating.value_counts().plot(kind='bar')
plt.show()

In [None]:
# Snap each prediction to the nearest multiple of 0.5 (double, round, halve) -
# presumably to match the half-point rating scale - then plot the distribution
# of the rounded predictions.
df_pred['pred'] = round(df_pred['pred']*2)/2
df_pred.pred.value_counts().plot(kind='bar');
plt.show()

### Top N

In [None]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular as MPR

In [None]:
# Run the MostPopular item-recommendation (ranking) baseline and write its output
# to out_mp_pred_binary.txt.
# NOTE(review): as_binary=True presumably makes it rank by interaction counts
# rather than rating values - confirm against the caserec docs.
MPR('train.txt', 'test.txt', 'out_mp_pred_binary.txt', as_binary=True).compute()

In [None]:
# Load the ranking file (tab-separated, no header) and add a readable title for
# each recommended movie using the internal-id -> title lookup.
ranking = pd.read_csv('out_mp_pred_binary.txt', sep='\t', names=['userId', 'movieId', 'score'])
ranking['title'] = ranking.movieId.map(map_title)
ranking.head(15)

### Item KNN

In [None]:
from caserec.recommenders.rating_prediction.itemknn import ItemKNN

# Run the ItemKNN rating-prediction recommender on the same split and write its
# predictions to rp_iknn.txt.
# NOTE(review): the effect of as_similar_first=True is not visible here - confirm
# against the caserec docs.
ItemKNN('train.txt', 'test.txt', 'rp_iknn.txt', as_similar_first=True).compute()

## Exercises

In [None]:
# `typing.Any` is the "any type" annotation; the lowercase builtin `any` is a
# function, not a type, so it must not be used in annotations.
from typing import Any

# Answers: maps a question label (e.g. '01-a') to the answer computed for it.
answers : dict[str, Any] = dict()

# Auxiliary values: intermediate objects used while answering the questions.
aux : dict[str, Any] = dict()

**Exercício 01:**

a) Encontrar o id do usuário que avaliou mais filmes na base. Informe o id externo (da base dados) e o interno (usado pelo recomendador)

In [None]:
# The most frequent userId in the raw ratings is the user with the most ratings
# (mode() returns the tied modes sorted, so [0] picks the smallest id on a tie).
answers['01-a'] = {'external-id': ratings['userId'].mode()[0]}
# Translate the dataset (external) id into the recommender's internal index.
answers['01-a']['internal-id'] = map_users[answers['01-a']['external-id']]
    
print(f'01.a) External ID: {answers["01-a"]["external-id"]}')
print(f'01.a) Internal ID: {answers["01-a"]["internal-id"]}')

b) Encontrar os 5 primeiros usuários que têm maior propensão a dar notas baixas aos filmes (use a média de ratings para isso).

**Note: using only internal id from here onwards.**

In [None]:
# The five users with the lowest mean rating, i.e. those most inclined to rate low.
# `df` is used instead of `ratings` because its userId column already holds the
# internal ids, which is what the notebook reports from this point on.
answers['01-b'] = (df
        .groupby('userId')['rating'] # Grouping by user and selecting the rating column
        .mean() # Mean rating given by each user
        .sort_values(ascending=True) # Lowest means first
        .head(5) # First 5 rows by position (unambiguous on an integer index)
        .index # Getting the user ids
        .tolist() # Transforming to list
)

print(f'01.b) {answers["01-b"]}')

c) Encontrar os 5 filmes pior avaliados pelos usuários.

In [None]:
# The five movies with the lowest mean rating.
# `df` is used instead of `ratings` so the reported movie ids are the internal
# ones, consistent with map_title and with the rest of the exercises.
answers['01-c'] = (df
        .groupby('movieId')['rating'] # Grouping by movie and selecting the rating column
        .mean() # Mean rating received by each movie
        .sort_values(ascending=True) # Lowest means first
        .head(5) # First 5 rows by position (unambiguous on an integer index)
        .index # Getting the movie ids
        .tolist() # Transforming to list
)

print(f'01.c) {answers["01-c"]}')

**Exercício 02:** Na aula vimos uma abordagem de recomendação não personalizada que utiliza a quantidade de interações dos itens para recomendar filmes mais populares aos usuários. Quando há feedback explícito, entretanto, a mesma abordagem "Most Popular" pode fazer uso da média das notas disponíveis, ou seja, recomendam-se os N filmes mais bem avaliados a cada usuário. Desse modo, calcule a recomendação de 5 filmes para um usuário qualquer da base considerando essa estratégia não personalizada. Utilize a base toda como conjunto de treinamento.

In [None]:
# Non-personalized "best rated" ranking: one row per movie holding its mean
# rating, ordered best first, with the title attached for readability.
aux['02-df'] = (
    df.groupby('movieId')['rating']
    .mean()
    .rename('ratings')
    .reset_index()
    .sort_values(by='ratings', ascending=False)
    .assign(title=lambda ranked: ranked['movieId'].map(map_title))
)

# The top five rows are the recommendation for any user.
aux['02-df'].head(5)

**Exercício 03:** Com base no código disponível no notebook "Aula01_Exemplos.ipynb", implemente uma função que retorna os k filmes mais similares a um outro qualquer passado como parâmetro. 

In [None]:
def get_users_ids(movieId : int) -> list[int]:
    """Return the ids of all users who rated the given movie.

    The ids come back in the row order of ``df``; a movie that does not occur
    in ``df`` yields an empty list.
    """
    raters = df.loc[df['movieId'] == movieId, 'userId']
    return raters.tolist()

# Example
get_users_ids(1)[:5]

In [None]:
def get_movie_mean(movieId : int) -> float:
    """Return the mean of every rating recorded in ``df`` for the given movie.

    The result is NaN when the movie has no rating rows.
    """
    is_target_movie = df['movieId'] == movieId
    return df.loc[is_target_movie, 'rating'].mean()

# Example
get_movie_mean(1)

In [None]:
def get_rating(userId : int, itemId : int) -> float:
    """Return the rating a user gave to a movie, or 0 if there is none.

    When several rows match the (user, movie) pair, the first one in ``df``
    is used.
    """
    matching = df.loc[(df['userId'] == userId) & (df['movieId'] == itemId), 'rating']
    if matching.empty:
        return 0
    return matching.values[0]

# Example
get_rating(0, 1)

In [None]:
def movie_sim_score(movieId1 : int, movieId2 : int) -> float:
    '''
    Mean-centered cosine similarity between two movies.

    movieId1 & movieId2 : internal ids of the two movies to compare.

    Only users who rated both movies contribute. Each rating is centered on
    its movie's mean rating (the mean over every rating of that movie in df,
    not only over the common users).

    Returns
        0.0            if no user rated both movies;
        float('nan')   if the score is undefined because the common users'
                       centered ratings of one movie are all zero;
        otherwise the similarity score.
    '''

    def _ratings_by_user(movie_id):
        # All rating rows of one movie, the movie's mean over those rows, and a
        # userId-indexed Series holding one rating per user (first row wins).
        rows = df.loc[df['movieId'] == movie_id, ['userId', 'rating']]
        mean = rows['rating'].mean()
        per_user = rows.drop_duplicates('userId').set_index('userId')['rating']
        return per_user, mean

    ratings1, movie1_mean = _ratings_by_user(movieId1)
    ratings2, movie2_mean = _ratings_by_user(movieId2)

    # Inner alignment keeps exactly the users present in both Series, in the
    # same order on both sides, so position k refers to the same user.
    common1, common2 = ratings1.align(ratings2, join='inner')

    # TRIVIAL CASE: nobody rated both movies.
    if len(common1) == 0:
        return 0.0

    # Centered ratings as plain arrays; computed once for all common users
    # instead of scanning df again for every (user, movie) pair.
    dev1 = common1.to_numpy() - movie1_mean
    dev2 = common2.to_numpy() - movie2_mean

    numerator = float((dev1 * dev2).sum())
    denominator = (math.sqrt(float((dev1 ** 2).sum()))
                   * math.sqrt(float((dev2 ** 2).sum())))

    # Explicit check instead of a bare `except:` so that only the undefined
    # 0/0-style case maps to NaN and real errors are not silently hidden.
    if denominator == 0:
        return float('nan')
    return numerator / denominator

movie_sim_score(0, 4)

In [None]:
def most_similar_movies(movieId : int, k : int | None = None) -> list[tuple[float, int]]:
    '''
    userId : Targeted User
    k : qtde de vizinhos
    '''

    # Get a list of all movie ids exept the target movie
    movies_ids = df[df['movieId'] != movieId]['movieId'].unique()
    
    # Getting a list of similarity scores for each movie
    # Note: Here we store the tuples (similarity, movieId)
    sim = [(movie_sim_score(movieId, m), m) for m in movies_ids]
    
    # Sorting the list of similarities in descending order
    sim.sort(reverse=True)
    
    # Returning the k most similar items
    return sim if k is None else sim[:k]



# Example
most_similar_movies(0, 3)

In [None]:
# Save the userId, movieId and rating columns of df (internal ids) to a CSV file,
# with a header row and without the index column.
df[['userId', 'movieId', 'rating']].to_csv('ratings-exercicios.csv', index=False)

In [None]:
movie_sim_score(0, 4)

In [None]:
# Prepare an empty movie-by-movie similarity matrix
import numpy as np

# Sorted list of the distinct (internal) movie ids present in df
movie_ids = df['movieId'].unique().tolist()
movie_ids.sort()
num_movies = len(movie_ids)

# Start from all zeros: one row and one column per movie
similarity_matrix = np.zeros((num_movies, num_movies))

In [42]:
# Calculate the similarity between all pairs of distinct movies.
# The score formula is symmetric in its two arguments, so each unordered pair is
# computed once (j < i) and written to both [i, j] and [j, i]; previously only
# the lower triangle was filled and the upper half stayed at zero.
# The diagonal (a movie compared with itself) is left at its initial 0.
for i in range(num_movies):
    for j in range(i):
        pair_score = movie_sim_score(movie_ids[i], movie_ids[j])
        similarity_matrix[i, j] = pair_score
        similarity_matrix[j, i] = pair_score

# similarity_matrix now holds the similarity of every pair of distinct movies
similarity_matrix

In [None]:
# Same matrix as nested lists: row i holds the similarity of movie i to every
# movie in movie_ids (including itself). The inner loop must iterate over
# movie_ids; iterating over the integer id `i` raises TypeError.
similarity_matrix_v2 = [
    [movie_sim_score(i, j) for j in movie_ids]
    for i in movie_ids
]

similarity_matrix_v2