# Aula 09 - CARS - Exercícios

In [1]:
import pandas as pd
import numpy as np
import zipcodes

### Importar base de dados

In [2]:
import wget
!python3 -m wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!tar -xvzf ml-100k.zip


Saved under ml-100k (19).zip


x ml-100k/
x ml-100k/allbut.pl
x ml-100k/mku.sh
x ml-100k/README
x ml-100k/u.data
x ml-100k/u.genre
x ml-100k/u.info
x ml-100k/u.item
x ml-100k/u.occupation
x ml-100k/u.user
x ml-100k/u1.base
x ml-100k/u1.test
x ml-100k/u2.base
x ml-100k/u2.test
x ml-100k/u3.base
x ml-100k/u3.test
x ml-100k/u4.base
x ml-100k/u4.test
x ml-100k/u5.base
x ml-100k/u5.test
x ml-100k/ua.base
x ml-100k/ua.test
x ml-100k/ub.base
x ml-100k/ub.test


In [3]:
rating = pd.read_csv('./ml-100k/u.data', sep='\t', names=['userId', 'itemId', 'rating', 'timestamp'])
rating.drop('timestamp', axis=1, inplace=True)
rating.head()

Unnamed: 0,userId,itemId,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [4]:
#Load the Movies data
item = pd.read_csv('./ml-100k/u.item', sep="|", encoding='latin-1', header=None)
item.columns = ['itemId', 'title' ,'release','video release date', 'IMDb URL', 'unknown', 'Action', 
                'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
item.drop(columns=['video release date', 'IMDb URL'], inplace=True)

# Get the release year of the movies
item['release'] = pd.to_datetime(item['release'], errors='coerce')  
item['release_year'] = item['release'].dt.year.astype('Int64')

# Add the column decade of the movies calculated using the release date
item['decade'] = (item['release_year'] // 10 * 10)
item['decade'] = item['decade'].apply(lambda x: f"{int(x % 100)}'s" if pd.notna(x) else pd.NA)
item['decade'] = item['decade'].fillna("Unknown")

item = item.melt(id_vars=['itemId', 'title', 'decade'], var_name='genre')
item = item[item.value == 1]
item.drop(columns=['value'], inplace=True)

item.head()

Unnamed: 0,itemId,title,decade,genre
1948,267,unknown,Unknown,unknown
3054,1373,Good Morning (1971),70's,unknown
3365,2,GoldenEye (1995),90's,Action
3367,4,Get Shorty (1995),90's,Action
3380,17,From Dusk Till Dawn (1996),90's,Action


In [5]:
#Load the User data
user = pd.read_csv('./ml-100k/u.user', sep='|', encoding='latin-1', header=None)
user.columns = ['userId', 'age', 'gender', 'occupation', 'zipcode']

# Define and add the age group, based on the user's age, to user df
bins = [0, 12, 17, 50, 100]  # Ranges for the age groups
labels = ['Criança', 'Adolescente', 'Adulto', 'Idoso']  # Corresponding labels
user['age_group'] = pd.cut(user['age'], bins=bins, labels=labels, right=True, include_lowest=True)

user.head()

Unnamed: 0,userId,age,gender,occupation,zipcode,age_group
0,1,24,M,technician,85711,Adulto
1,2,53,F,other,94043,Idoso
2,3,23,M,writer,32067,Adulto
3,4,24,M,technician,43537,Adulto
4,5,33,F,other,15213,Adulto


***Exercício 01:*** Na aula, vimos a implementação e a aplicação do Factorization Machines (FM) com dois tipos de contexto: gêneros dos filmes e profissão do usuário. Verifique o desempenho do RMSE quando mais informações de contexto são incorporadas:
- Década de lançamento do filme: classifique os filmes por década: 60's, 70's, 80's, 90's, etc. 
- Sexo do usuário: M ou F. 
- Faixa etária: classifique os usuários por faixa etária, por exemplo: criança (até 12 anos), adolescente (13 a 17 anos), adulto (18 a 50 anos) e idoso (acima de 51 anos). 
- CEP: utilizar a biblioteca https://www.pythonpool.com/uszipcode-python/ de acordo com o exemplo abaixo

### Obter informações do usuário a partir do CEP (https://www.pythonpool.com/uszipcode-python/)

In [6]:
# uszipcode presented an error, so i used zipcodes
# !pip install zipcodes

### Consultando o estado

In [7]:
# Function to get state based on zipcode
def get_state_from_zipcode(zipcode):
    # Ensure the zipcode is numeric and has 5 digits
    if not zipcode.isdigit() or len(zipcode) != 5:
        return "Invalid"
    
    # Get location data for the given zipcode
    result = zipcodes.matching(zipcode)
    
    # Check if there is a matching result and return the state, otherwise return 'Invalid'
    if result:
        return result[0]['state']
    else:
        return "Invalid"
    
# Example 
get_state_from_zipcode('87711')

'NM'

In [8]:
from math import sqrt

# Calculates the Root Mean Square Error (RMSE) between predicted ratings and real ratings
def rmse(preds, ratings):
    if len(preds) != len(ratings):
        return -1
    sum = 0
    for i in range(len(preds)):
        sum += pow(preds[i] - ratings[i], 2)
    
    return sqrt(sum / len(preds))

# Function to make a prediction using the factorization machine model
def predict(w0, w, v, x):
    pred = w0 
    # Loop through each feature (except the target) to calculate the linear part
    for i in range(len(x)-1):
        pred += w[x[i]]  # Add the individual weight for feature x[i]
        # Calculate the interaction terms between each pair of features
        for j in range(i+1, len(x)-1):
            pred += np.dot(v[x[i]], v[x[j]]) 
    
    # Ensure the prediction is within the bounds (0 to 5)
    if pred < 0:
        pred = 0.0
    if pred > 5:
        pred = 5.0
        
    return pred

# Function to train the Factorization Machine (FM)
def train_fm(train, n_attr, dim=5, reg=0.01, lr=0.001, decay=0.1, miter=15):
    w0 = 0  
    w = np.zeros(n_attr)  
    v = np.random.normal(0.0, 0.1, (n_attr, dim))  
    n_samples = len(train) 
    
    error = []  
    for t in range(miter):
        #print('Iter #', t)
        sq_error = 0  
        for s in range(n_samples):
            pred = predict(w0, w, v, train[s])  # Get prediction for sample 's'
            e_ui = pred - train[s][-1]  # Calculate prediction error 
            sq_error = sq_error + e_ui**2  # Accumulate squared error
            
            # Update global bias using gradient descent
            w0 = w0 - lr * (e_ui + reg * w0)
            
            # Update weights and interaction factors
            for i in range(len(train[s])-1):
                # Update weight for the current feature using gradient descent
                w[train[s][i]] = w[train[s][i]] - lr * (e_ui + reg * w[train[s][i]])
                
                # Update interaction factors for the current feature
                for f in range(dim):
                    sum_v = 0  
                    for j in range(len(train[s])-1):
                        if train[s][i] != train[s][j]:
                            sum_v += v[train[s][j]][f] 
                            
                    # Update the factor for the current feature and dimension 'f'
                    v[train[s][i]][f] = v[train[s][i]][f] - lr * (e_ui * sum_v + reg * v[train[s][i]][f])

        error.append(sqrt(sq_error / n_samples)) 
        lr = lr * (1. / (1. + decay * t))  # Adjust learning rate over time

    return w0, w, v, error 


In [9]:
# Start the mapping for the FM model, mapping the users
users_ext = np.sort(rating.userId.unique())
offset = 0
map_users = {user: idx + offset for idx, user in enumerate(users_ext)}
map_users

{np.int64(1): 0,
 np.int64(2): 1,
 np.int64(3): 2,
 np.int64(4): 3,
 np.int64(5): 4,
 np.int64(6): 5,
 np.int64(7): 6,
 np.int64(8): 7,
 np.int64(9): 8,
 np.int64(10): 9,
 np.int64(11): 10,
 np.int64(12): 11,
 np.int64(13): 12,
 np.int64(14): 13,
 np.int64(15): 14,
 np.int64(16): 15,
 np.int64(17): 16,
 np.int64(18): 17,
 np.int64(19): 18,
 np.int64(20): 19,
 np.int64(21): 20,
 np.int64(22): 21,
 np.int64(23): 22,
 np.int64(24): 23,
 np.int64(25): 24,
 np.int64(26): 25,
 np.int64(27): 26,
 np.int64(28): 27,
 np.int64(29): 28,
 np.int64(30): 29,
 np.int64(31): 30,
 np.int64(32): 31,
 np.int64(33): 32,
 np.int64(34): 33,
 np.int64(35): 34,
 np.int64(36): 35,
 np.int64(37): 36,
 np.int64(38): 37,
 np.int64(39): 38,
 np.int64(40): 39,
 np.int64(41): 40,
 np.int64(42): 41,
 np.int64(43): 42,
 np.int64(44): 43,
 np.int64(45): 44,
 np.int64(46): 45,
 np.int64(47): 46,
 np.int64(48): 47,
 np.int64(49): 48,
 np.int64(50): 49,
 np.int64(51): 50,
 np.int64(52): 51,
 np.int64(53): 52,
 np.int64(54

In [10]:
# Map the movies using the offset equal to the total of users
items_ext = np.sort(rating.itemId.unique())
offset += len(map_users)
map_items = {item: idx + offset for idx, item in enumerate(items_ext)}
map_items

{np.int64(1): 943,
 np.int64(2): 944,
 np.int64(3): 945,
 np.int64(4): 946,
 np.int64(5): 947,
 np.int64(6): 948,
 np.int64(7): 949,
 np.int64(8): 950,
 np.int64(9): 951,
 np.int64(10): 952,
 np.int64(11): 953,
 np.int64(12): 954,
 np.int64(13): 955,
 np.int64(14): 956,
 np.int64(15): 957,
 np.int64(16): 958,
 np.int64(17): 959,
 np.int64(18): 960,
 np.int64(19): 961,
 np.int64(20): 962,
 np.int64(21): 963,
 np.int64(22): 964,
 np.int64(23): 965,
 np.int64(24): 966,
 np.int64(25): 967,
 np.int64(26): 968,
 np.int64(27): 969,
 np.int64(28): 970,
 np.int64(29): 971,
 np.int64(30): 972,
 np.int64(31): 973,
 np.int64(32): 974,
 np.int64(33): 975,
 np.int64(34): 976,
 np.int64(35): 977,
 np.int64(36): 978,
 np.int64(37): 979,
 np.int64(38): 980,
 np.int64(39): 981,
 np.int64(40): 982,
 np.int64(41): 983,
 np.int64(42): 984,
 np.int64(43): 985,
 np.int64(44): 986,
 np.int64(45): 987,
 np.int64(46): 988,
 np.int64(47): 989,
 np.int64(48): 990,
 np.int64(49): 991,
 np.int64(50): 992,
 np.int64

In [11]:
# Map the movies genres
genres_ext = np.sort(item.genre.unique())
offset += len(map_items)
map_genres = {genre: idx + offset for idx, genre in enumerate(genres_ext)}
map_genres

{'Action': 2625,
 'Adventure': 2626,
 'Animation': 2627,
 "Children's": 2628,
 'Comedy': 2629,
 'Crime': 2630,
 'Documentary': 2631,
 'Drama': 2632,
 'Fantasy': 2633,
 'Film-Noir': 2634,
 'Horror': 2635,
 'Musical': 2636,
 'Mystery': 2637,
 'Romance': 2638,
 'Sci-Fi': 2639,
 'Thriller': 2640,
 'War': 2641,
 'Western': 2642,
 'unknown': 2643}

In [12]:
# Map the users occupations
occupation_ext = np.sort(user.occupation.unique())
offset += len(map_genres)
map_occupation = {occup: idx + offset for idx, occup in enumerate(occupation_ext)}
map_occupation

{'administrator': 2644,
 'artist': 2645,
 'doctor': 2646,
 'educator': 2647,
 'engineer': 2648,
 'entertainment': 2649,
 'executive': 2650,
 'healthcare': 2651,
 'homemaker': 2652,
 'lawyer': 2653,
 'librarian': 2654,
 'marketing': 2655,
 'none': 2656,
 'other': 2657,
 'programmer': 2658,
 'retired': 2659,
 'salesman': 2660,
 'scientist': 2661,
 'student': 2662,
 'technician': 2663,
 'writer': 2664}

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(rating, test_size=.2, random_state=2)

In [14]:
# Initialize an empty list to store training data
data_train = []

# Add the mapped values for each attribute to the list
for i, row in train.iterrows():
    occup = map_occupation[user.loc[(user.userId==row['userId']), 'occupation'].iloc[0]]
    genres = [map_genres[l] for l in item.loc[(item.itemId==row['itemId']), 'genre'].tolist()]
    attr = []
    attr.append(map_users[row['userId']])
    attr.append(map_items[row['itemId']])
    attr += genres
    attr.append(occup)
    attr.append(row['rating'])
    data_train.append(attr)
data_train  

[[369, 1865, 2632, 2664, np.int64(4)],
 [120, 1063, 2625, 2639, 2641, 2654, np.int64(2)],
 [654, 1158, 2629, 2638, 2651, np.int64(4)],
 [698, 1263, 2629, 2657, np.int64(3)],
 [151, 1682, 2632, 2638, 2647, np.int64(4)],
 [748, 1563, 2642, 2657, np.int64(3)],
 [659, 1096, 2629, 2662, np.int64(4)],
 [666, 1403, 2632, 2654, np.int64(4)],
 [379, 1112, 2629, 2632, 2638, 2648, np.int64(4)],
 [114, 1894, 2629, 2648, np.int64(5)],
 [714, 1070, 2625, 2640, 2663, np.int64(3)],
 [933, 1073, 2632, 2638, 2648, np.int64(4)],
 [434, 1240, 2625, 2639, 2640, 2648, np.int64(4)],
 [194, 2357, 2625, 2628, 2661, np.int64(1)],
 [662, 1683, 2632, 2640, 2657, np.int64(4)],
 [127, 1780, 2625, 2655, np.int64(5)],
 [416, 1982, 2629, 2638, 2657, np.int64(2)],
 [901, 1129, 2625, 2630, 2632, 2645, np.int64(3)],
 [864, 1041, 2627, 2628, 2636, 2645, np.int64(1)],
 [463, 1457, 2625, 2632, 2641, 2664, np.int64(5)],
 [153, 1426, 2634, 2637, 2662, np.int64(4)],
 [229, 1992, 2632, 2638, 2662, np.int64(3)],
 [345, 1513, 262

In [15]:
# Initialize an empty list to store testing data
data_test = []

# Add the mapped values for each attribute to the list
for i, row in test.iterrows():
    occup = map_occupation[user.loc[(user.userId==row['userId']), 'occupation'].iloc[0]]
    genres = [map_genres[l] for l in item.loc[(item.itemId==row['itemId']), 'genre'].tolist()]
    attr = []
    attr.append(map_users[row['userId']])
    attr.append(map_items[row['itemId']])
    attr += genres
    attr.append(occup)
    attr.append(row['rating'])
    data_test.append(attr)
data_test  

[[156, 1215, 2625, 2630, 2640, 2648, np.int64(5)],
 [404, 2007, 2631, 2641, 2651, np.int64(1)],
 [243, 1492, 2625, 2640, 2663, np.int64(1)],
 [377, 1710, 2626, 2628, 2662, np.int64(4)],
 [278, 1969, 2625, 2658, np.int64(4)],
 [918, 1053, 2629, 2638, 2657, np.int64(4)],
 [21, 992, 2625, 2626, 2638, 2639, 2641, 2664, np.int64(5)],
 [776, 1594, 2629, 2632, 2658, np.int64(5)],
 [767, 1347, 2625, 2626, 2637, 2644, np.int64(4)],
 [0, 952, 2632, 2641, 2663, np.int64(3)],
 [416, 1165, 2632, 2640, 2657, np.int64(5)],
 [496, 1347, 2625, 2626, 2637, 2662, np.int64(3)],
 [457, 1217, 2632, 2638, 2663, np.int64(5)],
 [239, 1295, 2625, 2635, 2639, 2647, np.int64(1)],
 [882, 1042, 2630, 2632, 2640, 2654, np.int64(4)],
 [233, 2272, 2632, 2659, np.int64(3)],
 [95, 2174, 2631, 2645, np.int64(5)],
 [915, 1510, 2625, 2638, 2640, 2648, np.int64(4)],
 [84, 2091, 2632, 2647, np.int64(3)],
 [223, 1686, 2632, 2641, 2647, np.int64(1)],
 [0, 1140, 2640, 2663, np.int64(5)],
 [620, 1093, 2626, 2628, 2629, 2662, np.

In [16]:
# Calculate the total of attributes
n_attributes = max(max(x) for x in data_train) + 1

In [17]:
# Train the FM model
w0, w, v, error = train_fm(data_train, n_attributes)

In [18]:
# Get the real and predicted ratings
ratings = []
preds = []
for sample in data_test:
    ratings.append(sample[-1])
    preds.append(predict(w0, w, v, sample))

In [19]:
# Calculate the RMSE of prediction using FM model
rmse(preds, ratings)

0.9705100814691208

In [20]:
# Map the decade of the movies 
decade_ext = np.sort(item.decade.unique())
offset += len(map_occupation)
map_decade = {decade: idx + offset for idx, decade in enumerate(decade_ext)}
map_decade

{"20's": 2665,
 "30's": 2666,
 "40's": 2667,
 "50's": 2668,
 "60's": 2669,
 "70's": 2670,
 "80's": 2671,
 "90's": 2672,
 'Unknown': 2673}

In [21]:
# Map the gender of the users
gender_ext = np.sort(user.gender.unique())
offset += len(map_decade)
map_gender = {gender: idx+offset for idx, gender in enumerate(gender_ext)}
map_gender

{'F': 2674, 'M': 2675}

In [22]:
# Map the age group of the users
age_group_ext = np.sort(user.age_group.unique())
offset += len(map_gender)
map_age_group = {age_group: idx+offset for idx, age_group in enumerate(age_group_ext)}
map_age_group

{'Adolescente': 2676, 'Adulto': 2677, 'Criança': 2678, 'Idoso': 2679}

In [23]:
# Add the state of the users in the user df
user['state'] = user['zipcode'].apply(lambda x: get_state_from_zipcode(str(x)))

# Map the state of the users
state_ext = np.sort(user.state.unique())
offset += len(map_age_group)
map_state = {state: idx+offset for idx, state in enumerate(state_ext)}
map_state

{'AE': 2680,
 'AK': 2681,
 'AL': 2682,
 'AP': 2683,
 'AR': 2684,
 'AZ': 2685,
 'CA': 2686,
 'CO': 2687,
 'CT': 2688,
 'DC': 2689,
 'DE': 2690,
 'FL': 2691,
 'GA': 2692,
 'HI': 2693,
 'IA': 2694,
 'ID': 2695,
 'IL': 2696,
 'IN': 2697,
 'Invalid': 2698,
 'KS': 2699,
 'KY': 2700,
 'LA': 2701,
 'MA': 2702,
 'MD': 2703,
 'ME': 2704,
 'MI': 2705,
 'MN': 2706,
 'MO': 2707,
 'MS': 2708,
 'MT': 2709,
 'NC': 2710,
 'ND': 2711,
 'NE': 2712,
 'NH': 2713,
 'NJ': 2714,
 'NM': 2715,
 'NV': 2716,
 'NY': 2717,
 'OH': 2718,
 'OK': 2719,
 'OR': 2720,
 'PA': 2721,
 'RI': 2722,
 'SC': 2723,
 'SD': 2724,
 'TN': 2725,
 'TX': 2726,
 'UT': 2727,
 'VA': 2728,
 'VT': 2729,
 'WA': 2730,
 'WI': 2731,
 'WV': 2732,
 'WY': 2733}

In [24]:
# Initialize an empty list to store the new training data
data_train2 = []

# Add the mapped values for each attribute, including the new ones, to the list
for i, row in train.iterrows():
    occup = map_occupation[user.loc[(user.userId==row['userId']), 'occupation'].iloc[0]]
    genres = [map_genres[l] for l in item.loc[(item.itemId==row['itemId']), 'genre'].tolist()]
    decade = map_decade[item.loc[(item.itemId==row['itemId']), 'decade'].iloc[0]]
    gender = map_gender[user.loc[(user.userId==row['userId']), 'gender'].iloc[0]]
    age_group = map_age_group[user.loc[(user.userId==row['userId']), 'age_group'].iloc[0]]
    state = map_state[user.loc[(user.userId==row['userId']), 'state'].iloc[0]]
    attr = []
    attr.append(map_users[row['userId']])
    attr.append(map_items[row['itemId']])
    attr += genres
    attr.append(occup)
    attr.append(decade)
    attr.append(gender)
    attr.append(age_group)
    attr.append(state)
    attr.append(row['rating'])
    data_train2.append(attr)
data_train2 

[[369, 1865, 2632, 2664, 2672, 2675, 2679, 2714, np.int64(4)],
 [120, 1063, 2625, 2639, 2641, 2654, 2672, 2675, 2679, 2681, np.int64(2)],
 [654, 1158, 2629, 2638, 2651, 2671, 2674, 2677, 2696, np.int64(4)],
 [698, 1263, 2629, 2657, 2672, 2675, 2677, 2693, np.int64(3)],
 [151, 1682, 2632, 2638, 2647, 2672, 2674, 2677, 2712, np.int64(4)],
 [748, 1563, 2642, 2657, 2668, 2675, 2677, 2687, np.int64(3)],
 [659, 1096, 2629, 2662, 2670, 2675, 2677, 2726, np.int64(4)],
 [666, 1403, 2632, 2654, 2672, 2675, 2677, 2702, np.int64(4)],
 [379, 1112, 2629, 2632, 2638, 2648, 2671, 2675, 2677, 2706, np.int64(4)],
 [114, 1894, 2629, 2648, 2672, 2675, 2677, 2721, np.int64(5)],
 [714, 1070, 2625, 2640, 2663, 2672, 2675, 2677, 2686, np.int64(3)],
 [933, 1073, 2632, 2638, 2648, 2669, 2675, 2679, 2728, np.int64(4)],
 [434, 1240, 2625, 2639, 2640, 2648, 2672, 2675, 2677, 2696, np.int64(4)],
 [194, 2357, 2625, 2628, 2661, 2672, 2675, 2677, 2686, np.int64(1)],
 [662, 1683, 2632, 2640, 2657, 2672, 2675, 2677, 272

In [25]:
# Initialize an empty list to store the new testing data
data_test2 = []

# Add the mapped values for each attribute, including the new ones, to the list
for i, row in train.iterrows():
    occup = map_occupation[user.loc[(user.userId==row['userId']), 'occupation'].iloc[0]]
    genres = [map_genres[l] for l in item.loc[(item.itemId==row['itemId']), 'genre'].tolist()]
    decade = map_decade[item.loc[(item.itemId==row['itemId']), 'decade'].iloc[0]]
    gender = map_gender[user.loc[(user.userId==row['userId']), 'gender'].iloc[0]]
    age_group = map_age_group[user.loc[(user.userId==row['userId']), 'age_group'].iloc[0]]
    state = map_state[user.loc[(user.userId==row['userId']), 'state'].iloc[0]]
    attr = []
    attr.append(map_users[row['userId']])
    attr.append(map_items[row['itemId']])
    attr += genres
    attr.append(occup)
    attr.append(decade)
    attr.append(gender)
    attr.append(age_group)
    attr.append(state)
    attr.append(row['rating'])
    data_test2.append(attr)
data_test2 

[[369, 1865, 2632, 2664, 2672, 2675, 2679, 2714, np.int64(4)],
 [120, 1063, 2625, 2639, 2641, 2654, 2672, 2675, 2679, 2681, np.int64(2)],
 [654, 1158, 2629, 2638, 2651, 2671, 2674, 2677, 2696, np.int64(4)],
 [698, 1263, 2629, 2657, 2672, 2675, 2677, 2693, np.int64(3)],
 [151, 1682, 2632, 2638, 2647, 2672, 2674, 2677, 2712, np.int64(4)],
 [748, 1563, 2642, 2657, 2668, 2675, 2677, 2687, np.int64(3)],
 [659, 1096, 2629, 2662, 2670, 2675, 2677, 2726, np.int64(4)],
 [666, 1403, 2632, 2654, 2672, 2675, 2677, 2702, np.int64(4)],
 [379, 1112, 2629, 2632, 2638, 2648, 2671, 2675, 2677, 2706, np.int64(4)],
 [114, 1894, 2629, 2648, 2672, 2675, 2677, 2721, np.int64(5)],
 [714, 1070, 2625, 2640, 2663, 2672, 2675, 2677, 2686, np.int64(3)],
 [933, 1073, 2632, 2638, 2648, 2669, 2675, 2679, 2728, np.int64(4)],
 [434, 1240, 2625, 2639, 2640, 2648, 2672, 2675, 2677, 2696, np.int64(4)],
 [194, 2357, 2625, 2628, 2661, 2672, 2675, 2677, 2686, np.int64(1)],
 [662, 1683, 2632, 2640, 2657, 2672, 2675, 2677, 272

In [26]:
# Calculate the total of attributes, including the new ones
n_attributes2 = max(max(x) for x in data_train2) + 1

In [27]:
# Train the FM model with the new attributes
w0, w, v, error = train_fm(data_train2, n_attributes2)

In [28]:
ratings = []
preds = []
for sample in data_test2:
    ratings.append(sample[-1])
    preds.append(predict(w0, w, v, sample))

In [29]:
# Calculate the RMSW, using the new training and test data
rmse(preds, ratings)

0.9285119929713463

***Exercício 02:*** Implemente a abordagem KNN do tipo *pré-filtragem*. A aplicação do contexto será feita antes de se calcular os usuários ou itens mais similares. Por exemplo, ao prever a nota de um item *i* para um usuário que é médico, selecionam-se inicialmente todos usuários com a mesma ocupação. Depois calcula-se a similaridade deles, e por fim, a predição da nota.

A escolha de contextos (relaciondos com usuários, itens ou ambos) é livre.

In [30]:
# TODO

from caserec.recommenders.rating_prediction.itemknn import ItemKNN

# Merge the ratings with the user df, adding the user's occupation to the ratings df
ratings_with_user_occupation = pd.merge(rating, user, on='userId')
ratings_with_user_occupation = ratings_with_user_occupation[['userId', 'itemId', 'rating', 'occupation']]

def predict_rating_with_pre_filtering(user_id, movie_id):
    # Pre-filter users with the same occupation
    user_occupation = user.loc[user['userId'] == user_id, 'occupation'].values[0]
    
    # Filter ratings from users with the same occupation
    filtered_ratings = ratings_with_user_occupation[ratings_with_user_occupation['occupation'] == user_occupation]
    filtered_ratings = filtered_ratings[['userId', 'itemId', 'rating']]

    # Split the filtered ratings into training and testing sets
    train_new, test_new = train_test_split(filtered_ratings, test_size=0.2, random_state=2)
    train_new.to_csv('train_new.dat', index=False, header=False, sep='\t')
    test_new.to_csv('test_new.dat', index=False, header=False, sep='\t')

    # Execute the ItemKNN recommender
    recommender = ItemKNN('train_new.dat', 'test_new.dat', 'rp_iknn.dat', as_similar_first=True)
    recommender.compute(verbose=False, verbose_evaluation=False)

    # Retrieve the prediction for the desired movie and user
    predictions = pd.read_csv('rp_iknn.dat', sep='\t', header=None, names=['userId', 'itemId', 'rating'])
    
    # Filter the prediction for the specific movie and user
    predicted_rating = predictions[(predictions['userId'] == user_id) & (predictions['itemId'] == movie_id)]

    if not predicted_rating.empty:
        return predicted_rating['rating'].values[0], predictions, test_new
    else:
        return np.nan

# Example usage
user_id = 1
movie_id = 182

pred_rating, preds, testnew = predict_rating_with_pre_filtering(user_id, movie_id)
real_rating = rating[(rating['userId'] == user_id) & (rating['itemId'] == movie_id)]['rating'].values[0]

print(f"\nThe real rating from user {user_id} for movie {movie_id} is: {real_rating}")
print(f"\nThe predicted rating from user {user_id} for movie {movie_id} is: {pred_rating}")
print(f"\nRMSE: {rmse(preds[preds['userId'] == user_id]['rating'].tolist(), testnew[testnew.userId == user_id].sort_values(by = 'itemId')['rating'].tolist())}")


The real rating from user 1 for movie 182 is: 4

The predicted rating from user 1 for movie 182 is: 4.162582

RMSE: 1.1426296572591477


***Exercício 03:*** Implemente a abordagem KNN do tipo *pós-filtragem*. A aplicação do contexto será feita após serem calculados os usuários ou itens mais similares e a predição de notas. A pós-filtragem consiste no ajuste da predição de notas de acordo com o contexto. Pode-se usar uma estratégia do tipo nota_predita_contexto = nota_predita * fator_ajuste, em que fator_ajuste é a similaridade entre o contexto do usuário-alvo e de outros usuários que avaliaram o item.

Por exemplo, ao prever a nota de um item *i* para um usuário que é médico, calcula-se a similaridade de todos os usuários inicialmente, depois, a predição da nota. Em seguida, a predição da nota pode ser ajustada como: nota_predita_medico = nota_predita * similaridade('medico', profissoes_item), em que profissoes_item é o conjunto de todas as profissões dos usuários que avaliaram o item *i*, e similaridade pode ser Jaccard.  

A escolha de contextos (relaciondos com usuários, itens ou ambos) é livre.

In [32]:
# Merge the ratings with the user dataframe, adding the user's occupation to the ratings dataframe
ratings_with_user_occupation = pd.merge(rating, user, on='userId')
ratings_with_user_occupation = ratings_with_user_occupation[['userId', 'itemId', 'rating', 'occupation']]

# Calculate the Jaccard similarity between two sets
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    return intersection / union if union != 0 else 0

def predict_rating_with_post_filtering(user_id, movie_id):
    # Pre-filter users with the same occupation
    user_occupation = user.loc[user['userId'] == user_id, 'occupation'].values[0]

    # Filter ratings from all users
    filtered_ratings = ratings_with_user_occupation[['userId', 'itemId', 'rating']]
    
    # Split the filtered ratings into training and testing sets
    train_new, test_new = train_test_split(filtered_ratings, test_size=0.2, random_state=2)
    train_new.to_csv('train_new.dat', index=False, header=False, sep='\t')
    test_new.to_csv('test_new.dat', index=False, header=False, sep='\t')

    # Execute the ItemKNN recommender
    recommender = ItemKNN('train_new.dat', 'test_new.dat', 'rp_iknn.dat', as_similar_first=True)
    recommender.compute(verbose=False, verbose_evaluation=False)

    # Retrieve the prediction for the desired movie and user
    predictions = pd.read_csv('rp_iknn.dat', sep='\t', header=None, names=['userId', 'itemId', 'rating'])
    
    # Filter the prediction for the specific movie and user
    predicted_rating = predictions[(predictions['userId'] == user_id) & (predictions['itemId'] == movie_id)]

    if not predicted_rating.empty:
        # Get the predicted rating
        predicted_value = predicted_rating['rating'].values[0]

        # Get the occupations of users who rated the movie
        user_occupations = ratings_with_user_occupation[ratings_with_user_occupation['itemId'] == movie_id]['occupation'].unique()

        # Calculate the Jaccard similarity between the target user’s occupation and the occupations of users who rated the movie
        occupation_set_target = {user_occupation}
        occupation_set_item_users = set(user_occupations)
        
        similarity_factor = jaccard_similarity(occupation_set_target, occupation_set_item_users)

        # Adjust the predicted rating based on the similarity factor
        adjusted_predicted_rating = predicted_value * similarity_factor

        return adjusted_predicted_rating
    else:
        return np.nan

# Example usage
user_id = 1
movie_id = 182

pred_rating_adjusted = predict_rating_with_post_filtering(user_id, movie_id)
real_rating = rating[(rating['userId'] == user_id) & (rating['itemId'] == movie_id)]['rating'].values[0]

print(f"\nThe real rating from user {user_id} for movie {movie_id} is: {real_rating}")
print(f"\nThe adjusted predicted rating from user {user_id} for movie {movie_id} is: {pred_rating_adjusted}")


The real rating from user 1 for movie 182 is: 4

The adjusted predicted rating from user 1 for movie 182 is: 0.20293145
