## Ajánló rendszerek: Kollaboratív szűrő

In [33]:
import pandas as pd
import numpy as np

### u.user fájl betöltése

In [34]:
# Column labels for the '|'-separated user file; they are passed explicitly
# through `names` when the file is read.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv(
    'u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Preview the first rows of the user table.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### u.item fájl betöltése

In [35]:
# Column labels for the '|'-separated item file: identifier, title, two date
# fields, the IMDb URL, followed by the genre-named columns.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')

# Keep only movie_id and title; every other column is discarded.
movies = movies.loc[:, ['movie_id', 'title']]

movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### u.data fájl betöltése

In [36]:
# Column labels for the tab-separated ratings file.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# Read the ratings and drop the timestamp column, which is not used
# anywhere below.
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
).drop(columns='timestamp')

ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


### Train-teszt szétválasztás

In [37]:
from sklearn.model_selection import train_test_split

# The "features" are the complete rating rows; the label used for
# stratification is the user id, so each user's ratings are divided between
# the two splits in the same proportion.
X = ratings.copy()
y = ratings['user_id']

# 75 % train / 25 % test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

### Függvény, ami kiszámítja az átlagos négyzetes hiba gyökét (RMSE)

In [38]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    The mean squared error is delegated to `mean_squared_error`; this
    function only takes its square root.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

### Az alap értékelést állítsuk 3-ra (kb. ez az átlag)

In [39]:
def baseline(user_id, movie_id):
    """Constant model: predict 3.0 for every (user, movie) pair.

    Both arguments are ignored; they exist so the function has the same
    call signature as the other models passed to `score`.
    """
    constant_prediction = 3.0
    return constant_prediction

### Függvény, ami kiszámítja az RMSE-t adott modellel a teszt halmazon

In [40]:
def score(cf_model):
    """Return the RMSE of `cf_model` on the ratings in `X_test`.

    `cf_model` is called as ``cf_model(user_id, movie_id)`` once for every
    row of `X_test`, in row order, and its return value is used as the
    predicted rating for that row.
    """
    # One prediction per (user, movie) pair of the test split.
    predictions = [
        cf_model(uid, mid)
        for uid, mid in zip(X_test['user_id'], X_test['movie_id'])
    ]
    y_pred = np.array(predictions)

    # Actual ratings for the same rows, in the same order.
    y_true = np.array(X_test['rating'])

    return rmse(y_true, y_pred)


# Try the scoring function out on the constant baseline model.
baseline_rmse = score(baseline)
print("Alap modell RMSE:", baseline_rmse)

Alap modell RMSE: 1.2470926188539486


### Felhasználó alapú kollaboratív szűrés ####################################

In [41]:
# Build the rating matrix from the training split: one row per user_id, one
# column per movie_id, cell = rating. Pairs with no rating are left as NaN.
r_matrix = pd.pivot_table(
    X_train,
    index='user_id',
    columns='movie_id',
    values='rating',
)

### Kollaboratív szűrés az átlagos értékelésekkel

In [42]:
def cf_user_mean(user_id, movie_id):
    """Predict a rating as the plain mean of the movie's ratings in `r_matrix`.

    `user_id` is not used; every user receives the same prediction for a
    given movie. A movie that has no column in `r_matrix` gets 3.0.
    """
    # Movie absent from the rating matrix: fall back to the default rating.
    if movie_id not in r_matrix:
        return 3.0

    # Column mean; missing (NaN) entries are skipped by Series.mean().
    return r_matrix[movie_id].mean()


# Evaluate the mean-rating model on the test split.
user_mean_rmse = score(cf_user_mean)
print('Felhasználói átlagok alapján: ', user_mean_rmse)

Felhasználói átlagok alapján:  1.0234701463131335


### Kollaboratív szűrés súlyozott átlagokkal

In [43]:
# Zero-filled version of the rating matrix: every missing rating becomes 0.
# fillna returns a new frame, so r_matrix itself keeps its NaN entries.
r_matrix_dummy = r_matrix.fillna(0)

### Koszinusz hasonlóság kiszámítása a dummy mátrixon

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the zero-filled
# rating matrix, labelled with the user ids on both axes.
cosine_sim = pd.DataFrame(
    cosine_similarity(r_matrix_dummy),
    index=r_matrix.index,
    columns=r_matrix.index,
)

cosine_sim.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.118076,0.029097,0.011628,0.264677,0.312419,0.308729,0.224269,0.026017,0.286411,...,0.308475,0.055872,0.197862,0.131367,0.152449,0.084456,0.293293,0.056765,0.103536,0.326491
2,0.118076,1.0,0.099097,0.10768,0.034279,0.152789,0.086705,0.078864,0.06894,0.092399,...,0.086927,0.259636,0.289092,0.318824,0.149105,0.186347,0.168034,0.106748,0.136796,0.080358
3,0.029097,0.099097,1.0,0.252131,0.026893,0.062539,0.039767,0.089474,0.078162,0.03767,...,0.040918,0.019031,0.065417,0.055373,0.086503,0.018418,0.096993,0.109631,0.092574,0.018987
4,0.011628,0.10768,0.252131,1.0,0.0,0.045543,0.078812,0.095354,0.059498,0.053879,...,0.024226,0.050703,0.056561,0.107294,0.098892,0.0,0.1329,0.142798,0.097066,0.015176
5,0.264677,0.034279,0.026893,0.0,1.0,0.202843,0.299619,0.163724,0.038474,0.153021,...,0.262547,0.048524,0.048312,0.022202,0.09191,0.066,0.156172,0.115842,0.124297,0.267574
6,0.312419,0.152789,0.062539,0.045543,0.202843,1.0,0.375963,0.131795,0.110944,0.400758,...,0.287549,0.080312,0.162988,0.182856,0.114262,0.09209,0.261859,0.097606,0.206104,0.187637
7,0.308729,0.086705,0.039767,0.078812,0.299619,0.375963,1.0,0.211282,0.107795,0.328923,...,0.290002,0.07417,0.094619,0.084235,0.11562,0.100625,0.233843,0.039199,0.224227,0.296332
8,0.224269,0.078864,0.089474,0.095354,0.163724,0.131795,0.211282,1.0,0.03704,0.183375,...,0.165008,0.066843,0.058766,0.068759,0.087159,0.129381,0.188662,0.121223,0.08391,0.273238
9,0.026017,0.06894,0.078162,0.059498,0.038474,0.110944,0.107795,0.03704,1.0,0.155435,...,0.011708,0.0,0.10171,0.034568,0.045002,0.052699,0.107486,0.055766,0.070065,0.088281
10,0.286411,0.092399,0.03767,0.053879,0.153021,0.400758,0.328923,0.183375,0.155435,1.0,...,0.278558,0.04931,0.153506,0.065471,0.060088,0.033686,0.197107,0.085402,0.118945,0.162538


### Súlyozott felhasználói átlagok alapján kollaboratív szűrés

In [45]:
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean of the movie's ratings.

    Each rating of `movie_id` found in `r_matrix` is weighted by the cosine
    similarity (from `cosine_sim`) between `user_id` and the user who gave it.

    Returns 3.0 when no weighted mean can be formed:
      * the movie has no column in `r_matrix`,
      * the user has no column in `cosine_sim`,
      * the similarities to all raters sum to zero (the weighted mean would
        otherwise be 0/0 = NaN and make the overall RMSE NaN as well).
    """
    default_rating = 3.0

    # Unknown movie or unknown user: nothing to weight.
    if movie_id not in r_matrix or user_id not in cosine_sim:
        return default_rating

    # Ratings of this movie, restricted to the users who actually rated it.
    m_ratings = r_matrix[movie_id].dropna()

    # Similarities between the target user and exactly those raters,
    # selected by user id so both vectors are in the same order.
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    total_sim = sim_scores.sum()

    # `not > 0` (rather than `== 0`) also rejects a NaN total.
    if not total_sim > 0:
        return default_rating

    return np.dot(sim_scores, m_ratings) / total_sim

# Evaluate the similarity-weighted model on the test split.
wmean_rmse = score(cf_user_wmean)
print("Súlyozott átlag modell RMSE:", wmean_rmse)

Súlyozott átlag modell RMSE: 1.0174483808407588


### Demográfiai megközelítés #################################################

In [46]:
# Attach each user's attributes to the training ratings (not to the movies
# table). The join key is named explicitly so the merge does not depend on
# which column names the two frames happen to share.
merged_df = pd.merge(X_train, users, on='user_id')

merged_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,889,684,2,24,M,technician,78704
1,889,279,2,24,M,technician,78704
2,889,29,3,24,M,technician,78704
3,889,190,3,24,M,technician,78704
4,889,232,3,24,M,technician,78704


### Nem szerinti átlagos értékeléseket kiszámítani

In [47]:
# Mean rating of every movie computed separately for each sex; the result is
# a Series indexed by (movie_id, sex).
gender_mean = merged_df.groupby(['movie_id', 'sex'])['rating'].mean()

# Index the users table by user_id so a user's row can be fetched with
# users.loc[user_id]. The guard keeps the cell re-runnable: once user_id has
# moved into the index it is no longer a column, and an unconditional
# set_index('user_id') would raise a KeyError on the second run.
if 'user_id' in users.columns:
    users = users.set_index('user_id')

users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


### Nem alapú kollaboratív szűrés átlagos értékelések felhasználásával

In [49]:
def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean rating among users of the same sex.

    The user's sex is read from `users`, the per-sex means from
    `gender_mean`. Returns 3.0 when the movie has no column in `r_matrix`
    or when `gender_mean` has no entry for the user's sex for this movie.
    """
    # Movie absent from the rating matrix: default rating.
    if movie_id not in r_matrix:
        return 3.0

    # Sex of the requesting user.
    gender = users.loc[user_id, 'sex']

    # Mean ratings of this movie, keyed by sex.
    means_by_sex = gender_mean[movie_id]

    # Use the same-sex mean when it exists, otherwise the default.
    if gender in means_by_sex:
        return means_by_sex[gender]
    return 3.0


# Evaluate the sex-based model on the test split.
gender_rmse = score(cf_gender)
print("Nem alapú kollaboratív szűrés:", gender_rmse)

Nem alapú kollaboratív szűrés: 1.0330308800874282


### Demográfiai megközelítés foglalkozásokkal

In [51]:
# Mean rating per movie for every (occupation, sex) combination: rows are
# movie_id, columns are a two-level (occupation, sex) index.
gen_occ_mean = pd.pivot_table(
    merged_df[['movie_id', 'occupation', 'sex', 'rating']],
    index='movie_id',
    columns=['occupation', 'sex'],
    values='rating',
    aggfunc='mean',
)

gen_occ_mean.head()

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,4.222222,4.25,3.5,3.666667,3.5,3.923077,4.0,3.970588,5.0,...,4.0,4.0,3.5,3.888889,3.833333,3.709091,4.0,4.2,4.166667,3.142857
2,3.0,3.75,,,,,3.25,,3.363636,,...,,,,,2.333333,3.333333,,2.714286,5.0,2.666667
3,3.5,2.5,,,,4.0,2.5,,3.625,,...,,1.0,,,2.0,3.217391,,4.0,,1.0
4,3.0,3.888889,,4.666667,3.0,2.75,3.636364,,3.555556,,...,4.0,3.666667,,3.6,3.285714,3.724138,,3.2,4.25,3.5
5,4.0,2.333333,,,,4.0,1.5,,2.666667,,...,,,,3.5,4.333333,3.272727,,3.333333,4.0,2.666667


### Nem és foglalkozás alapú kollaboratív szűrés

In [53]:
def cf_gen_occ(user_id, movie_id):
    """Predict a rating as the movie's mean among users sharing occupation and sex.

    The user's occupation and sex are read from `users`, the group means
    from `gen_occ_mean`. Returns 3.0 whenever the movie, the occupation, or
    the (occupation, sex) pair is missing from `gen_occ_mean`, or when the
    stored mean is NaN.
    """
    default_rating = 3.0

    # Movie has no row in the table of group means.
    if movie_id not in gen_occ_mean.index:
        return default_rating

    profile = users.loc[user_id]
    occ, gender = profile['occupation'], profile['sex']

    # Fetch the movie's row once; it is keyed by (occupation, sex).
    movie_means = gen_occ_mean.loc[movie_id]

    # Occupation not present among the columns.
    if occ not in movie_means:
        return default_rating

    # Sex not present for this occupation.
    occ_means = movie_means[occ]
    if gender not in occ_means:
        return default_rating

    # The group exists as a column but may hold no rating for this movie.
    rating = occ_means[gender]
    if np.isnan(rating):
        return default_rating
    return rating

# Evaluate the sex-and-occupation model. The label names this model
# explicitly; it previously repeated the label of the sex-only model, which
# made the two printed results indistinguishable.
print("Nem és foglalkozás alapú kollaboratív szűrés:", score(cf_gen_occ))

Nem alapú kollaboratív szűrés: 1.1391976012043645


### Modellalapú megközelítés ################################################

In [54]:
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate, KFold

# The Reader describes how ratings are parsed; the rating scale is spelled
# out here instead of relying on the library default.
reader = Reader(rating_scale=(1, 5))

# load_from_df expects exactly three columns in the order user, item, rating,
# so the columns are selected explicitly.
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

# k-nearest-neighbours model with the library's default settings.
knn = KNNBasic()

# 5-fold cross-validation. The folds are seeded so the reported RMSE / MAE
# are the same on every run of the notebook.
cross_validate(
    knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9856  0.9736  0.9826  0.9787  0.9764  0.9794  0.0043  
MAE (testset)     0.7774  0.7681  0.7759  0.7732  0.7732  0.7735  0.0032  
Fit time          0.47    0.54    0.50    0.53    0.51    0.51    0.03    
Test time         3.55    3.73    3.94    4.12    4.03    3.87    0.21    


{'test_rmse': array([0.98560552, 0.97363968, 0.9826491 , 0.97865998, 0.97643373]),
 'test_mae': array([0.77738166, 0.7680979 , 0.77585142, 0.77316436, 0.77322653]),
 'fit_time': (0.46999168395996094,
  0.5447742938995361,
  0.49503231048583984,
  0.5325360298156738,
  0.5107057094573975),
 'test_time': (3.553853988647461,
  3.726731777191162,
  3.9371209144592285,
  4.124756336212158,
  4.032431364059448)}

### Modellalapú megközelítés: SVD

In [55]:
# Import SVD and the fold iterator used to seed the cross-validation.
from surprise import SVD
from surprise.model_selection import KFold

# SVD starts from randomly initialised factors; fixing random_state makes
# the fitted model, and therefore the scores, reproducible.
svd = SVD(random_state=42)

# 5-fold cross-validation on seeded folds, reporting RMSE and MAE.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9431  0.9358  0.9399  0.9331  0.9379  0.9380  0.0034  
MAE (testset)     0.7422  0.7388  0.7393  0.7358  0.7393  0.7391  0.0020  
Fit time          5.36    5.63    5.28    5.09    6.31    5.53    0.42    
Test time         0.15    0.15    0.15    0.18    0.16    0.16    0.01    


{'test_rmse': array([0.94311107, 0.9358105 , 0.93993707, 0.93314043, 0.93789266]),
 'test_mae': array([0.7422006 , 0.73876513, 0.73928039, 0.73575606, 0.73929247]),
 'fit_time': (5.360079765319824,
  5.631258249282837,
  5.283236503601074,
  5.092748403549194,
  6.30579137802124),
 'test_time': (0.14712738990783691,
  0.14713096618652344,
  0.15366840362548828,
  0.184952974319458,
  0.16278314590454102)}