## Ajánló rendszerek: Kollaboratív szűrő

In [1]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp311-cp311-macosx_10_9_x86_64.whl size=1116819 sha256=aa0720486aa8f33fb0f73110a7c0675a8f3ef0bb07f72f605c758ebaacc98cdb
  Stored in directory: /private/var/folders/0d/rp2vrncs0jbc1bcqpkxlbyzm0000gq/T/pip-ephem-wheel-cache-86v_t549/wheels/f4/2b/26/e2a5eae55d3b7688995e66abe7f40473aac6c95ddd8ee174a8
Successfully built scikit-surprise
Installing collected packages: scikit

In [2]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can also hide NumPy RuntimeWarnings (e.g. an invalid division) raised by
# later cells -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

### u.user fájl betöltése

In [3]:
# Column names assigned to the pipe-separated u.user file
u_cols = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code',
]

# Read the user table, labelling its columns with the names above
users = pd.read_csv(
    'u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Preview the first rows
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### u.item fájl betöltése

In [4]:
# Column names assigned to the pipe-separated u.item file: identifying fields
# first, followed by what are presumably one flag column per genre.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Read the full movie table
movies = pd.read_csv(
    'u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

# Keep only the movie id and the title; every other column is dropped
movies = movies.loc[:, ['movie_id', 'title']]

movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### u.data fájl betöltése

In [5]:
# Column names assigned to the tab-separated u.data file
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]

# Read the ratings table
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# The timestamp is not used by any model below, so drop it
ratings = ratings.drop(columns='timestamp')

ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


### Train-teszt szétválasztás

In [6]:
from sklearn.model_selection import train_test_split

# Features are the full rating rows; the stratification label is the user id,
# so each user's ratings are divided between train and test in the same ratio.
X = ratings.copy()
y = ratings['user_id']

# 75 % train / 25 % test, reproducible through the fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

### Függvény, amely kiszámítja az átlagos négyzetes hiba gyökét (RMSE)

In [7]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between true and predicted ratings.

    NaN entries in either input are counted as 0 (unchanged behaviour), but
    the substitution is made on private float copies, so the caller's arrays
    are never modified. Plain lists and tuples are accepted as well as NumPy
    arrays.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Must have identical, non-empty shapes.

    Returns
    -------
    numpy.float64
        sqrt(mean((y_true - y_pred) ** 2)) after NaN -> 0 substitution.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # np.array(..., dtype=float) always allocates a new array, so the in-place
    # NaN replacement below cannot leak back into the caller's data.
    true_vals = np.array(y_true, dtype=float)
    pred_vals = np.array(y_pred, dtype=float)

    if true_vals.shape != pred_vals.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{true_vals.shape} and {pred_vals.shape}"
        )
    if true_vals.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    true_vals[np.isnan(true_vals)] = 0
    pred_vals[np.isnan(pred_vals)] = 0

    return np.sqrt(np.mean((true_vals - pred_vals) ** 2))

### Az alap értékelést állítsuk 3-ra (kb. ez az átlag)

In [8]:
def baseline(user_id, movie_id):
    """Predict the same constant rating (3.0) for every user/movie pair.

    Both arguments are accepted only so the function has the same call
    signature as the other models; neither is used.
    """
    constant_rating = 3.0
    return constant_rating

### Függvény, amely kiszámítja az RMSE-t az adott modellel a teszthalmazon

In [9]:
def score(cf_model):
    """Evaluate a rating predictor on the held-out test set.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` once for every
    row of the global ``X_test``; the predictions are compared with the
    ``rating`` column of ``X_test`` and the resulting RMSE is returned.
    """
    test_users = X_test['user_id']
    test_movies = X_test['movie_id']

    # One prediction per (user, movie) pair, in test-set row order
    predictions = []
    for user, movie in zip(test_users, test_movies):
        predictions.append(cf_model(user, movie))
    y_pred = np.array(predictions)

    # Ground-truth ratings in the same row order
    y_true = np.array(X_test['rating'])

    return rmse(y_true, y_pred)


# Try the scoring function out with the constant baseline model
print("Alap modell RMSE:", score(baseline))

Alap modell RMSE: 1.2488234462885457


### Felhasználó alapú kollaboratív szűrés ####################################

In [10]:
# Build the rating matrix from the training ratings: one row per user_id,
# one column per movie_id, cells hold the rating (NaN where there is none).
r_matrix = pd.pivot_table(
    X_train,
    index='user_id',
    columns='movie_id',
    values='rating',
)

### Kollaboratív szűrés az átlagos értékelésekkel

In [11]:
def cf_user_mean(user_id, movie_id):
    """Predict a movie's rating as the plain mean of its training ratings.

    ``user_id`` is part of the common model signature but is not used.
    Movies that have no column in ``r_matrix`` get the default rating 3.0.
    """
    # No training rating for this movie -> default rating
    if movie_id not in r_matrix:
        return 3.0

    # Mean of the movie's column (NaN cells are skipped by pandas)
    return r_matrix[movie_id].mean()


# Report the test-set RMSE of the per-movie mean rating model
print('Felhasználói átlagok alapján: ', score(cf_user_mean))

Felhasználói átlagok alapján:  1.0300824802393536


### Kollaboratív szűrés súlyozott átlagokkal

In [12]:
# Zero-filled version of the rating matrix (missing ratings become 0);
# fillna returns a new frame, so r_matrix itself keeps its NaNs.
r_matrix_dummy = r_matrix.fillna(0)

### Koszinusz hasonlóság kiszámítása a dummy mátrixon

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the zero-filled
# rating matrix, labelled on both axes with the user ids.
cosine_sim = pd.DataFrame(
    cosine_similarity(r_matrix_dummy, r_matrix_dummy),
    index=r_matrix.index,
    columns=r_matrix.index,
)

cosine_sim.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.108361,0.046638,0.029577,0.245753,0.335853,0.344724,0.191582,0.057149,0.251979,...,0.257073,0.069412,0.231643,0.108093,0.176842,0.104799,0.232472,0.051528,0.129555,0.256333
2,0.108361,1.0,0.057613,0.130237,0.054918,0.190552,0.079399,0.076146,0.167992,0.147376,...,0.136993,0.252887,0.255454,0.285193,0.232751,0.149088,0.102807,0.062386,0.109143,0.107686
3,0.046638,0.057613,1.0,0.139805,0.0,0.032485,0.043869,0.080968,0.022263,0.059925,...,0.027402,0.0,0.17506,0.010343,0.105635,0.019052,0.127099,0.023917,0.060392,0.0
4,0.029577,0.130237,0.139805,1.0,0.0,0.04519,0.088586,0.199526,0.135013,0.026919,...,0.055392,0.049773,0.076549,0.139382,0.113886,0.0,0.130343,0.077357,0.15789,0.063911
5,0.245753,0.054918,0.0,0.0,1.0,0.176443,0.28186,0.132205,0.03879,0.1342,...,0.183969,0.019305,0.073714,0.041807,0.081088,0.029743,0.188392,0.068342,0.055557,0.207259
6,0.335853,0.190552,0.032485,0.04519,0.176443,1.0,0.394725,0.143385,0.125126,0.372679,...,0.328643,0.070809,0.135806,0.17167,0.125446,0.086464,0.230566,0.095478,0.197307,0.185268
7,0.344724,0.079399,0.043869,0.088586,0.28186,0.394725,1.0,0.215861,0.121224,0.378723,...,0.339853,0.110866,0.096055,0.10469,0.126108,0.075012,0.270071,0.020036,0.236086,0.266571
8,0.191582,0.076146,0.080968,0.199526,0.132205,0.143385,0.215861,1.0,0.116173,0.169088,...,0.150048,0.064242,0.118297,0.053969,0.168057,0.095736,0.164157,0.076269,0.089871,0.210995
9,0.057149,0.167992,0.022263,0.135013,0.03879,0.125126,0.121224,0.116173,1.0,0.152694,...,0.082819,0.0644,0.127051,0.069251,0.095673,0.0,0.131458,0.106763,0.089297,0.089583
10,0.251979,0.147376,0.059925,0.026919,0.1342,0.372679,0.378723,0.169088,0.152694,1.0,...,0.279849,0.087828,0.131888,0.111841,0.094423,0.080883,0.255758,0.063461,0.169309,0.181031


### Súlyozott felhasználói átlagok alapján kollaboratív szűrés

In [14]:
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as a similarity-weighted mean of other users' ratings.

    The training ratings of ``movie_id`` (its column in ``r_matrix``) are
    averaged using, as weights, the cosine similarities between ``user_id``
    and each user who rated the movie (taken from ``cosine_sim``).

    The default rating 3.0 is returned when
      * the movie has no column in ``r_matrix``;
      * the user has no column in ``cosine_sim`` (this used to raise
        ``KeyError``);
      * the similarities to all raters of the movie sum to zero, i.e. the
        weighted mean is undefined (this used to yield NaN from a 0/0
        division, which the RMSE helper then scored as a rating of 0).

    Parameters
    ----------
    user_id : label of the user in ``cosine_sim``.
    movie_id : label of the movie column in ``r_matrix``.

    Returns
    -------
    float
        The predicted rating.
    """
    default_rating = 3.0

    # Cold-start movie or user: nothing to base a prediction on
    if movie_id not in r_matrix or user_id not in cosine_sim:
        return default_rating

    # Ratings actually given to this movie (users without a rating dropped)
    m_ratings = r_matrix[movie_id].dropna()

    # Similarity of the target user to exactly those raters, in the same order
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    # Written as "not > 0" so that a NaN sum also takes the fallback
    total_sim = sim_scores.sum()
    if not total_sim > 0:
        return default_rating

    return np.dot(sim_scores, m_ratings) / total_sim

# Report the test-set RMSE of the similarity-weighted mean model
print("Súlyozott átlag modell RMSE:", score(cf_user_wmean))

Súlyozott átlag modell RMSE: 1.023662431714556


### Demográfiai megközelítés #################################################

In [15]:
# Attach each user's demographic columns to their training ratings
# (with no explicit key, merge joins on the columns the two frames share).
merged_df = X_train.merge(users)

merged_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,862,177,4,25,M,executive,13820
1,862,416,3,25,M,executive,13820
2,862,1093,5,25,M,executive,13820
3,862,168,4,25,M,executive,13820
4,862,568,3,25,M,executive,13820


### Nem szerinti átlagos értékeléseket kiszámítani

In [16]:
# Mean training rating for every (movie_id, sex) pair; the result is a Series
# indexed by a (movie_id, sex) MultiIndex.
gender_mean = merged_df.groupby(['movie_id', 'sex'])['rating'].mean()

# Index the users table by user_id so a user's row can be fetched with
# users.loc[user_id]. The guard keeps the cell idempotent: without it, running
# the cell a second time raises KeyError because 'user_id' is already the index.
if users.index.name != 'user_id':
    users = users.set_index('user_id')

users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


### Nem alapú kollaboratív szűrés átlagos értékelések felhasználásával

In [17]:
def cf_gender(user_id, movie_id):
    """Predict a rating as the mean rating the user's gender gave the movie.

    Looks up the user's ``sex`` in ``users`` and returns the matching entry of
    ``gender_mean`` for ``movie_id``. The default rating 3.0 is returned when
    the movie has no column in ``r_matrix`` or when nobody of that gender
    rated the movie.
    """
    # Movie never seen in training -> default rating
    if movie_id not in r_matrix:
        return 3.0

    # Gender of the user we are predicting for
    gender = users.loc[user_id]['sex']

    # Per-gender mean ratings of this movie
    movie_means = gender_mean[movie_id]

    # No rating from this gender -> default rating
    if gender not in movie_means:
        return 3.0

    return movie_means[gender]


# Report the test-set RMSE of the gender-based model
print("Nem alapú kollaboratív szűrés:", score(cf_gender))

Nem alapú kollaboratív szűrés: 1.0392906999935203


### Demográfiai megközelítés foglalkozásokkal

In [18]:
# Mean training rating of every movie broken down by occupation and gender:
# rows are movie ids, columns form an (occupation, sex) MultiIndex.
gen_occ_mean = pd.pivot_table(
    merged_df[['sex', 'rating', 'movie_id', 'occupation']],
    index='movie_id',
    columns=['occupation', 'sex'],
    values='rating',
    aggfunc='mean',
)

gen_occ_mean.head()

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,3.9375,3.75,5.0,3.4,3.666667,3.25,3.884615,4.0,4.083333,4.0,...,,4.0,3.5,4.0,4.043478,3.796296,4.0,3.75,4.0,3.0
2,3.0,3.666667,,,,4.0,3.5,,3.066667,,...,,,,3.0,2.666667,3.277778,,2.714286,,2.333333
3,3.5,4.0,,,,,2.0,,3.777778,,...,,,,,3.0,3.391304,,4.25,,1.0
4,3.666667,3.6,,4.666667,3.0,2.5,3.8,4.0,3.65,,...,4.0,4.0,,3.4,3.25,3.777778,,3.333333,4.25,3.25
5,4.0,2.333333,,,,4.0,2.333333,,3.5,,...,,,,4.0,4.333333,3.111111,,3.333333,4.0,2.0


### Nem és foglalkozás alapú kollaboratív szűrés

In [19]:
def cf_gen_occ(user_id, movie_id):
    """Predict a rating from the mean of the user's (occupation, gender) group.

    Returns the ``gen_occ_mean`` cell for ``movie_id`` and the user's
    occupation/sex (both read from ``users``). The default rating 3.0 is
    returned when the movie, the occupation or the gender is missing from the
    table, or when the matching cell is NaN.
    """
    default_rating = 3.0

    # Movie never seen in training
    if movie_id not in gen_occ_mean.index:
        return default_rating

    profile = users.loc[user_id]
    gender = profile['sex']
    occ = profile['occupation']

    # All group means for this movie, keyed by (occupation, sex)
    movie_row = gen_occ_mean.loc[movie_id]

    # Occupation has no column at all
    if occ not in movie_row:
        return default_rating

    # Gender missing within this occupation
    by_gender = movie_row[occ]
    if gender not in by_gender:
        return default_rating

    rating = by_gender[gender]

    # The group exists but never rated this movie
    if np.isnan(rating):
        rating = default_rating
    return rating

# Report the test-set RMSE of the gender + occupation model. The label names
# that model explicitly so it cannot be confused with the gender-only score.
print("Nem és foglalkozás alapú kollaboratív szűrés:", score(cf_gen_occ))

Nem alapú kollaboratív szűrés: 1.1419651376788005


## Modellalapú megközelítés: KNN

In [20]:
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate

# The Reader object tells Surprise how to parse rating data coming from
# files or DataFrames (default settings are used here)
reader = Reader()

# Wrap the (user_id, movie_id, rating) DataFrame in a Surprise dataset
data = Dataset.load_from_df(ratings, reader)

# Basic k-nearest-neighbours collaborative filter with default parameters
knn = KNNBasic()

# 5-fold cross-validation of the KNN model, reporting RMSE and MAE per fold
cross_validate(
    knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9732  0.9747  0.9828  0.9861  0.9789  0.9791  0.0048  
MAE (testset)     0.7722  0.7701  0.7752  0.7775  0.7714  0.7733  0.0027  
Fit time          0.11    0.11    0.12    0.11    0.11    0.11    0.01    
Test time         1.26    1.24    1.36    1.23    1.26    1.27    0.04    


{'test_rmse': array([0.97316683, 0.97468178, 0.98283294, 0.98605415, 0.97891678]),
 'test_mae': array([0.77222017, 0.77008393, 0.77524458, 0.77753006, 0.77144246]),
 'fit_time': (0.10642576217651367,
  0.10848283767700195,
  0.12154698371887207,
  0.10788297653198242,
  0.10804510116577148),
 'test_time': (1.2631962299346924,
  1.2407290935516357,
  1.3552000522613525,
  1.2321090698242188,
  1.2577781677246094)}

## Modellalapú megközelítés: SVD

In [21]:
# SVD matrix-factorisation model plus an explicit, seedable fold splitter
from surprise import SVD
from surprise.model_selection import KFold

# SVD starts from random factors and the folds are shuffled randomly, so both
# are seeded (same value as the train/test split) to make the scores
# reproducible under Restart & Run All.
svd = SVD(random_state=42)

# 5-fold cross-validation of the SVD model, reporting RMSE and MAE per fold
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=42),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9352  0.9438  0.9373  0.9367  0.9285  0.9363  0.0049  
MAE (testset)     0.7378  0.7459  0.7371  0.7377  0.7319  0.7381  0.0045  
Fit time          0.42    0.41    0.38    0.39    0.47    0.41    0.03    
Test time         0.05    0.05    0.09    0.05    0.06    0.06    0.02    


{'test_rmse': array([0.9352099 , 0.94381817, 0.93731615, 0.93668339, 0.92847076]),
 'test_mae': array([0.73784023, 0.74590275, 0.73707661, 0.73772121, 0.73192846]),
 'fit_time': (0.4151170253753662,
  0.4133641719818115,
  0.3833925724029541,
  0.386918306350708,
  0.46921205520629883),
 'test_time': (0.05009198188781738,
  0.05044889450073242,
  0.09171390533447266,
  0.050818681716918945,
  0.05616879463195801)}