Przypadki testowe:

dla małej i dużej liczby filmów

dla małej i dużej liczby użytkowników

In [18]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
from time import time
import psutil
from surprise import SVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import os

ml_rating_path = 'dane/ml-20m/ml-20m/ratings.csv'
ml_movies_path = 'dane/ml-20m/ml-20m/movies.csv'
ml_tags_path = 'dane/ml-20m/ml-20m/tags.csv'

# Funkcja do ograniczenia liczby filmów
def reduce_movies(data, movie_count):
    sampled_movies = data['movieId'].unique()[:movie_count]
    reduced_data = data[data['movieId'].isin(sampled_movies)]
    return reduced_data

# Funkcja do obliczania metryk
def calculate_metrics(predictions, threshold=4.0):
    y_true = []
    y_pred = []
    for pred in predictions:
        true_rating = pred.r_ui
        predicted_rating = pred.est
        y_true.append(1 if true_rating >= threshold else 0)
        y_pred.append(1 if predicted_rating >= threshold else 0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return precision, recall, f1

In [19]:
#Eksperyment: Algorytm Collaborative filtering oparty na pamięci

# Parametry eksperymentu
user_sizes = [50, 1000]  # Liczba użytkowników do testowania
movie_sizes = [100, 1000, 5000]  # Liczba filmów do testowania

# Wczytanie danych z MovieLens
df = pd.read_csv(ml_rating_path)
reader = Reader(rating_scale=(0.5, 5.0))

# Eksperyment
results = []
for user_size in user_sizes:
    # Ogranicz liczbę użytkowników
    sampled_users = df['userId'].unique()[:user_size]
    filtered_data = df[df['userId'].isin(sampled_users)]
    
    for movie_size in movie_sizes:
        print(f"User Size: {user_size}, Movie Count: {movie_size}")
        reduced_data = reduce_movies(filtered_data, movie_size)
        
        # Przygotowanie danych do modelu
        data = Dataset.load_from_df(reduced_data[['userId', 'movieId', 'rating']], reader)
        trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
        
        # Tworzenie modelu Collaborative Filtering opartego na użytkownikach
        sim_options = {'name': 'cosine', 'user_based': True}
        model = KNNBasic(sim_options=sim_options)
        
        # Pomiar czasu i zasobów
        start_time = time()
        model.fit(trainset)
        predictions = model.test(testset)
        end_time = time()
        
        # Obliczenie metryk
        precision, recall, f1 = calculate_metrics(predictions)
        
        # Pomiar zużycia pamięci
        process = psutil.Process(os.getpid())
        memory_usage = process.memory_info().rss / (1024 * 1024)  # Pamięć w MB
        
        # Zapis wyników
        results.append({
            'user_size': user_size,
            'movie_count': movie_size,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'execution_time (s)': end_time - start_time,
            'memory_usage (MB)': memory_usage
        })

# Wyświetlenie wyników
results_df = pd.DataFrame(results)
print(results_df)

User Size: 50, Movie Count: 100
Computing the cosine similarity matrix...
Done computing similarity matrix.
User Size: 50, Movie Count: 1000
Computing the cosine similarity matrix...
Done computing similarity matrix.
User Size: 50, Movie Count: 5000
Computing the cosine similarity matrix...
Done computing similarity matrix.
User Size: 1000, Movie Count: 100
Computing the cosine similarity matrix...
Done computing similarity matrix.
User Size: 1000, Movie Count: 1000
Computing the cosine similarity matrix...
Done computing similarity matrix.
User Size: 1000, Movie Count: 5000
Computing the cosine similarity matrix...
Done computing similarity matrix.
   user_size  movie_count  precision    recall  f1_score  time (s)  \
0         50          100   0.725275  0.628571  0.673469  0.062355   
1         50         1000   0.643750  0.448802  0.528883  0.011994   
2         50         5000   0.649165  0.379358  0.478873  0.015961   
3       1000          100   0.740245  0.575639  0.647647  0.69

In [16]:
#Eksperyment: Algorytm Collaborative filtering oparty na modelu

# Parametry eksperymentu
user_sizes = [50, 1000, 5000]  # Liczba użytkowników do testowania
movie_sizes = [100, 1000, 5000]  # Liczba filmów do testowania

# Wczytanie danych z MovieLens
df = pd.read_csv(ml_rating_path)
reader = Reader(rating_scale=(0.5, 5.0))

# Eksperyment
results = []
for user_size in user_sizes:
    # Ogranicz liczbę użytkowników
    sampled_users = df['userId'].unique()[:user_size]
    filtered_data = df[df['userId'].isin(sampled_users)]
    
    for movie_size in movie_sizes:
        print(f"User Size: {user_size}, Movie Count: {movie_size}")
        reduced_data = reduce_movies(filtered_data, movie_size)
        
        # Przygotowanie danych do modelu
        data = Dataset.load_from_df(reduced_data[['userId', 'movieId', 'rating']], reader)
        trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
        
        # Tworzenie modelu Collaborative Filtering opartego na SVD
        model = SVD()
        
        # Pomiar czasu i zasobów
        start_time = time()
        model.fit(trainset)
        predictions = model.test(testset)
        end_time = time()
        
        # Obliczenie metryk
        precision, recall, f1 = calculate_metrics(predictions)
        
        # Pomiar zasobów (pamięci w MB)
        process = psutil.Process(os.getpid())
        memory_usage = process.memory_info().rss / (1024 * 1024)  # Pamięć w MB
        
        # Zapis wyników
        results.append({
            'user_size': user_size,
            'movie_count': movie_size,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'execution_time (s)': end_time - start_time,
            'memory_usage (MB)': memory_usage  # Zużycie pamięci w MB
        })

# Wyświetlenie wyników
results_df = pd.DataFrame(results)
print(results_df)

User Size: 50, Movie Count: 100
User Size: 50, Movie Count: 1000
User Size: 50, Movie Count: 5000
User Size: 1000, Movie Count: 100
User Size: 1000, Movie Count: 1000
User Size: 1000, Movie Count: 5000
User Size: 5000, Movie Count: 100
User Size: 5000, Movie Count: 1000
User Size: 5000, Movie Count: 5000
   user_size  movie_count  precision    recall  f1_score      time  \
0         50          100   0.786667  0.561905  0.655556  0.054850   
1         50         1000   0.709302  0.398693  0.510460  0.043870   
2         50         5000   0.789916  0.262204  0.393717  0.061828   
3       1000          100   0.824080  0.572605  0.675703  0.107659   
4       1000         1000   0.814847  0.395926  0.532914  0.667097   
5       1000         5000   0.820089  0.351249  0.491841  1.516564   
6       5000          100   0.840692  0.577434  0.684628  0.706231   
7       5000         1000   0.838306  0.441986  0.578805  3.643217   
8       5000         5000   0.833353  0.404854  0.544960  7.0557

User Size: 10, Movie Size: 50
User Size: 10, Movie Size: 1000
User Size: 10, Movie Size: 10000
   user_size  movie_size  precision    recall  f1_score       time  \
0         10          50      0.000  0.000000  0.000000   0.005981   
1         10        1000      0.000  0.000000  0.000000   0.110673   
2         10       10000      0.002  0.013514  0.003484  15.964484   

   memory_usage  cpu_usage  
0          52.8        4.2  
1          52.8       35.7  
2          52.8        0.0  


In [20]:
#Eksperyment: Algorytm Content-based filtering

from sklearn.model_selection import train_test_split

ml_rating_path = 'dane/ml-20m/ml-20m/ratings.csv'
ml_movies_path = 'dane/ml-20m/ml-20m/movies.csv'
ml_tags_path = 'dane/ml-20m/ml-20m/tags.csv'

# Wczytanie danych
ratings = pd.read_csv(ml_rating_path)
movies = pd.read_csv(ml_movies_path)
tags = pd.read_csv(ml_tags_path)

# Przygotowanie cech filmów: gatunki + tagi
movies['genres'] = movies['genres'].fillna('')
tags['tag'] = tags['tag'].fillna('')

# Grupowanie tagów według movieId i łączenie ich w jeden ciąg
tags_combined = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Łączenie danych
movies = movies.merge(tags_combined, on='movieId', how='left')

# Tworzenie cechy 'features' (gatunki + tagi)
movies['features'] = movies['genres'] + ' ' + movies['tag']
movies['features'] = movies['features'].fillna('')  # Uzupełnianie brakujących wartości

# Filtrowanie ocen, aby pasowały do istniejących filmów
ratings = ratings[ratings['movieId'].isin(movies['movieId'])]

# Podział danych na zbiór treningowy i testowy
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=42)

# Funkcja do tworzenia profilu użytkownika (na danych treningowych)
def create_user_profile(user_id, ratings, cosine_sim, movies_sample, tfidf_matrix):
    rated_movies = ratings[ratings['userId'] == user_id]
    weighted_features = np.zeros(tfidf_matrix.shape[1])  # Wektor cech filmów
    total_weight = 0
    
    for movie_id, rating in zip(rated_movies['movieId'], rated_movies['rating']):
        # Znajdowanie indeksu w nowej próbce
        movie_idx = movies_sample[movies_sample['movieId'] == movie_id].index
        if not movie_idx.empty:  # Film istnieje w ograniczonej próbce
            idx = movie_idx[0]  # Indeks w macierzy TF-IDF
            weighted_features += tfidf_matrix[idx].toarray().flatten() * rating
            total_weight += rating
    
    if total_weight > 0:
        weighted_features /= total_weight  # Normalizacja
    
    return weighted_features

# Funkcja do generowania rekomendacji na podstawie profilu użytkownika
def get_recommendations(user_id, ratings, cosine_sim, movies_sample, tfidf_matrix, top_n=10):
    user_profile = create_user_profile(user_id, ratings, cosine_sim, movies_sample, tfidf_matrix)
    
    # Obliczamy podobieństwo profilu użytkownika do każdego filmu
    similarities = cosine_similarity([user_profile], tfidf_matrix)[0]
    
    # Sortujemy filmy po podobieństwie
    similar_movie_indices = similarities.argsort()[-top_n-1:-1][::-1]
    recommended_movie_ids = movies_sample.iloc[similar_movie_indices]['movieId'].tolist()
    
    return recommended_movie_ids

# Funkcja do obliczania metryk
def calculate_metrics(recommended_movies, ratings_test):
    y_true = []
    y_pred = []
    
    for movie in recommended_movies:
        actual_ratings = ratings_test[ratings_test['movieId'] == movie]
        if not actual_ratings.empty:
            user_rating = actual_ratings['rating'].iloc[0]
            y_true.append(1 if user_rating >= 4 else 0)
        else:
            y_true.append(0)
        y_pred.append(1)  # Zawsze zakładamy, że rekomendacja jest pozytywna
    
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)

    return precision, recall, f1

# Funkcja do przeprowadzenia eksperymentu
def run_experiment(ratings_train, ratings_test, movies, movie_sample_size, user_sample_size=10, top_n=10):
    # Losowe próbkowanie filmów
    movies_sample = movies.sample(n=movie_sample_size, random_state=42).reset_index(drop=True)
    
    # Macierz podobieństw dla próbki filmów
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(movies_sample['features'])
    cosine_sim = cosine_similarity(tfidf_matrix)
    
    # Tworzymy próbkę użytkowników
    users_sample = ratings_train['userId'].drop_duplicates().sample(user_sample_size, random_state=42)
    
    # Generowanie rekomendacji dla użytkowników
    all_recommendations = []
    all_actual_movies = []
    
    for user in users_sample:
        recommended_movies = get_recommendations(user, ratings_train, cosine_sim, movies_sample, tfidf_matrix, top_n=top_n)
        actual_movies = ratings_test[ratings_test['userId'] == user]['movieId'].tolist()
        
        all_recommendations.extend(recommended_movies)
        all_actual_movies.extend(actual_movies)

    # Obliczanie metryk
    precision, recall, f1 = calculate_metrics(all_recommendations, ratings_test)

    # Zużycie pamięci
    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / (1024 * 1024)

    return precision, recall, f1, memory_usage

# Przeprowadzanie eksperymentu
user_sample_sizes = [100, 1000]  # Liczba użytkowników
movie_sample_sizes = [100, 5000, 20000]  # Liczba filmów

# Zapisanie wyników eksperymentu
results = []

for user_sample_size in user_sample_sizes:
    for movie_sample_size in movie_sample_sizes:
        print(f"Running experiment with user_sample_size={user_sample_size} and movie_sample_size={movie_sample_size}")
        
        # Mierzymy czas przed rozpoczęciem eksperymentu
        start_time = time()

        # Uruchamiamy eksperyment
        precision, recall, f1, memory_usage = run_experiment(
            ratings_train=ratings_train, 
            ratings_test=ratings_test, 
            movies=movies, 
            movie_sample_size=movie_sample_size, 
            user_sample_size=user_sample_size
        )

        # Mierzymy czas po zakończeniu eksperymentu
        end_time = time()
        
        #Zapis wyników
        results.append({
            'user_sample_size': user_sample_size,
            'movie_sample_size': movie_sample_size,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'memory_usage (MB)': memory_usage,
            'execution_time (s)': end_time - start_time
        })

# Wyświetlanie wyników
results_df = pd.DataFrame(results)
print(results_df)

Running experiment with user_sample_size=100 and movie_sample_size=100
Running experiment with user_sample_size=100 and movie_sample_size=5000
Running experiment with user_sample_size=100 and movie_sample_size=20000
Running experiment with user_sample_size=1000 and movie_sample_size=100
Running experiment with user_sample_size=1000 and movie_sample_size=5000
Running experiment with user_sample_size=1000 and movie_sample_size=20000
   user_sample_size  movie_sample_size  precision  recall  f1_score  \
0               100                100     0.1980     1.0  0.330551   
1               100               5000     0.4270     1.0  0.598458   
2               100              20000     0.4490     1.0  0.619738   
3              1000                100     0.1936     1.0  0.324397   
4              1000               5000     0.4172     1.0  0.588767   
5              1000              20000     0.4765     1.0  0.645445   

   memory_usage (MB)  execution_time (s)  
0        3521.687500    

In [43]:
#Eksperyment: Algorytm Hybrid Filtering łączący Content-based filtering i collabroative filtering oparty na modelu. Ważona

import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
from time import time
import psutil
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Wczytanie danych
ml_rating_path = 'dane/ml-20m/ml-20m/ratings.csv'
ml_movies_path = 'dane/ml-20m/ml-20m/movies.csv'
ml_tags_path = 'dane/ml-20m/ml-20m/tags.csv'

ratings = pd.read_csv(ml_rating_path)
movies = pd.read_csv(ml_movies_path)
tags = pd.read_csv(ml_tags_path)

# Przygotowanie cech filmów: gatunki + tagi
movies['genres'] = movies['genres'].fillna('')
tags['tag'] = tags['tag'].fillna('')

# Grupowanie tagów według movieId i łączenie ich w jeden ciąg
tags_combined = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Łączenie danych
movies = movies.merge(tags_combined, on='movieId', how='left')

# Tworzenie cechy 'features' (gatunki + tagi)
movies['features'] = movies['genres'] + ' ' + movies['tag']
movies['features'] = movies['features'].fillna('')

# Funkcja do ograniczenia liczby filmów
def reduce_movies(data, movie_count):
    sampled_movies = data['movieId'].unique()[:movie_count]
    reduced_data = data[data['movieId'].isin(sampled_movies)]
    return reduced_data

# Funkcja do tworzenia profilu użytkownika (na danych treningowych)
def create_user_profile(user_id, ratings, cosine_sim, movies_sample, tfidf_matrix):
    rated_movies = ratings[ratings['userId'] == user_id]
    weighted_features = np.zeros(tfidf_matrix.shape[1])  # Wektor cech filmów
    total_weight = 0
    
    for movie_id, rating in zip(rated_movies['movieId'], rated_movies['rating']):
        # Znajdowanie indeksu w nowej próbce
        movie_idx = movies_sample[movies_sample['movieId'] == movie_id].index
        if not movie_idx.empty:  # Film istnieje w ograniczonej próbce
            idx = movie_idx[0]  # Indeks w macierzy TF-IDF
            weighted_features += tfidf_matrix[idx].toarray().flatten() * rating
            total_weight += rating
    
    if total_weight > 0:
        weighted_features /= total_weight  # Normalizacja
    
    return weighted_features

# Funkcja do generowania rekomendacji z wagami
def get_hybrid_recommendations(user_id, ratings, cosine_sim, movies_sample, tfidf_matrix, svd_model, w_content=0.8, w_collab=0.2, top_n=10):
    # Generowanie rekomendacji content-based
    user_profile = create_user_profile(user_id, ratings, cosine_sim, movies_sample, tfidf_matrix)
    similarities = cosine_similarity([user_profile], tfidf_matrix)[0]
    content_based_recs = similarities.argsort()[-(top_n*10):][::-1]
    content_based_movie_ids = movies_sample.iloc[content_based_recs]['movieId'].tolist()

    # Generowanie rekomendacji collaborative filtering
    svd_recs = [svd_model.predict(user_id, movie_id).est for movie_id in movies_sample['movieId']]
    svd_movie_ids = np.argsort(svd_recs)[-(top_n*10):][::-1]
    svd_movie_ids = movies_sample.iloc[svd_movie_ids]['movieId'].tolist()

    # Połączenie wyników na podstawie wag
    hybrid_scores = {}
    
    # Łączenie wyników obu metod (Content-based + Collaborative Filtering)
    for movie_id in set(content_based_movie_ids + svd_movie_ids):
        # Content-based score
        content_score = w_content * (similarities[movies_sample['movieId'] == movie_id][0] if movie_id in content_based_movie_ids else 0)
        
        # Collaborative filtering score
        collab_score = 0
        if movie_id in svd_movie_ids:
            idx = movies_sample[movies_sample['movieId'] == movie_id].index[0]
            collab_score = w_collab * svd_recs[idx] / 5

        # Obliczenie łącznego wyniku
        hybrid_scores[movie_id] = content_score + collab_score
    
    # Sortowanie rekomendacji według łącznego wyniku
    sorted_hybrid_recs = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Wybór top_n rekomendacji
    hybrid_recs = [movie_id for movie_id, _ in sorted_hybrid_recs[:top_n]]
    
    return hybrid_recs

# Funkcja do obliczania metryk
def calculate_metrics(recommended_movies, ratings_test):
    y_true = []
    y_pred = []
    
    for movie in recommended_movies:
        actual_ratings = ratings_test[ratings_test['movieId'] == movie]
        if not actual_ratings.empty:
            user_rating = actual_ratings['rating'].iloc[0]
            y_true.append(1 if user_rating >= 4 else 0)
        else:
            y_true.append(0)
        y_pred.append(1)  # Zawsze zakładamy, że rekomendacja jest pozytywna
    
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)

    return precision, recall, f1

# Eksperyment
# Przygotowanie danych do eksperymentu
user_sizes = [50, 1000]  # Liczba użytkowników do testowania
movie_sizes = [100, 1000, 10000]  # Liczba filmów do testowania

results = []

for user_size in user_sizes:
    sampled_users = ratings['userId'].unique()[:user_size]
    filtered_data = ratings[ratings['userId'].isin(sampled_users)]
    
    for movie_size in movie_sizes:
        print(f"User Size: {user_size}, Movie Count: {movie_size}")
        reduced_data = reduce_movies(filtered_data, movie_size)
        
        # Przygotowanie danych do modelu collaborative filtering
        reader = Reader(rating_scale=(0.5, 5.0))
        data = Dataset.load_from_df(reduced_data[['userId', 'movieId', 'rating']], reader)
        trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
        svd_model = SVD()
        svd_model.fit(trainset)
        
        # Przygotowanie danych do content-based filtering
        movies_sample = movies.sample(n=movie_size, random_state=42).reset_index(drop=True)
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(movies_sample['features'])
        
        # Mierzenie czasu i zasobów
        start_time = time()
        
        # Generowanie rekomendacji dla użytkowników
        all_recommendations = []
        all_actual_movies = []
        
        for user_id in sampled_users[:user_size]:
            recommended_movies = get_hybrid_recommendations(user_id, reduced_data, None, movies_sample, tfidf_matrix, svd_model, w_content=0.7, w_collab=0.3, top_n=10)
            actual_movies = filtered_data[filtered_data['userId'] == user_id]['movieId'].tolist()
            
            all_recommendations.extend(recommended_movies)
            all_actual_movies.extend(actual_movies)

        # Obliczanie metryk
        precision, recall, f1 = calculate_metrics(all_recommendations, filtered_data)
        
        # Zużycie pamięci
        process = psutil.Process(os.getpid())
        memory_usage = process.memory_info().rss / (1024 * 1024)  # MB
        
        # Zapis wyników
        end_time = time()
        results.append({
            'user_size': user_size,
            'movie_count': movie_size,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'memory_usage (MB)': memory_usage,
            'execution_time (s)': end_time - start_time
        })

# Wyświetlenie wyników
results_df = pd.DataFrame(results)
print(results_df)


User Size: 50, Movie Count: 100
User Size: 50, Movie Count: 1000
User Size: 50, Movie Count: 10000
User Size: 1000, Movie Count: 100
User Size: 1000, Movie Count: 1000
User Size: 1000, Movie Count: 10000
   user_size  movie_count  precision  recall  f1_score  memory_usage (MB)  \
0         50          100     0.0660     1.0  0.123827        1552.160156   
1         50         1000     0.4680     1.0  0.637602        1554.699219   
2         50        10000     0.8040     1.0  0.891353        1572.222656   
3       1000          100     0.0152     1.0  0.029945        1282.441406   
4       1000         1000     0.4584     1.0  0.628634        1290.082031   
5       1000        10000     0.7108     1.0  0.830956        1307.277344   

   execution_time (s)  
0            1.950281  
1            2.908201  
2            5.517966  
3           46.560652  
4           61.572669  
5          131.547391  
