Przypadki testowe:

dla małej i dużej liczby filmów

dla małej i dużej liczby użytkowników

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
from time import time
import psutil
from surprise import SVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

ml_rating_path = 'dane/ml-20m/ml-20m/ratings.csv'
ml_movies_path = 'dane/ml-20m/ml-20m/movies.csv'
ml_tags_path = 'dane/ml-20m/ml-20m/tags.csv'

# Funkcja do ograniczenia danych
def reduce_ratings(data, reduction_rate):
    #Usuwa określony procent ocen z danych.
    data = data.sample(frac=(1 - reduction_rate), random_state=42).reset_index(drop=True)
    return data

# Funkcja do oceny metryk
def calculate_metrics(predictions, threshold=4.0):
    #Oblicza precision, recall i F1-score.
    y_true = []
    y_pred = []
    for pred in predictions:
        true_rating = pred.r_ui
        predicted_rating = pred.est
        y_true.append(1 if true_rating >= threshold else 0)
        y_pred.append(1 if predicted_rating >= threshold else 0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return precision, recall, f1


In [9]:
#Eksperyment: Algorytm Collaborative filtering oparty na pamięci

# Parametry eksperymentu
user_sizes = [100] #100, 10000, 100000
reduction_rates = [0.5]  # Usuwanie 10%, 25%, 50% danych  0.25, 0.5

# Wczytanie danych z MovieLens
df = pd.read_csv(ml_rating_path)  # Zmień na odpowiednią ścieżkę
reader = Reader(rating_scale=(0.5, 5.0))

# Eksperyment
results = []
for user_size in user_sizes:
    # Ogranicz liczbę użytkowników
    sampled_users = df['userId'].unique()[:user_size]
    filtered_data = df[df['userId'].isin(sampled_users)]
    
    for reduction_rate in reduction_rates:
        print(f"User Size: {user_size}, Reduction Rate: {reduction_rate}")
        reduced_data = reduce_ratings(filtered_data, reduction_rate)
        
        # Przygotowanie danych do modelu
        data = Dataset.load_from_df(reduced_data[['userId', 'movieId', 'rating']], reader)
        trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
        
        # Tworzenie modelu Collaborative Filtering opartego na użytkownikach
        sim_options = {'name': 'cosine', 'user_based': True}
        model = KNNBasic(sim_options=sim_options)
        
        # Pomiar czasu i zasobów
        start_time = time()
        model.fit(trainset)
        predictions = model.test(testset)
        end_time = time()
        
        # Obliczenie metryk
        precision, recall, f1 = calculate_metrics(predictions)
        
        # Pomiar zasobów
        memory_usage = psutil.virtual_memory().percent
        cpu_usage = psutil.cpu_percent(interval=0.1)
        
        # Zapis wyników
        results.append({
            'user_size': user_size,
            'reduction_rate': reduction_rate,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'time s': end_time - start_time,
            'memory_usage %': memory_usage,
            'cpu_usage %': cpu_usage
        })

# Wyświetlenie wyników
results_df = pd.DataFrame(results)
print(results_df)

User Size: 100, Reduction Rate: 0.1
Computing the cosine similarity matrix...
Done computing similarity matrix.
   user_size  reduction_rate  precision    recall  f1_score    time s  \
0        100             0.1   0.668719  0.384289   0.48809  0.103876   

   memory_usage %  cpu_usage %  
0            67.1         53.6  


In [15]:
#Eksperyment: Algorytm Collaborative filtering oparty na modelu

# Parametry eksperymentu
user_sizes = [100]  # Mały zbiór użytkowników (można rozszerzyć na 10000, 100000)
reduction_rates = [0.1, 0.25, 0.5]  # Ograniczenie danych (np. 10%, 25%, 50%)

# Wczytanie danych z MovieLens
df = pd.read_csv(ml_rating_path)
reader = Reader(rating_scale=(0.5, 5.0))

# Eksperyment
results = []
for user_size in user_sizes:
    # Ogranicz liczbę użytkowników
    sampled_users = df['userId'].unique()[:user_size]
    filtered_data = df[df['userId'].isin(sampled_users)]
    
    for reduction_rate in reduction_rates:
        print(f"User Size: {user_size}, Reduction Rate: {reduction_rate}")
        reduced_data = reduce_ratings(filtered_data, reduction_rate)
        
        # Przygotowanie danych do modelu
        data = Dataset.load_from_df(reduced_data[['userId', 'movieId', 'rating']], reader)
        trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
        
        # Tworzenie modelu Collaborative Filtering opartego na modelu (SVD)
        model = SVD()
        
        # Pomiar czasu i zasobów
        start_time = time()
        model.fit(trainset)
        predictions = model.test(testset)
        end_time = time()
        
        # Obliczenie metryk
        precision, recall, f1 = calculate_metrics(predictions)
        
        # Pomiar zasobów
        memory_usage = psutil.virtual_memory().percent
        cpu_usage = psutil.cpu_percent(interval=0.1)
        
        # Zapis wyników
        results.append({
            'user_size': user_size,
            'reduction_rate': reduction_rate,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'time': end_time - start_time,
            'memory_usage': memory_usage,
            'cpu_usage': cpu_usage
        })

# Wyświetlenie wyników
results_df = pd.DataFrame(results)
print(results_df)

User Size: 100, Reduction Rate: 0.1
User Size: 100, Reduction Rate: 0.25
User Size: 100, Reduction Rate: 0.5
   user_size  reduction_rate  precision    recall  f1_score      time  \
0        100            0.10   0.804702  0.314933  0.452696  0.120677   
1        100            0.25   0.796875  0.309896  0.446250  0.092723   
2        100            0.50   0.774809  0.262274  0.391892  0.047433   

   memory_usage  cpu_usage  
0          64.7       42.9  
1          64.6       17.9  
2          64.5        3.6  


In [38]:
#Eksperyment: Algorytm Content-based filtering
from sklearn.model_selection import train_test_split as sklearn_train_test_split

# Funkcja do oceny metryk
def evaluate_content_based(recommendations, test_data, threshold=0.0):
    y_true = []
    y_pred = []

    for user_id, recommended_movies in recommendations.items():
        test_movies = test_data[test_data['userId'] == user_id]
        test_movies = test_movies[test_movies['rating'] >= threshold]['movieId'].values

        true_set = set(test_movies)
        pred_set = set(movie for movie, _ in recommended_movies)

        for movie in pred_set:
            y_pred.append(1)
            y_true.append(1 if movie in true_set else 0)

        for movie in true_set - pred_set:
            y_true.append(1)
            y_pred.append(0)

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return precision, recall, f1

# Algorytm Content-Based Filtering
def content_based_filtering(data, movie_df, use_tags=True):
    # Połączenie tagów i gatunków w jedną kolumnę
    movie_df['combined_features'] = movie_df['tags'].fillna('') + ' ' + movie_df['genres'].fillna('')

    # TF-IDF na połączonych cechach
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(movie_df['combined_features'])

    # Macierz podobieństwa kosinusowego
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    recommendations = {}
    for user_id in data['userId'].unique():
        rated_movies = data[data['userId'] == user_id]
        user_ratings = rated_movies.set_index('movieId')['rating']

        movie_scores = {}
        for movie_id, rating in user_ratings.items():
            if movie_id - 1 < len(cosine_sim):  # Sprawdzenie zakresu
                similar_movies = cosine_sim[movie_id - 1]
                for idx, score in enumerate(similar_movies):
                    if movie_id != idx + 1 and idx + 1 not in rated_movies['movieId'].values:
                        movie_scores[idx + 1] = movie_scores.get(idx + 1, 0) + score * rating

        sorted_movie_scores = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
        recommendations[user_id] = sorted_movie_scores[:100]
    return recommendations

ratings = pd.read_csv(ml_rating_path)
movies = pd.read_csv(ml_movies_path)
tags = pd.read_csv(ml_tags_path)

# Dodanie tagów do filmów
tags['tag'] = tags['tag'].fillna('').astype(str)
tag_groups = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
movies = movies.merge(tag_groups, on='movieId', how='left')
movies['tags'] = movies['tag']  # Kolumna z tagami

# Parametry eksperymentu
user_sizes = [10]  # Liczba użytkowników
movie_sizes = [50, 1000, 10000]  # Liczba filmów do analizy
results = []

# Eksperyment
for user_size in user_sizes:
    sampled_users = ratings['userId'].unique()[:user_size]
    filtered_data = ratings[ratings['userId'].isin(sampled_users)]

    for movie_size in movie_sizes:
        sampled_movies = movies.sample(n=movie_size, random_state=42)
        filtered_data_movies = filtered_data[filtered_data['movieId'].isin(sampled_movies['movieId'])]

        print(f"User Size: {user_size}, Movie Size: {movie_size}")
        
        # Podział na dane treningowe i testowe
        train_data, test_data = sklearn_train_test_split(filtered_data_movies, test_size=0.4, random_state=42)

        # Uruchomienie algorytmu Content-Based Filtering
        start_time = time()
        recommendations = content_based_filtering(train_data, sampled_movies, use_tags=True)
        elapsed_time = time() - start_time

        # Obliczanie metryk
        precision, recall, f1 = evaluate_content_based(recommendations, test_data)

        # Pomiar zasobów
        memory_usage = psutil.virtual_memory().percent
        cpu_usage = psutil.cpu_percent(interval=0.1)

        # Zapis wyników
        results.append({
            'user_size': user_size,
            'movie_size': movie_size,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'time': elapsed_time,
            'memory_usage': memory_usage,
            'cpu_usage': cpu_usage
        })

# Wyświetlenie wyników
results_df = pd.DataFrame(results)
print(results_df)

User Size: 10, Movie Size: 50
User Size: 10, Movie Size: 1000
User Size: 10, Movie Size: 10000
   user_size  movie_size  precision    recall  f1_score       time  \
0         10          50      0.000  0.000000  0.000000   0.005981   
1         10        1000      0.000  0.000000  0.000000   0.110673   
2         10       10000      0.002  0.013514  0.003484  15.964484   

   memory_usage  cpu_usage  
0          52.8        4.2  
1          52.8       35.7  
2          52.8        0.0  


In [2]:
#Eksperyment: Algorytm Content-based filtering próba 2

from sklearn.model_selection import train_test_split

# Wczytanie danych
ratings = pd.read_csv(ml_rating_path)
movies = pd.read_csv(ml_movies_path)
tags = pd.read_csv(ml_tags_path)

# Przygotowanie cech filmów: gatunki + tagi
movies['genres'] = movies['genres'].fillna('')
tags['tag'] = tags['tag'].fillna('')

# Grupowanie tagów według movieId i łączenie ich w jeden ciąg
tags_combined = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Łączenie danych
movies = movies.merge(tags_combined, on='movieId', how='left')

# Tworzenie cechy 'features' (gatunki + tagi)
movies['features'] = movies['genres'] + ' ' + movies['tag']

# Upewniamy się, że w kolumnie 'features' nie ma wartości NaN
movies['features'] = movies['features'].fillna('')

# Reprezentacja filmów jako wektory przy użyciu TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies['features'])

# Obliczanie podobieństwa między filmami
cosine_sim = cosine_similarity(tfidf_matrix)

# Podział danych na zbiór treningowy i testowy (np. 80% treningowe, 20% testowe)
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=42)

# Funkcja do tworzenia profilu użytkownika (na danych treningowych)
def create_user_profile(user_id, ratings, cosine_sim, movies):
    rated_movies = ratings[ratings['userId'] == user_id]
    rated_movie_ids = rated_movies['movieId'].tolist()
    
    # Tworzymy profil jako ważoną średnią cech ocenionych filmów
    weighted_features = np.zeros(tfidf_matrix.shape[1])  # Wektor z zerami o takim samym rozmiarze, jak cechy filmów
    total_weight = 0
    for movie_id, rating in zip(rated_movies['movieId'], rated_movies['rating']):
        idx = movies[movies['movieId'] == movie_id].index[0]
        
        # Zamiast cosine_sim[idx] * rating, możemy po prostu dodać cechy filmów pomnożone przez ocenę
        weighted_features += tfidf_matrix[idx].toarray().flatten() * rating  # Convert sparse matrix row to dense array
        total_weight += rating
    
    if total_weight > 0:
        weighted_features /= total_weight  # Normalizacja, aby uzyskać średnią ważoną
    
    return weighted_features

# Funkcja do generowania rekomendacji na podstawie profilu użytkownika
def get_recommendations(user_id, ratings, cosine_sim, movies, top_n=10):
    # Tworzymy profil użytkownika na danych treningowych
    user_profile = create_user_profile(user_id, ratings, cosine_sim, movies)
    
    # Obliczamy podobieństwo profilu użytkownika do każdego filmu
    similarities = cosine_similarity([user_profile], tfidf_matrix)[0]
    
    # Sortujemy filmy po podobieństwie (najwyższe na początku)
    similar_movie_indices = similarities.argsort()[-top_n-1:-1][::-1]  # Pomijamy już ocenione filmy
    recommended_movie_ids = movies.iloc[similar_movie_indices]['movieId'].tolist()
    
    return recommended_movie_ids

# Funkcja do obliczania metryk
def calculate_metrics(recommended_movies, ratings_test):
    y_true = []
    y_pred = []
    
    for movie in recommended_movies:
        # Sprawdzamy, czy film został oceniony przez użytkownika w zbiorze testowym
        actual_ratings = ratings_test[ratings_test['movieId'] == movie]
        
        # Jeżeli film został oceniony przez użytkownika na 4 lub więcej, uznajemy go za interesujący
        if not actual_ratings.empty:
            user_rating = actual_ratings['rating'].iloc[0]
            y_true.append(1 if user_rating >= 4 else 0)
        else:
            y_true.append(0)
        
        # Zawsze zakładamy, że rekomendacja jest pozytywna (1)
        y_pred.append(1)

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return precision, recall, f1

# Funkcja do przeprowadzenia eksperymentu
def run_experiment(ratings_train, ratings_test, cosine_sim, user_sample_size=10, movie_sample_size=100, top_n=10):
    # Tworzymy próbkę użytkowników
    users_sample = ratings_train['userId'].drop_duplicates().sample(user_sample_size)
    
    # Generowanie rekomendacji dla użytkowników w zbiorze testowym
    all_recommendations = []
    all_actual_movies = []
    
    for user in users_sample:
        recommended_movies = get_recommendations(user, ratings_train, cosine_sim, movies, top_n=top_n)
        
        # Filmy ocenione przez użytkownika w zbiorze testowym
        actual_movies = ratings_test[ratings_test['userId'] == user]['movieId'].tolist()
        
        all_recommendations.extend(recommended_movies)
        all_actual_movies.extend(actual_movies)

    # Obliczanie metryk
    precision, recall, f1 = calculate_metrics(all_recommendations, ratings_test)

    # Mierzenie zużycia pamięci
    process = psutil.Process()
    memory_usage = process.memory_info().rss / (1024 * 1024)  # w MB

    return precision, recall, f1, memory_usage

# Przeprowadzanie eksperymentu
user_sample_sizes = [10, 100, 1000]  # Liczba użytkowników
movie_sample_sizes = [100, 5000, 20000]  # Liczba filmów

# Pętla do uruchomienia eksperymentu dla różnych liczby użytkowników i filmów
for user_sample_size in user_sample_sizes:
    for movie_sample_size in movie_sample_sizes:
        print(f"Running experiment with user_sample_size={user_sample_size} and movie_sample_size={movie_sample_size}")
        
        # Uruchamiamy eksperyment
        precision, recall, f1, memory_usage = run_experiment(
            ratings_train=ratings_train, ratings_test=ratings_test, 
            cosine_sim=cosine_sim, 
            user_sample_size=user_sample_size, 
            movie_sample_size=movie_sample_size
        )

        # Wyświetlanie wyników
        print(f"Precision: {precision}, Recall: {recall}, F1: {f1}, Memory Usage: {memory_usage} MB")

Running experiment with user_sample_size=10 and movie_sample_size=100
actual_movies56
recommended_movies10
actual_movies19
recommended_movies10
actual_movies16
recommended_movies10
actual_movies33
recommended_movies10
actual_movies17
recommended_movies10
actual_movies6
recommended_movies10
actual_movies15
recommended_movies10
actual_movies8
recommended_movies10
actual_movies3
recommended_movies10
actual_movies11
recommended_movies10
Precision: 0.46, Recall: 1.0, F1: 0.6301369863013699, Memory Usage: 5792.2421875 MB
Running experiment with user_sample_size=10 and movie_sample_size=5000
actual_movies47
recommended_movies10
actual_movies18
recommended_movies10
actual_movies14
recommended_movies10
actual_movies52
recommended_movies10
actual_movies30
recommended_movies10
actual_movies19
recommended_movies10
actual_movies28
recommended_movies10
actual_movies15
recommended_movies10
actual_movies13
recommended_movies10
actual_movies77
recommended_movies10
Precision: 0.55, Recall: 1.0, F1: 0.70

actual_movies11
recommended_movies10
actual_movies12
recommended_movies10
actual_movies26
recommended_movies10
actual_movies15
recommended_movies10
actual_movies207
recommended_movies10
actual_movies4
recommended_movies10
actual_movies79
recommended_movies10
actual_movies222
recommended_movies10
actual_movies10
recommended_movies10
actual_movies32
recommended_movies10
actual_movies5
recommended_movies10
actual_movies7
recommended_movies10
actual_movies146
recommended_movies10
actual_movies11
recommended_movies10
actual_movies44
recommended_movies10
actual_movies5
recommended_movies10
actual_movies16
recommended_movies10
actual_movies29
recommended_movies10
actual_movies47
recommended_movies10
actual_movies3
recommended_movies10
actual_movies15
recommended_movies10
actual_movies8
recommended_movies10
actual_movies7
recommended_movies10
actual_movies100
recommended_movies10
actual_movies46
recommended_movies10
Precision: 0.522, Recall: 1.0, F1: 0.6859395532194481, Memory Usage: 5792.6171

KeyboardInterrupt: 