# Install Requirements

In [None]:
!pip install -r requiremnets.txt

# Base Imports

In [1]:
from evaluation import ModelEvaluator
from models import (
    PopularityRecommendationModel,
    PopularityYearRecommendationModel,
    ContentBasedRecommendationModel,
    CollaborativeRecommendationModel,
    HybridRecommendationModel
)
from sklearn.model_selection import train_test_split
from utils.dataset_utils import (
    preprocess_books,
    preprocess_ratings,
    drop_irrelevant_books
)
from utils.df_utils import (
    load_books_dataset,
    load_ratings_dataset,
    get_users_subset,
    intersect_df
)

# Loading Dataset

## Preprocessing Books Steps.
1. drop nan values.
2. removing books that has year = 0.
3. removing books that has an Unknown Publisher.
4. removing books that has an Unknown Author.
5. drop irrelevant publishers: drop books with publishers that doesn't have a lot of occurrences.
6. drop irrelevant authors: drop books with authors that doesn't have a lot of occurrences.

In [2]:
books, dropped_books = preprocess_books(
    load_books_dataset("dataset/Books.csv"),
    # this one will reduce the number of district authors/publishers
    # values and there for will be having less sparsity when working with
    # their vectors.
    drop_irrelevant_publishers=100,
    drop_irrelevant_authors=50
)

## Loading and Preprocessing Ratings.
## Preprocessing Ratings Steps.
1. drop nan values.
2. drop users that has a total ratings sum of 0.

In [12]:
ratings = load_ratings_dataset("dataset/Ratings.csv", drop_books=dropped_books)

full_train_data, full_test_data = train_test_split(ratings, test_size=0.15)

full_train_data = preprocess_ratings(full_train_data)
full_test_data = preprocess_ratings(full_test_data)

popularity_data = get_users_subset(full_train_data, full_test_data, 3000)
sun_train_popularity_data = popularity_data["train_data"]
sub_test_popularity_data = popularity_data["test_data"]

## Evaluator
the popularity prediction is so fast I want extra marks for that :)

In [13]:
popularity_evaluator = ModelEvaluator(
    training_data=sun_train_popularity_data,
    testing_data=sub_test_popularity_data,
    favourite_threshold=5,
)

In [14]:
popularity_rec_model = PopularityRecommendationModel(
    books, sun_train_popularity_data, threshold=5
)
print(f"{popularity_rec_model.MODEL_NAME} Loaded...")

pop_global_metrics, _ = popularity_evaluator.evaluate_model(popularity_rec_model)

print(pop_global_metrics)

Popularity Recommendation Model Loaded...
Running Evaluation for Popularity Recommendation Model


100%|██████████| 3000/3000 [00:16<00:00, 177.59it/s]

processed 3000 users
Finished Popularity Recommendation Model Evaluation...
{'model_name': 'Popularity Recommendation Model', 'recall@5': 0.6, 'recall@10': 1.0, 'precision@5': 0.0002, 'precision@10': 0.00016666666666666666}





# 2- Popularity Year Based Model
This model is combining the popularity IMDB formula and then adding a year factor using the following equation:
popularity = (v/(v+m) * r) + (m/(m+v) * c)
popularity_year = popularity * (book_year - oldest_book_year)
where:
v is the number of ratings for the book.
m is the minimum rating required to be listed in the chart.
r is the average rating of the book.
c is the mean ratings across the whole books.

In [15]:
popularity_year_rec_model = PopularityYearRecommendationModel(
    books, sun_train_popularity_data, threshold=5
)
print(f"{popularity_year_rec_model.MODEL_NAME} Loaded...")

pop_year_global_metrics, _ = popularity_evaluator.evaluate_model(popularity_year_rec_model)

print(pop_year_global_metrics)

Popularity Year Based Recommendation Model Loaded...
Running Evaluation for Popularity Year Based Recommendation Model


100%|██████████| 3000/3000 [00:17<00:00, 172.24it/s]

processed 3000 users
Finished Popularity Year Based Recommendation Model Evaluation...
{'model_name': 'Popularity Year Based Recommendation Model', 'recall@5': 0.5714285714285714, 'recall@10': 1.0, 'precision@5': 0.0002666666666666667, 'precision@10': 0.00023333333333333333}





# Content Based Recommendation Model

## Evaluator
we will need to use a smaller test set because the model is slower, didn't have time to optimize this one :(.

In [16]:
content_data = get_users_subset(full_train_data, full_test_data, 1000)
sub_train_content_data = content_data["train_data"]
sub_test_content_data = content_data["test_data"]

content_evaluator = ModelEvaluator(
    training_data=sub_train_content_data,
    testing_data=sub_test_content_data,
    favourite_threshold=5,
)

# 3- Content Based Recommendation Model (Publisher)

In [17]:
pub_content_rec_model = ContentBasedRecommendationModel(
    column_name="publisher", books_df=books, ratings_df=sub_train_content_data
)

pub_content_global_metrics, _ = content_evaluator.evaluate_model(pub_content_rec_model)

print(pub_content_global_metrics)

  self.books_features = self.books_df[self.features_names]


Running Evaluation for Content Based Model


100%|██████████| 1000/1000 [02:15<00:00,  7.39it/s]

processed 1000 users
Finished Content Based Model Evaluation...
{'model_name': 'Content Based Model', 'recall@5': 0.7142857142857143, 'recall@10': 1.0, 'precision@5': 0.001, 'precision@10': 0.0007030936118923262}





# 4- Content Based Recommendation Model (Author)

In [18]:
auth_content_rec_model = ContentBasedRecommendationModel(
    column_name="author", books_df=books, ratings_df=sub_train_content_data
)

auth_content_global_metrics, _ = content_evaluator.evaluate_model(auth_content_rec_model)

print(auth_content_global_metrics)

Running Evaluation for Content Based Model


100%|██████████| 1000/1000 [01:55<00:00,  8.68it/s]

processed 1000 users
Finished Content Based Model Evaluation...
{'model_name': 'Content Based Model', 'recall@5': 0.125, 'recall@10': 1.0, 'precision@5': 0.00020008003201280514, 'precision@10': 0.0008081624406505708}





# 5- Collaborative Recommendation Model

In [19]:
# similar to reducing the number of irrelevant authors/publishers
# we are going to reduce the number of books, we will be loading around 460 books.
collab_ratings = drop_irrelevant_books(ratings, threshold=200)

full_collab_train_data, full_collab_test_data = train_test_split(
    collab_ratings, test_size=0.15
)

full_collab_train_data = preprocess_ratings(full_collab_train_data)
full_collab_test_data = preprocess_ratings(full_collab_test_data)

collab_data = get_users_subset(
    full_collab_train_data, full_collab_test_data, 3000
)

collab_evaluator = ModelEvaluator(
    training_data=collab_data["train_data"],
    testing_data=collab_data["test_data"],
    favourite_threshold=1,
)

collaborative_rec_model = CollaborativeRecommendationModel(
    ratings_df=collab_ratings
)

collab_global_metrics, _ = collab_evaluator.evaluate_model(
    collaborative_rec_model
)

print(collab_global_metrics)

Running Evaluation for Collaborative Filtering


100%|██████████| 3000/3000 [00:15<00:00, 192.12it/s]

processed 3000 users
Finished Collaborative Filtering Evaluation...
{'model_name': 'Collaborative Filtering', 'recall@5': 0.7333333333333333, 'recall@10': 1.0, 'precision@5': 0.04326666666666667, 'precision@10': 0.0295}





# 6- Hybrid Recommendation Model

In [20]:
hybrid_train_data, _ = intersect_df(
    content_data["train_data"], collab_data["train_data"], "book_id"
)

hybrid_test_data, _ = intersect_df(
    content_data["test_data"], collab_data["test_data"], "book_id"
)

hybrid_evaluator = ModelEvaluator(
    training_data=hybrid_train_data,
    testing_data=hybrid_test_data,
    favourite_threshold=1,
)

hybrid_rec_model = HybridRecommendationModel(
    content_column_name="author",
    content_books_df=books,
    content_ratings_df=content_data["train_data"],
    collab_ratings_df=collab_ratings
)

hybrid_global_metrics, _ = hybrid_evaluator.evaluate_model(
    hybrid_rec_model
)

print(hybrid_global_metrics)

Running Evaluation for Hybrid Recommendation Model


100%|██████████| 343/343 [00:41<00:00,  8.27it/s]

processed 343 users
Finished Hybrid Recommendation Model Evaluation...
{'model_name': 'Hybrid Recommendation Model', 'recall@5': 0.0, 'recall@10': 1.0, 'precision@5': 0.0, 'precision@10': 0.016622922134733157}



