<a href="https://colab.research.google.com/github/ZilipF/DataMining/blob/main/FilmRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# SMART MOVIE RECOMMENDER
# Author: Filip Zalewski
!pip install ipywidgets
!pip install -q implicit
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
import implicit
# STEP 1: Load and Clean Data
# Read the movie catalogue and the user ratings from the working directory
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Keep only the ratings whose movie appears in the movie catalogue
valid_movie_ids = set(movies_df['movieId'])
ratings_df = ratings_df.loc[ratings_df['movieId'].isin(valid_movie_ids)]

# Release year: the first "(dddd)" group found in the title, stored as a
# nullable integer so that titles without such a group become <NA>
movies_df['year'] = pd.to_numeric(
    movies_df['title'].str.extract(r'\((\d{4})\)', expand=False),
    errors='coerce',
).astype('Int64')

# Display title, built in three passes over the raw title:
# 1) drop every "(dddd)" group, 2) drop punctuation, 3) trim outer whitespace
movies_df['clean_title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
movies_df['clean_title'] = movies_df['clean_title'].str.replace(r'[^\w\s]', '', regex=True)
movies_df['clean_title'] = movies_df['clean_title'].str.strip()

# Genre text with missing values replaced by an empty string
movies_df['genres_str'] = movies_df['genres'].fillna('')

# One row per rating, enriched with the metadata of the rated movie
full_df = ratings_df.merge(movies_df, on="movieId")


# STEP 2: ALS Matrix Construction

# Number the distinct users and movies consecutively, in order of first
# appearance in the ratings, so they can serve as matrix row/column indices
user_map = {uid: pos for pos, uid in enumerate(pd.unique(ratings_df['userId']))}
movie_map = {mid: pos for pos, mid in enumerate(pd.unique(ratings_df['movieId']))}
reverse_movie_map = dict(zip(movie_map.values(), movie_map.keys()))  # matrix column -> movieId

# Coordinates of every rating: one row per user, one column per movie
row = ratings_df['userId'].map(user_map)
col = ratings_df['movieId'].map(movie_map)
# Shift the ratings so the smallest one becomes 1 (all stored values positive)
data = ratings_df['rating'].sub(ratings_df['rating'].min()).add(1)
ratings_sparse = coo_matrix((data, (row, col)))

# CSR copy of the matrix, used for training and for slicing out user rows
ratings_sparse_csr = ratings_sparse.tocsr()

# ALS model from the 'implicit' library: 50 latent factors,
# regularization 0.1, 20 training iterations
als_model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)

# Train on the user x movie matrix
als_model.fit(ratings_sparse_csr)


# STEP 3: Recommendation Functions

def get_user_profile(user_id):
    """
    Summarises the rating history of one user.

    Returns a dict with three entries:
    - 'data': every row of full_df that belongs to the user,
    - 'genre_stats': number of ratings and mean rating per genre,
    - 'year_stats': number of ratings and mean rating per release year.
    Means are rounded to two decimals.
    """
    ratings_of_user = full_df.loc[full_df['userId'] == user_id]

    def _count_and_mean(frame, key):
        # Rating count and rounded mean rating for each distinct value of `key`
        return frame.groupby(key)['rating'].agg(['count', 'mean']).round(2)

    # A movie tagged "A|B" contributes one row to genre A and one to genre B
    one_row_per_genre = ratings_of_user.assign(
        genres=ratings_of_user['genres'].str.split('|')
    ).explode('genres')

    return dict(
        data=ratings_of_user,
        genre_stats=_count_and_mean(one_row_per_genre, 'genres'),
        year_stats=_count_and_mean(ratings_of_user, 'year'),
    )


def user_user_recommend(user_id, top_n=10):
    """
    Recommends movies based on cosine similarity with other users.

    Every movie the target user has not rated is scored by the
    similarity-weighted average of the ratings given by all other users
    (a movie a user did not rate counts as 0 for that user).

    Parameters:
    - user_id: a value from the 'userId' column of full_df; an id without
      any rating raises KeyError.
    - top_n: maximum number of movies to return.

    Returns a DataFrame with the columns 'movieId', 'clean_title', 'year'
    and 'score %', best match first. Scores are scaled so that the best
    returned movie gets 100. The frame is empty when no other user has a
    positive similarity to the target user.
    """
    result_columns = ['movieId', 'clean_title', 'year', 'score %']

    # Create user-item rating matrix (0 marks "not rated")
    pivot = full_df.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

    # Only the target user's similarities are needed, so compare that single
    # row against all users instead of building the full user x user matrix
    similarity = pd.Series(
        cosine_similarity(pivot.loc[[user_id]], pivot)[0],
        index=pivot.index,
    )
    similarity.loc[user_id] = 0.0  # the user must not vote for themselves

    # Without any similar user there is nothing to average (and the
    # normalisation below would divide by zero)
    norm_factor = similarity.sum()
    if not norm_factor > 0:
        return pd.DataFrame(columns=result_columns)

    # Weighted average of all users' ratings as one matrix product
    predicted_ratings = pd.Series(
        similarity.to_numpy() @ pivot.to_numpy() / norm_factor,
        index=pivot.columns,
    )

    # Remove movies already rated by the user
    watched = pivot.columns[(pivot.loc[user_id] > 0).to_numpy()]
    predicted_ratings = predicted_ratings.drop(watched)

    # Attach movie metadata and keep the best top_n
    recommendations = (
        predicted_ratings.rename_axis('movieId').reset_index(name='score')
        .merge(movies_df, on='movieId')
        .sort_values('score', ascending=False)
        .head(top_n)
    )

    # Convert scores to percentages of the best score. The 'score %' column
    # is always created, so the final selection cannot fail when there is
    # no positive score to scale by.
    max_score = recommendations['score'].max()
    if max_score > 0:
        percent = (recommendations['score'] / max_score * 100).round(1)
    else:
        percent = recommendations['score'] * 0.0
    recommendations = recommendations.assign(**{'score %': percent})

    return recommendations[result_columns]


def als_recommend(user_id, top_n=10):
    """
    Recommends movies using the trained ALS model.

    Parameters:
    - user_id: original user id (anything int() accepts) that has an entry
      in user_map.
    - top_n: number of recommendations requested from the model.

    Returns a DataFrame with a 'title' column and a score column, in the
    order produced by the model. When the best score is positive, the
    scores are rescaled so that the best one is 100 and the column is
    named 'score %'; otherwise the raw model scores are returned under
    'score'. On failure a single-row DataFrame with an 'error' column is
    returned instead of raising, so the UI can display the message.
    """
    try:
        uid = user_map[int(user_id)]
    except (KeyError, TypeError, ValueError):
        return pd.DataFrame({'error': [f"Unknown user id: {user_id!r}"]})

    try:
        user_row = ratings_sparse_csr[uid]
        result = als_model.recommend(uid, user_row, N=top_n)

        # Current releases of 'implicit' return a pair of parallel arrays
        # (item ids, scores); older releases returned a list of
        # (item id, score) pairs. Iterating the pair of arrays as if it
        # were a list of pairs would mix ids and scores.
        if isinstance(result, tuple) and len(result) == 2:
            item_ids, scores = result
        else:
            item_ids = [pair[0] for pair in result]
            scores = [pair[1] for pair in result]

        # Convert ALS item indices back to movie IDs, then look up titles;
        # reindex keeps the model's order and yields NaN for unknown ids
        movie_ids = [reverse_movie_map.get(int(i)) for i in item_ids]
        titles = movies_df.set_index('movieId')['title'].reindex(movie_ids)

        # Build the result with title and score side by side so they stay
        # aligned when rows without a title are dropped
        df = pd.DataFrame({
            'title': titles.to_numpy(),
            'score': np.asarray(scores, dtype=float),
        })
        df = df[df['title'].notnull()].reset_index(drop=True)

        # Convert scores to percentages of the best score
        max_score = df['score'].max()
        if max_score > 0:
            df['score'] = (df['score'] / max_score * 100).round(1)
            df = df.rename(columns={'score': 'score %'})

        return df

    except Exception as e:
        # UI boundary: report the problem in the output table rather than
        # breaking the widget callback
        return pd.DataFrame({'error': [str(e)]})


# STEP 4: User Interface
# Builds the input widgets and output areas; nothing is displayed here.

# Dropdown: one entry per distinct userId in ratings_df, in ascending order
select_user_id = widgets.Dropdown(
    options=sorted(ratings_df['userId'].unique()),
    description='User:'
)

# Radio buttons: choose the recommendation mode.
# These exact strings are compared against in the functions that read
# select_rec_type.value, so they must stay in sync with those checks.
select_rec_type = widgets.RadioButtons(
    options=[
        'Favorite genres',
        'Selected genres',
        'Year range',
        'Top rated overall',
        'Based on similar users',
        'ALS model'
    ],
    description='Mode:'
)

# Multiselect: every distinct genre name found in the '|'-separated
# 'genres' column of movies_df, sorted alphabetically
select_genres = widgets.SelectMultiple(
    options=sorted(set('|'.join(movies_df['genres'].dropna()).split('|'))),
    description='Genres:'
)

# Range slider: bounded by the earliest and latest release year present in
# movies_df; the initial selection is 1990-2010
select_years = widgets.IntRangeSlider(
    value=[1990, 2010],
    min=int(movies_df['year'].dropna().min()),
    max=int(movies_df['year'].dropna().max()),
    description='Year range:'
)

# Sliders: number of recommendations to show (5-20, initially 10) and the
# minimum number of ratings a movie needs (1-100, initially 20)
select_top_n = widgets.IntSlider(value=10, min=5, max=20, description='Top N:')
min_votes_slider = widgets.IntSlider(value=20, min=1, max=100, description='Min votes:')

# Output areas: output_box holds the result table, label_box the heading
# line; export_button carries the "Export to CSV" action
output_box = widgets.Output()
label_box = widgets.Output()
export_button = widgets.Button(description="Export to CSV")

# UI display logic
def update_visibility(change=None):
    """
    Shows only the filter widgets that apply to the selected mode.

    The genre selector is shown in 'Selected genres' mode, the year slider
    in 'Year range' mode and the vote slider in the four rating-based
    modes; each is hidden otherwise. `change` is accepted so the function
    can be used as a widget observer and is not read.

    NOTE(review): nothing in the visible code calls this function;
    update_ui_layout rebuilds the list of controls instead. Confirm
    whether it is still needed.
    """
    mode = select_rec_type.value
    modes_with_vote_filter = (
        'Favorite genres', 'Selected genres', 'Year range', 'Top rated overall',
    )

    visibility = (
        (select_genres, mode == 'Selected genres'),
        (select_years, mode == 'Year range'),
        (min_votes_slider, mode in modes_with_vote_filter),
    )
    for control, visible in visibility:
        control.layout.display = 'block' if visible else 'none'

# Helpers for the main recommendation function
def _bind_export(frame, filename):
    """
    Points the "Export to CSV" button at the most recent result.

    Registers a click handler that writes `frame` to `filename` (without
    the index) and unregisters the handler installed by the previous call,
    so one click exports only the table currently shown instead of
    replaying every handler added by earlier refreshes.
    """
    previous = getattr(_bind_export, 'handler', None)
    if previous is not None:
        export_button.on_click(previous, remove=True)

    def handler(_button):
        frame.to_csv(filename, index=False)

    export_button.on_click(handler)
    _bind_export.handler = handler


def _genre_mask(frame, genres):
    """
    Returns a boolean mask of the rows of `frame` whose 'genres' text
    contains at least one of `genres`.

    Genre names are escaped before being joined into a regular expression,
    so names containing regex metacharacters (e.g. parentheses) are matched
    literally. Rows with a missing genre never match. An empty `genres`
    collection matches every row that has genre text.
    """
    import re

    pattern = '|'.join(re.escape(genre) for genre in genres)
    return frame['genres'].str.contains(pattern, na=False)


# Main recommendation function triggered by UI
def generate_recommendations(change=None):
    """
    Recomputes and displays recommendations for the current widget state.

    Clears output_box and label_box, prints a heading with the selected
    user and mode, shows the result table and makes the export button
    write that table to a CSV file whose name depends on the mode.

    - 'Based on similar users' and 'ALS model' delegate to
      user_user_recommend / als_recommend.
    - The other modes rank the movies the user has not rated by mean
      rating (ties broken by number of ratings), keeping only movies with
      at least min_votes_slider.value ratings, after an optional genre or
      year filter.

    `change` is accepted so the function can be used as a widget observer
    and is not read.
    """
    with output_box:
        clear_output()
        user_id = select_user_id.value
        mode = select_rec_type.value
        top_n = select_top_n.value

        label_box.clear_output()
        with label_box:
            print(f"Recommendations for user {user_id} — mode: {mode}")

        # Model-based modes are handled first: they need neither the user
        # profile nor the candidate set computed further down
        if mode == 'Based on similar users':
            df = user_user_recommend(user_id, top_n)
            display(df[['clean_title', 'year', 'score %']] if 'clean_title' in df else df)
            _bind_export(df, "recommendations_user_user.csv")
            return
        if mode == 'ALS model':
            df = als_recommend(user_id, top_n)
            display(df)
            _bind_export(df, "recommendations_ALS.csv")
            return

        # Rating-based modes: start from every rating of a movie the user
        # has not rated yet
        profile = get_user_profile(user_id)
        seen = profile['data']['movieId'].unique()
        candidates = full_df[~full_df['movieId'].isin(seen)]

        # Filter logic for each mode
        if mode == 'Favorite genres':
            # The user's two genres with the highest mean rating
            top_genres = profile['genre_stats'].sort_values('mean', ascending=False).head(2).index
            filtered = candidates[_genre_mask(candidates, top_genres)]
        elif mode == 'Selected genres':
            filtered = candidates[_genre_mask(candidates, select_genres.value)]
        elif mode == 'Year range':
            start, end = select_years.value
            filtered = candidates[candidates['year'].between(start, end)]
        else:  # 'Top rated overall'
            filtered = candidates

        # Aggregate and sort filtered results
        min_votes = min_votes_slider.value
        top = (
            filtered.groupby(['movieId', 'clean_title', 'year'])
            .agg(AvgRating=('rating', 'mean'), Votes=('rating', 'count'))
            .query('Votes >= @min_votes')
            .sort_values(['AvgRating', 'Votes'], ascending=[False, False])
            .reset_index().head(top_n)
            .round(2)
        )
        top.index = top.index + 1  # Start index at 1 instead of 0
        display(top)
        _bind_export(top, "recommendations_top.csv")


# STEP 5: Simple UI layout

# Outer container; its single child is replaced on every layout update
dynamic_controls = widgets.VBox([])

# Rebuild the control panel for the selected recommendation mode
def update_ui_layout(change=None):
    """
    Rebuilds the control column and refreshes the recommendations.

    The panel always starts with the user and mode selectors and ends with
    the top-N slider and the export button. In between it contains the
    genre selector ('Selected genres' mode) or the year slider
    ('Year range' mode), followed by the vote slider in the four
    rating-based modes. `change` is accepted so the function can be used
    as a widget observer and is not read.
    """
    mode = select_rec_type.value
    rating_based_modes = (
        'Favorite genres', 'Selected genres', 'Year range', 'Top rated overall',
    )
    # At most one mode-specific filter widget is shown
    mode_specific = {'Selected genres': select_genres, 'Year range': select_years}

    controls = [select_user_id, select_rec_type]
    if mode in mode_specific:
        controls.append(mode_specific[mode])
    if mode in rating_based_modes:
        controls.append(min_votes_slider)
    controls += [select_top_n, export_button]

    # Stack the widgets vertically inside a padded box
    panel = widgets.VBox(controls, layout=widgets.Layout(padding='10px'))
    dynamic_controls.children = [panel]

    # A layout change always goes together with a fresh result table
    generate_recommendations()

# Rebuild the panel and the results whenever the value of any input changes
for _control in (
    select_user_id,
    select_rec_type,
    select_genres,
    select_years,
    select_top_n,
    min_votes_slider,
):
    _control.observe(update_ui_layout, names='value')

# Build the initial panel and the first result table
update_ui_layout()

# Two-column page: controls on the left, heading and results on the right
ui_layout = widgets.HBox([dynamic_controls, widgets.VBox([label_box, output_box])])

display(ui_layout)



  0%|          | 0/20 [00:00<?, ?it/s]

HBox(children=(VBox(children=(VBox(children=(Dropdown(description='User:', options=(np.int64(1), np.int64(2), …