In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# CSV files read later in this notebook are reachable through the filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install scikit-surprise directly: the 'surprise' name on PyPI is only a tiny
# wrapper wheel that pulls scikit-surprise in. The version is pinned to the one
# this notebook was run with, and %pip targets the running kernel's environment.
%pip install scikit-surprise==1.1.4

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357282 sha256=11b4cf745de7aee2a58d7901d47c2cccc1f0c4b21d72a3338279f5dac7f0798c
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Load the movie metadata and the user ratings (no sampling is applied here)
movies = pd.read_csv('/content/drive/MyDrive/mvrec/movies.csv')
ratings = pd.read_csv('/content/drive/MyDrive/mvrec/ratings.csv')

# Content-based side: TF-IDF over each movie's genre text.
# Missing genres are replaced by an empty string before vectorizing.
# NOTE(review): the default tokenizer splits on non-word characters, so a
# hyphenated genre would become two tokens, and stop_words='english' drops
# common English words -- confirm this is intended for the genre vocabulary.
movies['genres'] = movies['genres'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Pairwise genre similarity between every pair of movies. The TF-IDF matrix is
# sparse, but this result has one row and one column per movie, so its memory
# use grows quadratically with the catalogue size.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Collaborative side: wrap the ratings for Surprise (ratings span 0.5 to 5.0)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Hold out 25% of the ratings; the split is seeded for reproducibility
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# SVD with a reduced number of latent factors to save memory. The seed makes
# the learned factors (and therefore every predicted rating below) repeatable
# across Restart & Run All, matching the seeded train/test split above.
svd = SVD(n_factors=50, random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7dd29747f820>

In [4]:
import re

# Assuming your titles are in the format "Movie Title (Year)"
def extract_year(title):
    """Return the release year embedded in a title such as "Movie Title (Year)".

    Parameters
    ----------
    title : str
        Movie title. Anything that is not a string (for example a missing
        value represented as NaN) is treated as having no year.

    Returns
    -------
    int or None
        The four-digit number from the last "(dddd)" group in the title, or
        None when the title contains no such group or is not a string.
    """
    # Missing titles arrive as float NaN; re.search/findall would raise on them.
    if not isinstance(title, str):
        return None

    # The year sits at the end of the title, so when several parenthesised
    # four-digit groups are present the last one is the release year.
    years = re.findall(r'\((\d{4})\)', title)
    return int(years[-1]) if years else None

# Derive a 'year' column by running extract_year over every title
movies['year'] = movies['title'].map(extract_year)

# Show the first rows to confirm that 'year' was populated
print(movies.loc[:, ['title', 'year']].head())


                                title    year
0                    Toy Story (1995)  1995.0
1                      Jumanji (1995)  1995.0
2             Grumpier Old Men (1995)  1995.0
3            Waiting to Exhale (1995)  1995.0
4  Father of the Bride Part II (1995)  1995.0


In [5]:
def hybrid_recommendations(user_id, movie_id, movies_df, svd_model, cosine_sim_matrix, top_n=10):
    """Recommend movies similar to `movie_id`, ranked by the user's predicted rating.

    The candidates are the `top_n` movies with the highest similarity to the
    query movie (the query movie itself is never a candidate). Each candidate
    is then scored with `svd_model.predict(user_id, candidate_id).est`.

    Note: a later cell in this notebook defines another function with the same
    name; once that cell runs, it replaces this definition.

    Parameters
    ----------
    user_id : user identifier passed through to `svd_model.predict`.
    movie_id : value looked up in `movies_df['movieId']`.
    movies_df : DataFrame with 'movieId' and 'title' columns. Row *position*
        k must correspond to row/column k of `cosine_sim_matrix`.
    svd_model : object whose `predict(user_id, movie_id)` returns something
        with an `.est` attribute.
    cosine_sim_matrix : square, position-aligned similarity matrix.
    top_n : int, default 10
        Number of most-similar candidates to score.

    Returns
    -------
    list of (title, cf_score, similarity) tuples sorted by cf_score, then
    similarity, both descending; or an error string when `movie_id` is not
    present in `movies_df`.
    """
    # Locate the movie by row POSITION: the similarity matrix and .iloc are
    # positional, so an index label would be wrong for any non-default index.
    matches = movies_df['movieId'].eq(movie_id).to_numpy()
    if not matches.any():
        return f"Error: Movie ID '{movie_id}' not found in the dataset."
    idx = int(matches.argmax())

    # Rank every OTHER movie by similarity. The query movie is removed
    # explicitly: when other movies tie with it at the top similarity,
    # dropping "the first sorted entry" can discard the wrong movie.
    sim_scores = [(pos, score)
                  for pos, score in enumerate(cosine_sim_matrix[idx])
                  if pos != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:top_n]
    movie_indices = [pos for pos, _ in sim_scores]

    # Look up the candidates once and score each with the collaborative model
    similar_movies = movies_df.iloc[movie_indices]
    cf_scores = [svd_model.predict(user_id, mid).est for mid in similar_movies['movieId']]

    # Combine: predicted rating is the primary sort key, similarity breaks ties
    hybrid_scores = [(title, cf_score, sim_score)
                     for title, cf_score, (_, sim_score)
                     in zip(similar_movies['title'], cf_scores, sim_scores)]
    hybrid_scores.sort(key=lambda rec: (rec[1], rec[2]), reverse=True)

    return hybrid_scores

# Demo run: one user, one seed movie
user_id, movie_id = 1, 5  # movie_id is matched against the 'movieId' column
recommendations = hybrid_recommendations(user_id, movie_id, movies, svd, cosine_sim)

# A plain string means the lookup failed; otherwise we get a list of tuples
if not isinstance(recommendations, str):
    for title, cf_score, sim_score in recommendations:
        print(f"Movie: {title}, CF Score: {cf_score:.2f}, Similarity: {sim_score:.2f}")
else:
    print(recommendations)


Movie: Friday (1995), CF Score: 4.68, Similarity: 1.00
Movie: Flirting With Disaster (1996), CF Score: 4.43, Similarity: 1.00
Movie: Four Rooms (1995), CF Score: 4.42, Similarity: 1.00
Movie: Happy Gilmore (1996), CF Score: 4.27, Similarity: 1.00
Movie: Steal Big, Steal Little (1995), CF Score: 4.27, Similarity: 1.00
Movie: Black Sheep (1996), CF Score: 4.22, Similarity: 1.00
Movie: Down Periscope (1996), CF Score: 3.98, Similarity: 1.00
Movie: Mr. Wrong (1996), CF Score: 3.97, Similarity: 1.00
Movie: Ace Ventura: When Nature Calls (1995), CF Score: 3.52, Similarity: 1.00
Movie: Bio-Dome (1996), CF Score: 3.42, Similarity: 1.00


In [12]:
import numpy as np

def calculate_year_weight(year, min_year, max_year):
    """Map a release year to a recency weight by squaring its min-max position.

    The year is first normalised as (year - min_year) / (max_year - min_year)
    and then squared, so later years receive disproportionately more weight
    than earlier ones.

    Parameters
    ----------
    year : number or None
        Release year of the movie; None or NaN means "unknown".
    min_year, max_year : number
        Bounds used for the normalisation.

    Returns
    -------
    float
        The squared normalised year. 0.0 is returned when the year is unknown
        or when the bounds do not span a usable range (equal or NaN), so the
        caller never receives NaN or hits a division by zero.
    """
    import math  # local import keeps this helper self-contained in the notebook

    # Unknown year: give no recency bonus instead of propagating NaN
    if year is None or math.isnan(year):
        return 0.0

    # Degenerate range: every movie shares one year (or the bounds are missing)
    year_span = max_year - min_year
    if not year_span or math.isnan(year_span):
        return 0.0

    # Normalise, then square for the non-linear emphasis on recent years
    normalized_year = (year - min_year) / year_span
    return normalized_year ** 2

def hybrid_recommendations(user_id, movie_id, movies_df, svd_model, cosine_sim_matrix, top_n=10):
    """Recommend movies similar to `movie_id` using a weighted hybrid score.

    The candidates are the `top_n` movies most similar to the query movie
    (the query movie itself is never a candidate). Each candidate receives

        final = 0.5 * cf_score + 0.4 * similarity + 0.1 * year_weight

    where cf_score is `svd_model.predict(user_id, candidate_id).est` and
    year_weight comes from `calculate_year_weight`.

    NOTE(review): cf_score is on the rating scale while similarity and
    year_weight are computed on a 0-to-1 scale, so the predicted rating
    dominates the blend more than the 0.5/0.4/0.1 weights suggest -- confirm
    this is intended.

    Parameters
    ----------
    user_id : user identifier passed through to `svd_model.predict`.
    movie_id : value looked up in `movies_df['movieId']`.
    movies_df : DataFrame with 'movieId', 'title' and 'year' columns. Row
        *position* k must correspond to row/column k of `cosine_sim_matrix`.
    svd_model : object whose `predict(user_id, movie_id)` returns something
        with an `.est` attribute.
    cosine_sim_matrix : square, position-aligned similarity matrix.
    top_n : int, default 10
        Number of most-similar candidates to score.

    Returns
    -------
    list of (title, final_score, cf_score, similarity, year_weight) tuples
    sorted by final_score descending; or an error string when `movie_id` is
    not present in `movies_df`.
    """
    # Locate the movie by row POSITION: the similarity matrix and .iloc are
    # positional, so an index label would be wrong for any non-default index.
    matches = movies_df['movieId'].eq(movie_id).to_numpy()
    if not matches.any():
        return f"Error: Movie ID '{movie_id}' not found in the dataset."
    idx = int(matches.argmax())

    # Rank every OTHER movie by similarity. The query movie is removed
    # explicitly: when other movies tie with it at the top similarity,
    # dropping "the first sorted entry" can discard the wrong movie.
    sim_scores = [(pos, score)
                  for pos, score in enumerate(cosine_sim_matrix[idx])
                  if pos != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:top_n]
    movie_indices = [pos for pos, _ in sim_scores]

    # Candidate rows and their collaborative-filtering scores
    similar_movies = movies_df.iloc[movie_indices]
    cf_scores = [svd_model.predict(user_id, mid).est for mid in similar_movies['movieId']]

    # Year range over the whole catalogue, used to normalise each candidate
    min_year = movies_df['year'].min()
    max_year = movies_df['year'].max()

    hybrid_scores = []
    candidates = zip(similar_movies['title'], similar_movies['year'], cf_scores, sim_scores)
    for title, year, cf_score, (_, sim_score) in candidates:
        year_weight = calculate_year_weight(year, min_year, max_year)
        # A missing year (or a degenerate year range) yields NaN; treat it as
        # "no recency bonus" so the final score stays a sortable number.
        if pd.isna(year_weight):
            year_weight = 0.0

        # Final score = (CF Score * 0.5) + (Similarity Score * 0.4) + (Year Weight * 0.1)
        final_score = (cf_score * 0.5) + (sim_score * 0.4) + (year_weight * 0.1)

        hybrid_scores.append((title, final_score, cf_score, sim_score, year_weight))

    # Best final score first
    hybrid_scores.sort(key=lambda rec: rec[1], reverse=True)

    return hybrid_scores

# Demo run of the year-weighted variant: one user, one seed movie
user_id, movie_id = 1, 15  # movie_id is matched against the 'movieId' column
recommendations = hybrid_recommendations(user_id, movie_id, movies, svd, cosine_sim)

# A plain string means the lookup failed; otherwise we get a list of tuples
if not isinstance(recommendations, str):
    for title, final_score, cf_score, sim_score, year_weight in recommendations:
        print(f"Movie: {title}, Final Score: {final_score:.2f}, CF Score: {cf_score:.2f}, Similarity: {sim_score:.2f}, Year Weight: {year_weight:.2f}")
else:
    print(recommendations)


Movie: Adventures of Robin Hood, The (1938), Final Score: 2.72, CF Score: 4.62, Similarity: 1.00, Year Weight: 0.10
Movie: Host, The (2013), Final Score: 2.62, CF Score: 4.26, Similarity: 1.00, Year Weight: 0.92
Movie: Captain Blood (1935), Final Score: 2.60, CF Score: 4.38, Similarity: 1.00, Year Weight: 0.08
Movie: Eight Below (2006), Final Score: 2.57, CF Score: 4.23, Similarity: 0.94, Year Weight: 0.80
Movie: Helen of Troy (2003), Final Score: 2.55, CF Score: 4.21, Similarity: 0.94, Year Weight: 0.76
Movie: King Solomon's Mines (1950), Final Score: 2.55, CF Score: 4.26, Similarity: 1.00, Year Weight: 0.17
Movie: Three Musketeers, The (1948), Final Score: 2.51, CF Score: 4.24, Similarity: 0.94, Year Weight: 0.16
Movie: Three Musketeers, The (1993), Final Score: 2.40, CF Score: 3.94, Similarity: 0.93, Year Weight: 0.62
Movie: Musketeer, The (2001), Final Score: 2.39, CF Score: 3.88, Similarity: 0.94, Year Weight: 0.73
Movie: Jewel of the Nile, The (1985), Final Score: 2.30, CF Score: