<a href="https://colab.research.google.com/github/armangupta910/Movie-Recommendor-System/blob/main/MovieSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset Loading


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Raw GitHub URL of the dataset CSV; replace it with the raw URL of your own
# CSV file if the data lives elsewhere.
url = 'https://raw.githubusercontent.com/armangupta910/Movie-Recommendor-System/main/dataSet/complete_data%20-%20complete_data%20(1).csv.csv'

# Download and parse the CSV into a DataFrame. `df` is a notebook-wide global
# that later cells (cross-validation, recommendation helpers) read directly.
df = pd.read_csv(url)

# Preview the first five rows with the rich notebook display.
display(df.head())

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,


Implementation of SVR from scratch


(Issue: very long running time, because this implementation is not optimized.)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the MovieLens dataset
url = 'https://raw.githubusercontent.com/armangupta910/Movie-Recommendor-System/main/dataSet/complete_data%20-%20complete_data%20(1).csv.csv'
df = pd.read_csv(url)

# Consider only the relevant columns
df = df[['userId', 'movieId', 'rating']]
# Take an independent copy of the first 10,000 rows: assigning columns on the
# bare result of .head() raises SettingWithCopyWarning and may write into a
# view of `df` instead of a separate frame.
df_subset = df.head(10000).copy()

# Preprocess data
# Re-encode the raw ids as dense 0-based integer codes (order of first appearance)
df_subset['userId'] = pd.factorize(df_subset['userId'])[0]
df_subset['movieId'] = pd.factorize(df_subset['movieId'])[0]

# Split data into features and target variable
X = df_subset[['userId', 'movieId']]
y = df_subset['rating']


# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement SVR from scratch
class SVRFromScratch:
    """Kernel regressor trained with a simplified SMO-style pairwise update.

    The prediction for a sample ``x`` is
    ``b + sum_k alpha[k] * y_train[k] * K(X_train[k], x)``.

    NOTE(review): the update rule (``y * E`` tests, ``L``/``H`` box bounds
    chosen from ``yi != yj``) has the shape of simplified SMO for binary
    classification with labels in {-1, +1}; with real-valued targets it is
    presumably not the epsilon-insensitive SVR dual. The rule is kept as
    written -- confirm the intended algorithm.

    Parameters
    ----------
    epsilon : float
        Tolerance used when deciding whether a sample's alpha needs updating.
    C : float
        Upper bound of the box constraint on every alpha.
    kernel : {'linear', 'rbf'}
        Kernel type; anything else raises ``ValueError`` when evaluated.
    max_iter : int
        Number of full passes over the training samples.
    sigma : float
        Width of the RBF kernel (ignored by the linear kernel).
    random_state : int or None
        Seed for choosing the second sample of each pair. ``None`` keeps the
        historical behaviour of drawing from NumPy's global random state.
    """

    def __init__(self, epsilon=0.1, C=1.0, kernel='linear', max_iter=1000,
                 sigma=1.0, random_state=None):
        self.epsilon = epsilon
        self.C = C
        self.kernel = kernel
        self.max_iter = max_iter
        self.sigma = sigma
        self.random_state = random_state
        # A private generator is created only when a seed is supplied; the
        # np.random module itself is never stored so instances stay picklable.
        self._rng = None if random_state is None else np.random.RandomState(random_state)

    def fit(self, X, y):
        """Train on ``X`` (n_samples, n_features) and targets ``y`` (n_samples,).

        Inputs are converted to float NumPy arrays, so array-likes with a
        non-positional index are indexed by position. Returns ``self``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        self.X_train = X
        self.y_train = y

        n_samples, n_features = X.shape

        # Initialize alpha, b and error cache
        self.alpha = np.zeros(n_samples)
        self.b = 0
        self.error_cache = np.zeros(n_samples)

        # Kernel (Gram) matrix, computed in one vectorised call instead of an
        # n_samples x n_samples Python double loop.
        self.K = self._kernel_matrix(X, X)

        # Training loop: max_iter sweeps, one pairwise update attempt per sample
        for _ in range(self.max_iter):
            for i in range(n_samples):
                self._train_step(i)
        return self

    def predict(self, X):
        """Return predictions for every row of ``X`` as a 1-D float array."""
        coefficients = self.alpha * self.y_train
        return self.b + self._kernel_matrix(X, self.X_train) @ coefficients

    def _train_step(self, i):
        """Attempt one pairwise alpha update with sample ``i`` as first index."""
        y = self.y_train
        coefficients = self.alpha * y
        # Training-set predictions reuse the cached Gram matrix row instead of
        # re-evaluating the kernel against every training sample.
        Ei = self.b + coefficients @ self.K[i] - y[i]
        self.error_cache[i] = Ei

        # Update alpha only when sample i violates the tolerance conditions
        if (y[i] * Ei < -self.epsilon and self.alpha[i] < self.C) or \
                (y[i] * Ei > self.epsilon and self.alpha[i] > 0):
            j = self._select_random_j(i)
            Ej = self.b + coefficients @ self.K[j] - y[j]

            alpha_i_old = self.alpha[i]
            alpha_j_old = self.alpha[j]

            L, H = self._compute_L_H(y[i], y[j], alpha_i_old, alpha_j_old)

            # Degenerate box: alpha_j cannot move
            if L == H:
                return

            # eta is the second derivative along the update direction; the
            # step is taken only when it is strictly negative.
            eta = 2.0 * self.K[i, j] - self.K[i, i] - self.K[j, j]
            if eta >= 0:
                return

            alpha_j_new = alpha_j_old - (y[j] * (Ei - Ej)) / eta
            alpha_j_new = min(H, alpha_j_new)
            alpha_j_new = max(L, alpha_j_new)

            # Skip negligible moves
            if abs(alpha_j_new - alpha_j_old) < 0.00001:
                return

            alpha_i_new = alpha_i_old + y[i] * y[j] * (alpha_j_old - alpha_j_new)

            b1 = self.b - Ei - y[i] * (alpha_i_new - alpha_i_old) * self.K[i, i] - \
                 y[j] * (alpha_j_new - alpha_j_old) * self.K[i, j]
            b2 = self.b - Ej - y[i] * (alpha_i_new - alpha_i_old) * self.K[i, j] - \
                 y[j] * (alpha_j_new - alpha_j_old) * self.K[j, j]

            # Prefer the bias computed from whichever alpha is strictly inside the box
            if 0 < alpha_i_new < self.C:
                self.b = b1
            elif 0 < alpha_j_new < self.C:
                self.b = b2
            else:
                self.b = (b1 + b2) / 2.0

            self.alpha[i] = alpha_i_new
            self.alpha[j] = alpha_j_new

    def _predict_single(self, x):
        """Return the prediction for one feature vector ``x``."""
        kernel_row = self._kernel_matrix(x, self.X_train)[0]
        return self.b + kernel_row @ (self.alpha * self.y_train)

    def _kernel_function(self, x1, x2):
        """Evaluate the configured kernel for a single pair of vectors."""
        if self.kernel == 'linear':
            return np.dot(x1, x2)
        elif self.kernel == 'rbf':
            return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * (self.sigma ** 2)))
        else:
            raise ValueError("Unsupported kernel type")

    def _kernel_matrix(self, A, B):
        """Return the matrix of kernel values between rows of ``A`` and rows of ``B``.

        1-D inputs are treated as a single row.
        """
        A = np.atleast_2d(np.asarray(A, dtype=float))
        B = np.atleast_2d(np.asarray(B, dtype=float))
        if self.kernel == 'linear':
            return A @ B.T
        elif self.kernel == 'rbf':
            # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b ; clip tiny negatives
            # produced by floating-point cancellation.
            sq_dist = (A ** 2).sum(axis=1)[:, None] + (B ** 2).sum(axis=1)[None, :] - 2.0 * (A @ B.T)
            np.maximum(sq_dist, 0.0, out=sq_dist)
            return np.exp(-sq_dist / (2 * (self.sigma ** 2)))
        else:
            raise ValueError("Unsupported kernel type")

    def _select_random_j(self, i):
        """Pick a random training index different from ``i``.

        Raises
        ------
        ValueError
            If fewer than two training samples exist (the rejection loop
            below could otherwise never terminate).
        """
        n_samples = len(self.alpha)
        if n_samples < 2:
            raise ValueError("SMO pair selection requires at least two training samples")
        rng = self._rng if self._rng is not None else np.random
        j = i
        while j == i:
            j = rng.randint(0, n_samples)
        return j

    def _compute_L_H(self, yi, yj, alpha_i, alpha_j):
        """Return the lower/upper clipping bounds ``(L, H)`` for the new alpha_j."""
        if yi != yj:
            return max(0, alpha_j - alpha_i), min(self.C, self.C + alpha_j - alpha_i)
        else:
            return max(0, alpha_i + alpha_j - self.C), min(self.C, alpha_i + alpha_j)

# Fit the from-scratch SVR (linear kernel) on the training split as NumPy arrays
svr_model = SVRFromScratch(C=1.0, kernel='linear')
svr_model.fit(X_train.to_numpy(), y_train.to_numpy())

# Score the held-out split
y_pred = svr_model.predict(X_test.to_numpy())

# Compute the three regression metrics in one pass
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report each metric on its own line
for label, value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R^2 Score:", r2),
):
    print(label, value)



Even after applying dimensionality reduction, the problem persists!

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA
from joblib import Parallel, delayed

# Load the MovieLens dataset
url = 'https://raw.githubusercontent.com/armangupta910/Movie-Recommendor-System/main/dataSet/complete_data%20-%20complete_data%20(1).csv.csv'
df = pd.read_csv(url)

# Consider only a subset of the data (e.g., first 10000 rows)
df_subset = df.head(10000).copy()  # Use .copy() to avoid modifying the original DataFrame

# Preprocess data using .loc: re-encode the raw ids as dense 0-based integer codes
df_subset.loc[:, 'userId'] = pd.factorize(df_subset['userId'])[0]
df_subset.loc[:, 'movieId'] = pd.factorize(df_subset['movieId'])[0]

# Split data into features and target variable
X = df_subset[['userId', 'movieId']]
y = df_subset['rating']

# Apply PCA
# NOTE(review): X has two columns and n_components=2, so this keeps every
# component -- it re-expresses the features without reducing their number.
# NOTE(review): PCA is fitted on all rows before the train/test split below,
# so the test rows influence the transform -- confirm whether that is intended.
pca = PCA(n_components=2)  # Specify the desired number of components
X_pca = pca.fit_transform(X)

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Define SVR model initialization and training function
def train_svr(X_train, y_train):
    """Fit a linear-kernel SVR (C=1.0) on the given data and return the fitted model."""
    regressor = SVR(C=1.0, kernel='linear')
    regressor.fit(X_train, y_train)
    return regressor

# Train the SVR model.
# The previous version fitted ten models through joblib on exactly the same
# training data with the same hyper-parameters. SVR exposes no random_state
# and its fit is deterministic, so all ten models -- and therefore their
# average -- were identical to a single model at ten times the cost.
# `svr_models` stays a list so the averaging code below keeps its shape.
svr_models = [train_svr(X_train, y_train)]

# Predict ratings on the test set using all SVR models
y_preds = [svr_model.predict(X_test) for svr_model in svr_models]

# Calculate ensemble predictions as the average of individual predictions
y_pred_ensemble = sum(y_preds) / len(y_preds)

# Evaluate ensemble model
mse = mean_squared_error(y_test, y_pred_ensemble)
mae = mean_absolute_error(y_test, y_pred_ensemble)
r2 = r2_score(y_test, y_pred_ensemble)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)


Finally!!
Using a combination of Filtering Techniques and SVR

SVR is implemented using the sklearn library because:

Optimized libraries and implementations of SVR, such as those provided by scikit-learn or other machine learning frameworks, are designed to efficiently handle large datasets and leverage parallel processing capabilities.

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the MovieLens dataset
url = 'https://raw.githubusercontent.com/armangupta910/Movie-Recommendor-System/main/dataSet/complete_data%20-%20complete_data%20(1).csv.csv'
df = pd.read_csv(url)

# Create user-item matrix: one row per userId, one column per movieId, cell = rating
# (pivot_table aggregates repeated user/movie pairs with its default, the mean)
user_item_matrix = pd.pivot_table(df, index='userId', columns='movieId', values='rating')
# Handle missing values: unrated user/movie pairs become 0
user_item_matrix = user_item_matrix.fillna(0)

# Collaborative Filtering
# User-Based CF: cosine nearest neighbours over user rating vectors
# NOTE(review): none of the cells visible here query user_cf_model or
# item_cf_model after fitting -- confirm whether they are still needed.
user_cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
user_cf_model.fit(user_item_matrix.values)

# Item-Based CF: same model type fitted on the transposed matrix (movie rating vectors)
item_cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
item_cf_model.fit(user_item_matrix.T.values)

# Content-Based Filtering
# Feature Extraction (using movie titles for simplicity)
# NOTE(review): this vectorises df['title'] row by row, i.e. once per row of
# df rather than once per distinct movie, so the rows of movie_topics below
# line up with rows of df, not with movies -- confirm that is intended.
count_vectorizer = CountVectorizer(stop_words='english')
movie_title_matrix = count_vectorizer.fit_transform(df['title'])

# Latent Dirichlet Allocation: 10 topics over the title bag-of-words
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
movie_topics = lda_model.fit_transform(movie_title_matrix)

# Profile Building (using average ratings for simplicity): mean rating per user
user_profiles = df.groupby('userId')['rating'].mean()

# Now you can use user_cf_model, item_cf_model, movie_topics, and user_profiles for recommendations
# NOTE(review): the same array (user_item_matrix.values) is passed as both
# features and targets, so every X row equals its y row. After flattening,
# each single-feature sample is identical to its own target, and the scores
# printed below measure how well the SVR reproduces its input -- not how well
# it predicts unseen ratings. The 0 placeholders written by fillna(0) are
# part of both X and y as well. Confirm the intended features/targets.
X_train, X_test, y_train, y_test = train_test_split(user_item_matrix.values, user_item_matrix.values, test_size=0.2, random_state=42)

# Flatten the user-item matrices for SVR: every matrix cell becomes one sample
X_train_flat = X_train.flatten()
X_test_flat = X_test.flatten()
y_train_flat = y_train.flatten()
y_test_flat = y_test.flatten()

# Initialize SVR model
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.2)

# Train SVR model (reshape(-1, 1) turns the flat vector into a single-feature column)
svr_model.fit(X_train_flat.reshape(-1, 1), y_train_flat)

# Make predictions
svr_predictions = svr_model.predict(X_test_flat.reshape(-1, 1))

# Evaluate SVR model
mse = mean_squared_error(y_test_flat, svr_predictions)
mae = mean_absolute_error(y_test_flat, svr_predictions)
r2 = r2_score(y_test_flat, svr_predictions)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)



Mean Squared Error: 0.04008639806257075
Mean Absolute Error: 0.20003309354859022
R^2 Score: 0.8635910769608134


Performing Cross-Validation

In [None]:
#cross validation
# Reader, Dataset and KNNBasic are imported here because this cell uses them
# and no earlier cell imports them; on a fresh kernel the cell would
# otherwise fail with NameError. scikit-surprise must already be installed
# (in this notebook the install command appears in a later cell).
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate

# Load data into Surprise format
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# Define collaborative filtering algorithm: user-based KNN with cosine similarity
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)

# Perform 5-fold cross-validation, reporting RMSE per fold
cv_results = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

# Display cross-validation results
print("Cross-validation results:")
for key, value in cv_results.items():
    print(key, ':', value)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9713  0.9682  0.9615  0.9724  0.9664  0.9680  0.0039  
Fit time          0.31    0.20    0.23    0.22    0.28    0.25    0.04    
Test time         2.54    1.82    1.45    2.11    1.75    1.93    0.37    
Cross-validation results:
test_rmse : [0.97128527 0.96818571 0.96153045 0.97238633 0.96641928]
fit_time : (0.3117184638977051, 0.20479917526245117, 0.22610211372375488, 0.22267413139343262, 0.2798731327056885)
test_time : (2.538271188735962, 1.820768117904663, 1.448

Usage

Takes user IDs as input and recommends movies suited to each user based on their ratings.

In [None]:
!pip install scikit-surprise
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

def recommend_movies(user_ids, user_item_matrix, movie_topics, user_profiles, num_recommendations=5):
    """Recommend the top-N not-yet-rated movies for each user via user-based KNN.

    Parameters
    ----------
    user_ids : iterable
        User ids to produce recommendations for. Ids missing from
        ``user_item_matrix.index`` are reported with a message and skipped.
    user_item_matrix : pandas.DataFrame
        Users as index, movie ids as columns; the columns are the candidate pool.
    movie_topics, user_profiles
        Accepted for interface compatibility; not used by this function.
    num_recommendations : int
        Maximum number of movies returned per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``['userId', 'movieId', 'title']``: one row per recommended
        movie, labelled with the requesting user and ordered best estimate
        first within each user.

    Notes
    -----
    Reads the notebook-level ``df`` (columns userId, movieId, rating, title).
    The earlier version returned every rating row of the recommended movies,
    so the same title appeared many times under other users' ids, and it
    retrained an identical model for every requested user.
    """
    # Imported locally under an alias so the Surprise splitter is always used,
    # regardless of whether the global name `train_test_split` currently
    # refers to scikit-learn's or Surprise's function.
    from surprise import Dataset, Reader, KNNBasic
    from surprise.model_selection import train_test_split as surprise_train_test_split

    columns = ['userId', 'movieId', 'title']
    # One title per movie id; df repeats the title on every rating row.
    titles = df.drop_duplicates(subset='movieId').set_index('movieId')['title']

    algo = None
    records = []
    for user_id in user_ids:
        # Check if the user exists in the user-item matrix
        if user_id not in user_item_matrix.index:
            print(f"User ID {user_id} not found.")
            continue

        # The model does not depend on the user: fit it once, on first need.
        if algo is None:
            reader = Reader(rating_scale=(0.5, 5))
            data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
            trainset, _ = surprise_train_test_split(data, test_size=0.2, random_state=42)
            algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
            algo.fit(trainset)

        # Candidates are the matrix columns the user has no rating row for.
        user_movies = df[df['userId'] == user_id]['movieId']
        unrated_movies = user_item_matrix.columns[~user_item_matrix.columns.isin(user_movies)]
        # The third element (4) is a placeholder "true" rating required by
        # algo.test(); only the estimate `.est` is used afterwards.
        testset = [[user_id, movie_id, 4] for movie_id in unrated_movies]
        predictions = algo.test(testset)
        top_recs = sorted(predictions, key=lambda x: x.est, reverse=True)[:num_recommendations]

        # Candidates are unique columns, so no de-duplication is needed and
        # the ranking order is preserved.
        for rec in top_recs:
            records.append({'userId': user_id, 'movieId': rec.iid, 'title': titles.get(rec.iid)})

    return pd.DataFrame(records, columns=columns)

# Example usage: recommend for a list of user ids using the matrix, topics and
# profiles built in the earlier filtering cell.
user_ids = [2]  # Example user IDs, you can pass multiple user IDs here
recommended_movies = recommend_movies(user_ids, user_item_matrix, movie_topics, user_profiles)
print("Recommended Movies:")
print(recommended_movies)
# Visual separator between this output and the next example's output
print('-----------------------------------------------------------------------------------')

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

def recommend_movies_by_ratings(user_id, df, num_recommendations=5):
    """Recommend the top-N movies a user has not rated, using an SVD model.

    Parameters
    ----------
    user_id
        Id of the user to recommend for.
    df : pandas.DataFrame
        Ratings with columns ``userId``, ``movieId``, ``rating`` and ``title``.
    num_recommendations : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``['userId', 'movieId', 'title']``: one row per recommended
        movie, best estimate first, with ``userId`` set to ``user_id``.

    Notes
    -----
    The earlier version built its candidate list from every rating row, so a
    popular movie was scored once per rating and could fill the whole top-N
    on its own; it also returned all rating rows of the chosen movies and
    assigned into a slice of ``df``.
    """
    # Imported locally under an alias so the Surprise splitter is always used,
    # regardless of which `train_test_split` is bound globally.
    from surprise import Dataset, Reader, SVD
    from surprise.model_selection import train_test_split as surprise_train_test_split

    # Load data into Surprise format
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

    # Train on 80% of the ratings (the held-out part is not used here)
    trainset, _ = surprise_train_test_split(data, test_size=0.2, random_state=42)

    # Seeded so that repeated runs give the same recommendations
    algo = SVD(random_state=42)
    algo.fit(trainset)

    # Movies the user already rated
    user_movies = df.loc[df['userId'] == user_id, 'movieId']

    # Each distinct movie once, with its title
    catalogue = df[['movieId', 'title']].drop_duplicates(subset='movieId')
    unrated_movies = catalogue.loc[~catalogue['movieId'].isin(user_movies), 'movieId']

    # The third element (3) is a placeholder "true" rating required by
    # algo.test(); only the estimate `.est` is used afterwards.
    predictions = algo.test([(user_id, movie_id, 3) for movie_id in unrated_movies])

    # Sort predictions by estimated rating and select top N
    top_recs = sorted(predictions, key=lambda x: x.est, reverse=True)[:num_recommendations]

    titles = catalogue.set_index('movieId')['title']
    records = [
        {'userId': user_id, 'movieId': rec.iid, 'title': titles[rec.iid]}
        for rec in top_recs
    ]
    return pd.DataFrame(records, columns=['userId', 'movieId', 'title'])

# Example usage: SVD-based recommendations for a single user.
# NOTE: `user_id` stays defined as a notebook global after this cell and is
# read again by the feedback cell further down.
user_id = 2 # Example user ID
recommended_movies = recommend_movies_by_ratings(user_id, df)
print("Recommended Movies:")
print(recommended_movies)
# Visual separator after this example's output
print('-----------------------------------------------------------------------------------')


Computing the cosine similarity matrix...
Done computing similarity matrix.
Recommended Movies:
      userId movieId                            title
516      424      47      Seven (a.k.a. Se7en) (1995)
517      424      47      Seven (a.k.a. Se7en) (1995)
536      474      47      Seven (a.k.a. Se7en) (1995)
723      424      50       Usual Suspects, The (1995)
724      424      50       Usual Suspects, The (1995)
725      424      50       Usual Suspects, The (1995)
726      424      50       Usual Suspects, The (1995)
727      424      50       Usual Suspects, The (1995)
743      474      50       Usual Suspects, The (1995)
33726    424     628               Primal Fear (1996)
33727    424     628               Primal Fear (1996)
33728    424     628               Primal Fear (1996)
33729    424     628               Primal Fear (1996)
33730    424     628               Primal Fear (1996)
33735    474     628               Primal Fear (1996)
40917     62    2023  Godfather: Part II

Output with Feedback Mechanism


In [None]:
!pip install scikit-surprise
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

def recommend_movies(user_ids, user_item_matrix, movie_topics, user_profiles, num_recommendations=5):
    """Recommend the top-N not-yet-rated movies for each user via user-based KNN.

    This definition replaces the ``recommend_movies`` defined in the earlier
    usage cell once this cell has run.

    Parameters
    ----------
    user_ids : iterable
        User ids to produce recommendations for. Ids missing from
        ``user_item_matrix.index`` are reported with a message and skipped.
    user_item_matrix : pandas.DataFrame
        Users as index, movie ids as columns; the columns are the candidate pool.
    movie_topics, user_profiles
        Accepted for interface compatibility; not used by this function.
    num_recommendations : int
        Maximum number of movies returned per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``['title', 'userId', 'movieId']``: one row per recommended
        movie, labelled with the requesting user and ordered best estimate
        first within each user.

    Notes
    -----
    Reads the notebook-level ``df`` (columns userId, movieId, rating, title).
    The earlier version collected every rating row of the recommended movies
    and collapsed them with ``groupby('title').first()``, so ``userId`` showed
    an arbitrary other user who had rated the movie instead of the user the
    recommendation was made for. It also retrained an identical model for
    every requested user.
    """
    # Imported locally under an alias so the Surprise splitter is always used,
    # regardless of whether the global name `train_test_split` currently
    # refers to scikit-learn's or Surprise's function.
    from surprise import Dataset, Reader, KNNBasic
    from surprise.model_selection import train_test_split as surprise_train_test_split

    columns = ['title', 'userId', 'movieId']
    # One title per movie id; df repeats the title on every rating row.
    titles = df.drop_duplicates(subset='movieId').set_index('movieId')['title']

    algo = None
    records = []
    for user_id in user_ids:
        # Check if the user exists in the user-item matrix
        if user_id not in user_item_matrix.index:
            print(f"User ID {user_id} not found.")
            continue

        # The model does not depend on the user: fit it once, on first need.
        if algo is None:
            reader = Reader(rating_scale=(0.5, 5))
            data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
            trainset, _ = surprise_train_test_split(data, test_size=0.2, random_state=42)
            algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
            algo.fit(trainset)

        # Candidates are the matrix columns the user has no rating row for.
        user_movies = df[df['userId'] == user_id]['movieId']
        unrated_movies = user_item_matrix.columns[~user_item_matrix.columns.isin(user_movies)]
        # The third element (4) is a placeholder "true" rating required by
        # algo.test(); only the estimate `.est` is used afterwards.
        testset = [[user_id, movie_id, 4] for movie_id in unrated_movies]
        predictions = algo.test(testset)
        top_recs = sorted(predictions, key=lambda x: x.est, reverse=True)[:num_recommendations]

        for rec in top_recs:
            records.append({'title': titles.get(rec.iid), 'userId': user_id, 'movieId': rec.iid})

    return pd.DataFrame(records, columns=columns)

# Prompt user for input
user_input = input("Enter one or more user IDs separated by commas (e.g., '1, 2, 3'): ")
# Skip empty tokens so a trailing comma or blank input does not crash int();
# the loop variable is named `token` to avoid shadowing the built-in `id`.
user_ids = [int(token) for token in user_input.split(',') if token.strip()]

# Get recommended movies
recommended_movies = recommend_movies(user_ids, user_item_matrix, movie_topics, user_profiles)

# Display recommended movies
print("Recommended Movies:")
print(recommended_movies)

# Feedback Loop
def collect_user_feedback(user_ratings, user_item_matrix, user_id='new_user'):
    """Return a copy of ``user_item_matrix`` with the given feedback ratings applied.

    Parameters
    ----------
    user_ratings : dict
        Maps movie id to rating. Keys and values may be strings (as produced
        by parsing typed input): a string key that is not already a column
        label is converted to ``int`` when possible, and every rating is
        converted to ``float``.
    user_item_matrix : pandas.DataFrame
        Users as index, movie ids as columns. It is not modified.
    user_id
        Row label that receives the ratings; the row is created when missing.
        Defaults to ``'new_user'``.

    Returns
    -------
    pandas.DataFrame
        The updated copy.

    Notes
    -----
    The earlier version used string keys such as ``'2124'`` directly as
    column labels, which added new columns instead of updating the existing
    integer-labelled movie columns, and it modified the caller's matrix.
    """
    updated = user_item_matrix.copy()
    for movie_id, rating in user_ratings.items():
        if isinstance(movie_id, str) and movie_id not in updated.columns:
            try:
                movie_id = int(movie_id.strip())
            except ValueError:
                # Not numeric: keep the label as given
                pass
        updated.loc[user_id, movie_id] = float(rating)
    return updated

# Model Tuning
def retrain_model(user_item_matrix):
    """Fit a user-based cosine KNN model on the ratings stored in ``user_item_matrix``.

    Parameters
    ----------
    user_item_matrix : pandas.DataFrame
        Users as index, movie ids as columns, ratings as values. Cells that
        are 0, missing or non-numeric are treated as "not rated".

    Returns
    -------
    surprise.KNNBasic
        Model fitted on the full set of remaining ratings.

    Notes
    -----
    The earlier version stacked the whole matrix, so every 0 placeholder of
    an unrated user/movie pair was fed to the model as a real rating of 0,
    which lies outside the declared 0.5-5 scale. It also used KNNBasic's
    default similarity, whereas the other cells of this notebook configure
    cosine, user-based similarity.
    """
    # Long format: one value per (user, movie) pair. Coerce to numbers because
    # feedback typed by the user may have been stored as strings.
    long_ratings = pd.to_numeric(user_item_matrix.stack(), errors='coerce')
    # Keep real ratings only (drops the 0 placeholders and NaN)
    long_ratings = long_ratings[long_ratings > 0]
    ratings_frame = long_ratings.reset_index()
    ratings_frame.columns = ['userId', 'movieId', 'rating']

    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings_frame, reader)
    trainset = data.build_full_trainset()
    algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    algo.fit(trainset)
    return algo

# Prompt user for ratings feedback
user_ratings_feedback = input("Provide ratings feedback for recommended movies (movieId:rating, separated by comma): ")
# Parse "movieId:rating" pairs into {int movie id: float rating}. Converting
# here means the ids match the integer movie columns of the user-item matrix
# instead of creating new string-labelled columns; blank tokens are skipped.
user_ratings_dict = {}
for pair in user_ratings_feedback.split(','):
    if not pair.strip():
        continue
    movie_id_text, rating_text = pair.split(':')
    user_ratings_dict[int(movie_id_text)] = float(rating_text)

# Collect user feedback and update user-item matrix
user_item_matrix_updated = collect_user_feedback(user_ratings_dict, user_item_matrix)

# Retrain the model with updated data
algo_updated = retrain_model(user_item_matrix_updated)

# Get recommended movies for the users entered above.
# This previously passed [user_id], a global left over from an earlier example
# cell, rather than the ids typed into this cell.
# NOTE(review): `algo_updated` is not passed on; if recommend_movies trains its
# own model from `df` (as the definitions in this notebook appear to), the
# feedback cannot change the recommendations below -- confirm and wire the
# retrained model in if that is the intent.
recommended_movies_updated = recommend_movies(user_ids, user_item_matrix_updated, movie_topics, user_profiles)

# Display updated recommended movies
print("Updated Recommended Movies:")
print(recommended_movies_updated)




Enter one or more user IDs separated by commas (e.g., '1, 2, 3'): 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
Recommended Movies:
                             title userId movieId
0        Addams Family, The (1991)     62    2124
1  Godfather: Part III, The (1990)     62    2023
2               Primal Fear (1996)    424     628
3      Seven (a.k.a. Se7en) (1995)    424      47
4       Usual Suspects, The (1995)    424      50
Provide ratings feedback for recommended movies (movieId:rating, separated by comma): 2124:0,2023:0,628:0,47:0,50:0
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Updated Recommended Movies:
                             title userId movieId
0        Addams Family, The (1991)     62    2124
1  Godfather: Part III, The (1990)     62    2023
2               Primal Fear (1996)    424     628
3      Seven (a.k.a. Se7en) (1995)    424  