<a href="https://colab.research.google.com/github/armangupta910/Movie-Recommendor-System/blob/main/MovieSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset Loading


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# URL of our CSV file
url = 'https://raw.githubusercontent.com/armangupta910/Movie-Recommendor-System/main/dataSet/complete_data%20-%20complete_data%20(1).csv.csv'

#reading the file and storing into a dataframe
df = pd.read_csv(url)

# Displaying the first few rows of the DataFrame
display(df.head())

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,


Implementation of SVR from scratch


(Issue-Large Running Time

Reason--no optimization)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Considering only the relevant columns
df = df[['userId', 'movieId', 'rating']]
df_subset = df.head(10000)

# Preprocessing data
#preprocessing mainly involves converting categorical variables into numerical representations
df_subset['userId'] = pd.factorize(df_subset['userId'])[0]
df_subset['movieId'] = pd.factorize(df_subset['movieId'])[0]

# Splitting data into features and target variable
X = df_subset[['userId', 'movieId']]
y = df_subset['rating']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Implementing SVR from scratch
class SVRFromScratch:
    def __init__(self, epsilon=0.1, C=1.0, kernel='linear', max_iter=1000):
        self.epsilon = epsilon
        self.C = C
        self.kernel = kernel
        self.max_iter = max_iter

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

        n_samples, n_features = X.shape

        #alpha, b and error cache
        self.alpha = np.zeros(n_samples)
        self.b = 0
        self.error_cache = np.zeros(n_samples)

        # Kernel matrix
        self.K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                self.K[i, j] = self._kernel_function(X[i], X[j])

        # Training loop
        for _ in range(self.max_iter):
            for i in range(n_samples):
                self._train_step(i)

    def predict(self, X):
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples)
        for i in range(n_samples):
            predictions[i] = self._predict_single(X[i])
        return predictions

    def _train_step(self, i):
        Ei = self._predict_single(self.X_train[i]) - self.y_train[i]
        self.error_cache[i] = Ei

        # Updating alpha
        if (self.y_train[i] * Ei < -self.epsilon and self.alpha[i] < self.C) or \
                (self.y_train[i] * Ei > self.epsilon and self.alpha[i] > 0):
            j = self._select_random_j(i)
            Ej = self._predict_single(self.X_train[j]) - self.y_train[j]

            alpha_i_old = self.alpha[i]
            alpha_j_old = self.alpha[j]

            L, H = self._compute_L_H(self.y_train[i], self.y_train[j], alpha_i_old, alpha_j_old)

            if L == H:
                return

            eta = 2.0 * self.K[i, j] - self.K[i, i] - self.K[j, j]
            if eta >= 0:
                return

            alpha_j_new = alpha_j_old - (self.y_train[j] * (Ei - Ej)) / eta
            alpha_j_new = min(H, alpha_j_new)
            alpha_j_new = max(L, alpha_j_new)

            if abs(alpha_j_new - alpha_j_old) < 0.00001:
                return

            alpha_i_new = alpha_i_old + self.y_train[i] * self.y_train[j] * (alpha_j_old - alpha_j_new)

            b1 = self.b - Ei - self.y_train[i] * (alpha_i_new - alpha_i_old) * self.K[i, i] - \
                 self.y_train[j] * (alpha_j_new - alpha_j_old) * self.K[i, j]
            b2 = self.b - Ej - self.y_train[i] * (alpha_i_new - alpha_i_old) * self.K[i, j] - \
                 self.y_train[j] * (alpha_j_new - alpha_j_old) * self.K[j, j]

            if 0 < alpha_i_new < self.C:
                self.b = b1
            elif 0 < alpha_j_new < self.C:
                self.b = b2
            else:
                self.b = (b1 + b2) / 2.0

            self.alpha[i] = alpha_i_new
            self.alpha[j] = alpha_j_new

    def _predict_single(self, x):
        prediction = self.b
        for i in range(len(self.alpha)):
            prediction += self.alpha[i] * self.y_train[i] * self._kernel_function(self.X_train[i], x)
        return prediction

    def _kernel_function(self, x1, x2):
        if self.kernel == 'linear':
            return np.dot(x1, x2)
        elif self.kernel == 'rbf':
            sigma = 1.0
            return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * (sigma ** 2)))
        else:
            raise ValueError("Unsupported kernel type")

    def _select_random_j(self, i):
        j = i
        while j == i:
            j = np.random.randint(0, len(self.alpha))
        return j

    def _compute_L_H(self, yi, yj, alpha_i, alpha_j):
        if yi != yj:
            return max(0, alpha_j - alpha_i), min(self.C, self.C + alpha_j - alpha_i)
        else:
            return max(0, alpha_i + alpha_j - self.C), min(self.C, alpha_i + alpha_j)

# Training SVR model
svr_model = SVRFromScratch(kernel='linear', C=1.0)
svr_model.fit(X_train.to_numpy(), y_train.to_numpy())

# Predicting ratings on test set
y_pred = svr_model.predict(X_test.to_numpy())

# Evaluating model by comparing predicted and actual ratings of movies
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)



After doing Dimensionality reduction also,the problem of larger execution time persists!

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA
from joblib import Parallel, delayed

# Considering only a subset of the data (e.g., first 10000 rows)
df_subset = df.head(10000).copy()  # Use .copy() to avoid modifying the original DataFrame

# Preprocessing data using .loc
df_subset.loc[:, 'userId'] = pd.factorize(df_subset['userId'])[0]
df_subset.loc[:, 'movieId'] = pd.factorize(df_subset['movieId'])[0]

# Splitting data into features and target variable
X = df_subset[['userId', 'movieId']]
y = df_subset['rating']

#  PCA to reduce dimensionality
pca = PCA(n_components=2)  # Specify the desired number of components
X_pca = pca.fit_transform(X)

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Defining SVR model initialization and training function
def train_svr(X_train, y_train):
    svr_model = SVR(kernel='linear', C=1.0)
    svr_model.fit(X_train, y_train)
    return svr_model

# Initializing SVR models in parallel
svr_models = Parallel(n_jobs=-1)(delayed(train_svr)(X_train, y_train) for _ in range(10))

# Predicting ratings on the test set using all SVR models
y_preds = [svr_model.predict(X_test) for svr_model in svr_models]

# ensemble predictions calculation as the average of individual predictions
y_pred_ensemble = sum(y_preds) / len(y_preds)

# Evaluating ensemble model
mse = mean_squared_error(y_test, y_pred_ensemble)
mae = mean_absolute_error(y_test, y_pred_ensemble)
r2 = r2_score(y_test, y_pred_ensemble)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)




Finally!!
Using a combination of Filtering Techniques and SVR

In this implementation, Support Vector Regression (SVR) is implemented using the scikit-learn library. The scikit-learn library is chosen because it provides optimized and efficient implementations of machine learning algorithms, including SVR. These implementations are designed to handle large datasets effectively and leverage parallel processing capabilities. By utilizing the scikit-learn library, the SVR algorithm benefits from optimized code and performance enhancements, making it a suitable choice for this application.

In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Creating user-item matrix
user_item_matrix = pd.pivot_table(df, index='userId', columns='movieId', values='rating')
# Handling missing values(fill empty cells with 0)
user_item_matrix = user_item_matrix.fillna(0)

# Collaborative Filtering
# User-Based CF
user_cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
user_cf_model.fit(user_item_matrix.values)
# Item-Based CF
item_cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
item_cf_model.fit(user_item_matrix.T.values)

# Content-Based Filtering
# Feature Extraction (using movie titles)
count_vectorizer = CountVectorizer(stop_words='english')
movie_title_matrix = count_vectorizer.fit_transform(df['title'])

# Latent Dirichlet Allocation
lda_model = LatentDirichletAllocation(n_components=10, random_state=29)
movie_topics = lda_model.fit_transform(movie_title_matrix)
# Profile Building (using average ratings)
user_profiles = df.groupby('userId')['rating'].mean()

#using user_cf_model, item_cf_model, movie_topics, and user_profiles for recommendations
X_train, X_test, y_train, y_test = train_test_split(user_item_matrix.values, user_item_matrix.values, test_size=0.2, random_state=29)

# Flattening the user-item matrices for SVR
X_train_flat = X_train.flatten()
X_test_flat = X_test.flatten()
y_train_flat = y_train.flatten()
y_test_flat = y_test.flatten()

# Initializing SVR model
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.2)
# Training SVR model
svr_model.fit(X_train_flat.reshape(-1, 1), y_train_flat)
# Making predictions
svr_predictions = svr_model.predict(X_test_flat.reshape(-1, 1))
# Evaluate SVR model(comparing predicted and already given ratings of movies)
mse = mean_squared_error(y_test_flat, svr_predictions)
mae = mean_absolute_error(y_test_flat, svr_predictions)
r2 = r2_score(y_test_flat, svr_predictions)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)



Mean Squared Error: 0.04008941257973813
Mean Absolute Error: 0.20014085315876584
R^2 Score: 0.8363771051680233


Peforming Cross Validation

In [3]:
#cross validation
!pip install scikit-surprise
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate

# Loading data into Surprise format
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
# Defining collaborative filtering algorithm
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
# Performing cross-validation
cv_results = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

# Display cross-validation results
print("Cross-validation results:")
for key, value in cv_results.items():
    print(key, ':', value)

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/772.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m522.2/772.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162986 sha256=401b146c82b0f58299a27096b676b98793d5bd2e8510f54933e3f3be6e4b65d9
  Stored in directory: /root/.cache/pip

Use Functionality of the Model-

Taking input of user IDs and predicting movies suitable to the user based on his/her ratings given to movies present in the dataset

In [8]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
import pandas as pd

def recommend_movies(user_id, user_item_matrix, movie_topics, user_profiles, num_recommendations=5):
    if user_id not in user_item_matrix.index:
        print(f"User ID {user_id} not found.")
        return

    # Loading data into Surprise format
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

    trainset, _ = train_test_split(data, test_size=0.2, random_state=29)

    # Training collaborative filtering model
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)

    # Getting top N recommendations
    user_movies = df[df['userId'] == user_id]['movieId']
    unrated_movies = user_item_matrix.columns[~user_item_matrix.columns.isin(user_movies)]
    testset = [[user_id, movie_id, 4] for movie_id in unrated_movies]
    predictions = algo.test(testset)
    top_recs = sorted(predictions, key=lambda x: x.est, reverse=True)[:num_recommendations]
    top_movie_ids = [rec.iid for rec in top_recs]

    # Removing duplicates
    top_movie_ids = list(set(top_movie_ids))

    # Getting unique movie titles for recommended movies
    recommended_movies = df[df['movieId'].isin(top_movie_ids)][['title']]
    unique_movie_titles = recommended_movies['title'].unique()

    # Printing unique movie titles
    print("Recommended Movies:")
    for title in unique_movie_titles:
        print(title)

# Example usage:
user_id = int(input("Enter a user ID: "))
recommend_movies(user_id, user_item_matrix, movie_topics, user_profiles)

Enter a user ID: 222
Computing the cosine similarity matrix...
Done computing similarity matrix.
Recommended Movies:
Lamerica (1994)
Colonel Chabert, Le (1994)
World of Apu, The (Apur Sansar) (1959)
Supercop 2 (Project S) (Chao ji ji hua) (1993)
In the Realm of the Senses (Ai no corrida) (1976)
