#Similarity based Collaborative filtering

In [105]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [106]:
movies_path = "/content/drive/MyDrive/SWM/Project_Data/movies.csv"
users_path = "/content/drive/MyDrive/SWM/Project_Data/users.csv"
ratings_path = "/content/drive/MyDrive/SWM/Project_Data/ratings.csv"

In [107]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [108]:
movies = pd.read_csv(movies_path)
users = pd.read_csv(users_path)
ratings = pd.read_csv(ratings_path)

movies.columns

Index(['movie id', 'movie title', 'release date', 'video release date',
       'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [109]:
def create_ratings_matrix(ratings):
  matrix = np.zeros((ratings["user id"].max(),ratings["item id"].max()))
  for index, row in ratings.iterrows():
    matrix[row["user id"]-1, row["item id"]-1] = row["rating"]
  return matrix
ratings_matrix = create_ratings_matrix(ratings)

In [110]:
ratings_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

# KNN based Collaborative Filtering


In [126]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

class KNNCollaborativeFiltering:
    def __init__(self, movies_df, users_df):
        self.users_df = users_df
        self.movies_df = movies_df
        self.user_knn = None
        self.movie_knn = None
        self.train_knn_models()

    def train_knn_models(self):
        user_features = self.prepare_user_features()
        self.user_knn = NearestNeighbors(metric='cosine')
        self.user_knn.fit(user_features)

        self.movie_knn = NearestNeighbors(metric='cosine')
        movie_features = self.prepare_movie_features()
        self.movie_knn.fit(movie_features)

    def prepare_user_features(self):
        user_features = self.users_df[['age', 'gender', 'occupation']]

        gender_encoder = OneHotEncoder(sparse=False)
        encoded_gender = gender_encoder.fit_transform(user_features[['gender']])
        occupation_encoder = OneHotEncoder(sparse=False)
        encoded_occupation = occupation_encoder.fit_transform(user_features[['occupation']])

        user_features['age'] = user_features['age'] / user_features['age'].max()

        user_features = np.concatenate([user_features[['age']].values, encoded_gender, encoded_occupation], axis=1)

        return user_features

    def prepare_movie_features(self):
        movie_features = self.movies_df.drop(['movie id', 'movie title', 'video release date', 'IMDb URL'], axis=1)

        movie_features['release_year'] = movie_features['release date'].apply(lambda x: int(x[-4:]) if isinstance(x, str) else np.nan)

        movie_features.drop('release date', axis=1, inplace=True)

        imputer = SimpleImputer(strategy='mean')
        movie_features = imputer.fit_transform(movie_features)

        return movie_features

    def recommend_movies(self, ratings_matrix, user_ids, num_recommendations=10):
        recommendations = []

        for user_id in user_ids:
            user_idx = user_id - 1
            rated_movies = np.where(ratings_matrix[user_idx] > 0)[0]

            if len(rated_movies) == 0:
                mean_ratings = np.mean(ratings_matrix, axis=0)
                unrated_top_movies = np.argsort(mean_ratings)[::-1][:num_recommendations]
                recommendations.append([movie_id + 1 for movie_id in unrated_top_movies if movie_id not in rated_movies])
            else:
                distances, user_indices = self.user_knn.kneighbors([self.prepare_user_features()[user_idx]], n_neighbors=20)
                similar_user_indices = user_indices[0][1:]

                predicted_ratings = np.zeros(ratings_matrix.shape[1])

                for sim_user_idx in similar_user_indices:
                    similar_user_ratings = ratings_matrix[sim_user_idx]
                    predicted_ratings += similar_user_ratings

                predicted_ratings /= len(similar_user_indices)

                predicted_ratings[ratings_matrix[user_idx] > 0] = 0

                recommended_movies = np.argsort(predicted_ratings)[::-1][:num_recommendations]

                recommendations.append([movie_id + 1 for movie_id in recommended_movies if movie_id not in rated_movies])

        return recommendations

    def recommend_users(self, ratings_matrix, movie_ids, num_recommendations=10):
        recommendations = []

        for movie_id in movie_ids:
            movie_idx = movie_id - 1
            rated_users = np.where(ratings_matrix[:, movie_idx] > 0)[0]

            if len(rated_users) == 0:
                mean_ratings = np.mean(ratings_matrix, axis=1)
                unrated_top_users = np.argsort(mean_ratings)[::-1][:num_recommendations]
                recommendations.append([user_id + 1 for user_id in unrated_top_users if user_id not in rated_users])
            else:
                distances, movie_indices = self.movie_knn.kneighbors([self.prepare_movie_features()[movie_idx]], n_neighbors=20)
                similar_movie_indices = movie_indices[0][1:]

                predicted_ratings = np.zeros(ratings_matrix.shape[0])

                for sim_movie_idx in similar_movie_indices:
                    similar_movie_ratings = ratings_matrix[:, sim_movie_idx]
                    predicted_ratings += similar_movie_ratings

                predicted_ratings /= len(similar_movie_indices)

                predicted_ratings[ratings_matrix[:, movie_idx] > 0] = 0

                recommended_users = np.argsort(predicted_ratings)[::-1][:num_recommendations]

                recommendations.append([user_id + 1 for user_id in recommended_users if user_id not in rated_users])

        return recommendations

    def calculate_movie_similarity(self, target_movie_idx, similar_movie_indices):
        target_movie_features = self.movies_df.iloc[target_movie_idx, 5:].values
        similar_movie_features = self.movies_df.iloc[similar_movie_indices, 5:].values
        target_movie_features_norm = target_movie_features / np.linalg.norm(target_movie_features)
        similar_movie_features_norm = similar_movie_features / np.linalg.norm(similar_movie_features, axis=1)[:, np.newaxis]
        similarities = np.dot(similar_movie_features_norm, target_movie_features_norm)

        return similarities

    def calculate_user_similarity(self, ratings_matrix, reference_users, target_users):
        reference_ratings = ratings_matrix[reference_users]
        target_ratings = ratings_matrix[target_users]

        similarity_scores = np.dot(reference_ratings, target_ratings.T)

        reference_norms = np.linalg.norm(reference_ratings, axis=1)
        target_norms = np.linalg.norm(target_ratings, axis=1)

        reference_norms = reference_norms[:, np.newaxis]
        target_norms = target_norms[np.newaxis, :]

        similarity_scores /= (reference_norms * target_norms)

        return similarity_scores

    def find_similar_movies(self, movie_idx):
        movie_features = self.prepare_movie_features()
        _, movie_indices = self.movie_knn.kneighbors([movie_features[movie_idx]])
        similar_movie_indices = movie_indices[0][1:]
        return similar_movie_indices

    def find_similar_users(self, user_indices):
        user_features = self.prepare_user_features()
        _, user_indices = self.user_knn.kneighbors(user_features[user_indices])
        similar_user_indices = user_indices[0][1:]
        return similar_user_indices

    def predict_rating(self, ratings_matrix, rating_ids):
        predictions = []

        for user_id, movie_id in rating_ids:
            user_idx = user_id - 1
            movie_idx = movie_id - 1
            _, user_indices = self.user_knn.kneighbors([self.prepare_user_features()[user_idx]])
            similar_user_indices = user_indices[0][1:]
            similar_user_ratings = ratings_matrix[similar_user_indices, movie_idx]
            user_rating_prediction = np.mean(similar_user_ratings) if len(similar_user_ratings) > 0 else np.nan
            _, movie_indices = self.movie_knn.kneighbors([self.prepare_movie_features()[movie_idx]])
            similar_movie_indices = movie_indices[0][1:]
            similar_movie_ratings = ratings_matrix[user_idx, similar_movie_indices]
            movie_rating_prediction = np.mean(similar_movie_ratings) if len(similar_movie_ratings) > 0 else np.nan
            combined_prediction = 5 - (0.5 * user_rating_prediction + 0.5 * movie_rating_prediction)
            predictions.append(combined_prediction)

        return predictions


#Evaluate

In [124]:
max_num_test = 10

def rmse(values_1, values_2):
  return np.sqrt(np.square(values_1 - values_2).mean())

def precision(true_values, predicted_values):
  true_values = set(true_values)
  predicted_values = set(predicted_values)
  intersection = true_values.intersection(predicted_values)
  return len(intersection)/len(predicted_values)

def evaluate_scenario_1(model, ratings_matrix, num_recommendations = 10):
  #New user - cold start
  user_indexes = []
  for ii, ratings in enumerate(ratings_matrix):
    if(len(np.where(ratings>3)[0]) > 2*num_recommendations):
      user_indexes.append(ii)
  user_indexes = np.array(user_indexes[:max_num_test])

  modified_ratings = ratings_matrix.copy()
  modified_ratings[user_indexes] = 0

  recommended_movies = model.recommend_movies(modified_ratings, user_indexes+1, num_recommendations)
  true_values = [np.where(ratings_matrix[user_index]>3)[0]+1 for user_index in user_indexes]
  precisions = np.array([precision(true_values[ii], recommended_movies[ii]) for ii in range(len(user_indexes))])
  return precisions.mean()

def evaluate_scenario_2(model, ratings_matrix, num_recommendations = 10):
  #New movie - cold start
  movie_indexes = []
  for ii, ratings in enumerate(ratings_matrix.T):
    if(len(np.where(ratings>3)[0]) > 2*num_recommendations):
      movie_indexes.append(ii)
  movie_indexes = np.array(movie_indexes[:max_num_test])

  modified_ratings = ratings_matrix.copy()
  modified_ratings[:,movie_indexes] = 0

  recommended_users = model.recommend_users(modified_ratings, movie_indexes+1, num_recommendations)
  true_values = [np.where(ratings_matrix.T[movie_index]>3)[0]+1 for movie_index in movie_indexes]
  precisions = np.array([precision(true_values[ii], recommended_users[ii]) for ii in range(len(movie_indexes))])
  return precisions.mean()

def evaluate_scenario_3(model, ratings_matrix, num_recommendations = 10):
  #Given user
  user_indexes = []
  for ii, ratings in enumerate(ratings_matrix):
    if(len(np.where(ratings>3)[0]) > 4*num_recommendations):
      user_indexes.append(ii)
  user_indexes = np.array(user_indexes[:max_num_test])

  modified_ratings = ratings_matrix.copy()
  true_values = []
  for user_index in user_indexes:
    removed_indexes = np.where(ratings_matrix[user_index]>3)[0][::2]
    true_values.append(removed_indexes+1)
    modified_ratings[user_index, removed_indexes] = 0

  recommended_movies = model.recommend_movies(modified_ratings, user_indexes+1, num_recommendations)
  precisions = np.array([precision(true_values[ii], recommended_movies[ii]) for ii in range(len(user_indexes))])
  return precisions.mean()

def evaluate_scenario_4(model, ratings_matrix, num_recommendations = 10):
  #Given movie
  movie_indexes = []
  for ii, ratings in enumerate(ratings_matrix.T):
    if(len(np.where(ratings>3)[0]) > 4*num_recommendations):
      movie_indexes.append(ii)
  movie_indexes = np.array(movie_indexes[:max_num_test])

  modified_ratings = ratings_matrix.copy()
  true_values = []
  for movie_index in movie_indexes:
    removed_indexes = np.where(ratings_matrix.T[movie_index]>3)[0][::2]
    true_values.append(removed_indexes+1)
    modified_ratings[removed_indexes, movie_index] = 0

  recommended_users = model.recommend_users(modified_ratings, movie_indexes+1, num_recommendations)
  precisions = np.array([precision(true_values[ii], recommended_users[ii]) for ii in range(len(movie_indexes))])
  return precisions.mean()

def evaluate_scenario_5(model, ratings_matrix, num_recommendations = 10):
  #Predict rating
  user_ids = np.array(range(0,ratings_matrix.shape[0],20))+1
  movie_ids = np.array(range(0,ratings_matrix.shape[1],20))+1

  rating_ids = []
  for uid in user_ids:
    for mid in movie_ids:
      if(ratings_matrix[uid-1, mid-1]!=0):
        rating_ids.append((uid, mid))

  true_values = np.array([ratings_matrix[uid-1, mid-1] for uid, mid in rating_ids])
  predicted_values = model.predict_rating(ratings_matrix, rating_ids)
  return rmse(true_values, predicted_values)



In [127]:
import warnings

warnings.simplefilter('ignore')

model = KNNCollaborativeFiltering(movies, users)
print("evaluate_scenario_1: ", evaluate_scenario_1(model, ratings_matrix,10))
print("evaluate_scenario_2: ", evaluate_scenario_2(model, ratings_matrix,10))
print("evaluate_scenario_3: ", evaluate_scenario_3(model, ratings_matrix,10))
print("evaluate_scenario_4: ", evaluate_scenario_4(model, ratings_matrix,10))
print("evaluate_scenario_5: ", evaluate_scenario_5(model, ratings_matrix))

evaluate_scenario_1:  0.5800000000000001
evaluate_scenario_2:  0.4499999999999999
evaluate_scenario_3:  0.55
evaluate_scenario_4:  0.6
evaluate_scenario_5:  1.8715730957441064
