In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# All three files are read the same way: no header row, '::' as the field
# separator (a multi-character separator, hence the python engine) and
# Latin-1 (ISO-8859-1) text encoding.
read_options = dict(sep='::', engine='python', header=None, encoding='ISO-8859-1')

movies_df = pd.read_csv('/kaggle/input/movielens/ml-1m/movies.dat', **read_options)
ratings_df = pd.read_csv('/kaggle/input/movielens/ml-1m/ratings.dat', **read_options)
users_df = pd.read_csv('/kaggle/input/movielens/ml-1m/users.dat', **read_options)

# Preview the first rows of each frame under its own caption.
for caption, frame in (
    ("Movies data:", movies_df),
    ("\nRatings data:", ratings_df),
    ("\nUsers data:", users_df),
):
    print(caption)
    print(frame.head())


In [None]:
# Column names for each frame, listed in the order the fields appear in the files.
movies_columns = ['MovieID', 'Title', 'Genres']
ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
users_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

# Replace the default integer column labels with the names above.
for frame, header in (
    (movies_df, movies_columns),
    (ratings_df, ratings_columns),
    (users_df, users_columns),
):
    frame.columns = header

# Show the resulting column indexes as the cell output.
movies_df.columns, ratings_df.columns, users_df.columns

In [None]:
# Preview each frame again now that the columns carry readable names.
for caption, frame in (
    ("Movies data:", movies_df),
    ("\nRatings data:", ratings_df),
    ("\nUsers data:", users_df),
):
    print(caption)
    print(frame.head())

In [None]:
# Display the movies DataFrame as this cell's output.
movies_df

## Neighbourhood-based Collaborative Filtering

In [None]:
# Attach the movie title to every rating by joining the two frames on MovieID.
ratings_with_titles = ratings_df.merge(movies_df[['MovieID', 'Title']], on='MovieID')

# User x movie matrix: one row per UserID, one column per Title, each cell holding
# the rating. The similarity functions below treat NaN cells as "not rated".
ratings_matrix = ratings_with_titles.pivot_table(values='Rating', index='UserID', columns='Title')

# Preview the matrix.
print("Ratings Matrix:")
print(ratings_matrix.head())

### Compute Cosine Similarity

In [None]:
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two rating vectors over their co-rated positions.

    Only positions where neither vector is NaN take part in the computation.

    Args:
        vec1: NumPy array of ratings, NaN where no rating exists.
        vec2: NumPy array of ratings with the same shape as ``vec1``.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` over the shared positions, or ``0`` when
        either restricted vector has zero length (including the case where
        the vectors share no rated position).
    """
    # Keep a position only when both vectors hold a real number there.
    both_rated = ~(np.isnan(vec1) | np.isnan(vec2))
    a = vec1[both_rated]
    b = vec2[both_rated]

    numerator = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    # A zero norm would make the ratio undefined; report "no similarity".
    if norm_a == 0 or norm_b == 0:
        return 0
    return numerator / (norm_a * norm_b)


### Jaccard similarity

In [None]:
def jaccard_similarity(vec1, vec2):
    """Jaccard similarity between the sets of items rated in two vectors.

    An item counts as rated when its entry is neither NaN nor zero, so the
    function works both on NaN-filled matrices (such as a pivot table) and on
    zero-filled ones.

    Args:
        vec1: Array-like of ratings; NaN or 0 marks "not rated".
        vec2: Array-like of ratings with the same shape as ``vec1``.

    Returns:
        ``|rated1 & rated2| / |rated1 | rated2|``, or ``0`` when neither
        vector contains a rated item.
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)

    # NaN != 0 evaluates to True, so NaN must be excluded explicitly;
    # otherwise every missing rating would be counted as a rated item.
    rated1 = ~np.isnan(vec1) & (vec1 != 0)
    rated2 = ~np.isnan(vec2) & (vec2 != 0)

    intersection = np.sum(rated1 & rated2)
    union = np.sum(rated1 | rated2)

    # No rated items at all: define the similarity as 0 instead of dividing by 0.
    if union == 0:
        return 0
    return intersection / union

### Pearson correlation

In [None]:
def pearson_corr(vec1, vec2):
    """Pearson correlation of two rating vectors over their co-rated positions.

    A position is used only when both vectors hold a value there that is
    neither NaN nor zero.

    Args:
        vec1: NumPy array of ratings; NaN or 0 marks "not rated".
        vec2: NumPy array of ratings with the same shape as ``vec1``.

    Returns:
        The correlation coefficient of the co-rated values, or ``0`` when it
        is undefined: fewer than two co-rated positions, or one of the
        restricted vectors is constant.
    """
    co_rated = (
        ~np.isnan(vec1) & ~np.isnan(vec2) & (vec1 != 0) & (vec2 != 0)
    )
    x = vec1[co_rated]
    y = vec2[co_rated]

    # With no overlap np.mean would warn about an empty slice and the result
    # would be NaN; a single shared item has no variance. Both mean "no
    # evidence of correlation", so return 0 up front.
    if x.size < 2:
        return 0

    x_centered = x - np.mean(x)
    y_centered = y - np.mean(y)

    # Unnormalised covariance and spreads; the common 1/n factor cancels out
    # in the ratio below.
    covariance = np.sum(x_centered * y_centered)
    spread_x = np.sqrt(np.sum(x_centered ** 2))
    spread_y = np.sqrt(np.sum(y_centered ** 2))

    # A constant vector has zero spread, which would divide by zero.
    if spread_x == 0 or spread_y == 0:
        return 0
    return covariance / (spread_x * spread_y)


### Find Similar Users

In [None]:
def find_similar_users(user_id, ratings_matrix, k=5):
    """Return the ``k`` users most similar to ``user_id``.

    Args:
        user_id: Row label of the target user in ``ratings_matrix``.
        ratings_matrix: DataFrame with one row per user and one column per movie.
        k: Number of neighbours to return.

    Returns:
        A list of at most ``k`` ``(other_user_id, similarity)`` tuples, ordered
        from most to least similar. The target user is never included.
    """
    target_row = ratings_matrix.loc[user_id]

    # Score every other user against the target with cosine similarity.
    # pearson_corr or jaccard_similarity can be substituted here to change
    # the similarity measure.
    scores = {
        other_id: cosine_similarity(target_row.values, other_row.values)
        for other_id, other_row in ratings_matrix.iterrows()
        if other_id != user_id
    }

    # Highest similarity first, then keep the leading k entries.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:k]


### Recommend Movies

In [None]:
def recommend_movies(user_id, ratings_matrix, similar_users, top_n=5):
    """Recommend movies the user has not rated, scored by similar users' ratings.

    Each candidate movie's score is the sum of ``similarity * rating`` over the
    similar users who rated it, divided by the total absolute similarity of
    all similar users. A neighbour who did not rate a movie therefore
    contributes nothing to its score, which favours movies rated by several
    neighbours.

    Args:
        user_id: Row label of the target user in ``ratings_matrix``.
        ratings_matrix: DataFrame with one row per user and one column per
            movie; NaN marks a missing rating.
        similar_users: Iterable of ``(user_id, similarity)`` pairs, e.g. the
            result of ``find_similar_users``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` column labels of ``ratings_matrix`` (movie
        titles in this notebook), best score first.

    Raises:
        KeyError: If ``user_id`` or a similar user is not in ``ratings_matrix``.
    """
    user_ratings = ratings_matrix.loc[user_id]
    # Movies the target user has not rated. Computed once so the inner loop
    # does not need a label lookup for every (neighbour, movie) pair.
    unseen = user_ratings.isna()

    similar_users_dict = dict(similar_users)

    scores = {}
    for similar_user_id, similarity in similar_users_dict.items():
        neighbour_ratings = ratings_matrix.loc[similar_user_id]
        # Only movies this neighbour rated and the target user did not.
        candidates = neighbour_ratings[unseen].dropna()
        for movie_id, rating in candidates.items():
            scores[movie_id] = scores.get(movie_id, 0) + similarity * rating

    # Normalise by the total absolute similarity. The absolute value keeps the
    # divisor positive, so negative similarities cannot flip the ranking, and
    # the zero check avoids 0/0 when no neighbour has any similarity.
    total_similarity = sum(abs(value) for value in similar_users_dict.values())
    if total_similarity != 0:
        scores = {
            movie_id: score / total_similarity
            for movie_id, score in scores.items()
        }

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [movie_id for movie_id, _ in ranked[:top_n]]




In [None]:
def recommend_movies_for_new_user(new_user_ratings, ratings_matrix, movies_df, top_n=5):
    """Recommend movies for a user who is not in ``ratings_matrix`` yet.

    The new user's ratings are added as an extra row to a copy of the ratings
    matrix, similar users are looked up for that row, and their ratings drive
    the recommendations. The caller's ``ratings_matrix`` is left unchanged.

    Args:
        new_user_ratings: Dict mapping MovieID to the new user's rating.
        ratings_matrix: DataFrame with one row per user and one column per
            movie title.
        movies_df: DataFrame with at least ``MovieID`` and ``Title`` columns,
            used to translate MovieIDs into titles.
        top_n: Maximum number of recommendations to return.

    Returns:
        The list of recommended movie titles produced by ``recommend_movies``.

    Raises:
        IndexError: If a MovieID in ``new_user_ratings`` is not in ``movies_df``.
    """
    # Next free user id after the largest existing one.
    new_user_id = ratings_matrix.index.max() + 1
    print("New user ID:", new_user_id)

    # Translate MovieID -> Title, because the matrix columns are titles.
    new_user_ratings_aligned = {}
    for movie_id, rating in new_user_ratings.items():
        matching_titles = movies_df.loc[movies_df['MovieID'] == movie_id, 'Title']
        if matching_titles.empty:
            raise IndexError(f"MovieID {movie_id!r} not found in movies_df")
        new_user_ratings_aligned[matching_titles.iloc[0]] = rating

    # Align to the matrix columns: movies the new user did not rate become NaN,
    # and titles that are not matrix columns are dropped.
    new_user_ratings_series = pd.Series(new_user_ratings_aligned, index=ratings_matrix.columns)

    # Add the row to a copy. Writing into the caller's matrix would leave the
    # synthetic user behind, where a later call could pick it as a neighbour.
    augmented_matrix = ratings_matrix.copy()
    augmented_matrix.loc[new_user_id] = new_user_ratings_series

    # Echo the ratings that actually made it into the matrix.
    for title, rating in augmented_matrix.loc[new_user_id].dropna().items():
        print(f"{title}: {rating}")

    # Find similar users
    similar_users = find_similar_users(new_user_id, augmented_matrix)
    print("Similar users:", similar_users)

    # Recommend movies
    recommended_movies = recommend_movies(new_user_id, augmented_matrix, similar_users, top_n)
    print("Recommended movies:", recommended_movies)

    return recommended_movies

### Testing for an Existing User

In [None]:
# Existing user to generate recommendations for.
user_id = 12

# Look up the nearest neighbours, then score the movies they rated that this
# user has not rated.
similar_users = find_similar_users(user_id=user_id, ratings_matrix=ratings_matrix)
recommended_movies = recommend_movies(
    user_id=user_id,
    ratings_matrix=ratings_matrix,
    similar_users=similar_users,
)
print("Recommended movies:", recommended_movies)

In [None]:
# Sanity check: none of the recommended movies should already be rated by the user.

# Titles that `user_id` has rated (the non-NaN entries of the user's row).
movies_rated_by_user = ratings_matrix.loc[user_id].dropna().index

# Report, for each recommendation, whether the user has rated it before.
for movie_title in recommended_movies:
    already_rated = movie_title in movies_rated_by_user
    message = (
        f"User has rated '{movie_title}' before."
        if already_rated
        else f"User has not rated '{movie_title}' before."
    )
    print(message)


### Testing for New User

In [None]:
# Example usage: a new user who gives 5 to each of the MovieIDs listed below.
# The dict maps MovieID -> Rating; extend the list to add more ratings.
new_user_ratings = dict.fromkeys([1, 2, 3, 41, 50], 5)

recommended_movies = recommend_movies_for_new_user(
    new_user_ratings=new_user_ratings,
    ratings_matrix=ratings_matrix,
    movies_df=movies_df,
)


#### Checking Recommendations by Genre

In [None]:
# Movies whose Genres string contains 'Action'.
is_action = movies_df['Genres'].str.contains('Action')
genre_movies = movies_df.loc[is_action]

# MovieIDs of the first 10 matching rows.
genre_movie_ids = genre_movies['MovieID'].iloc[:10].tolist()

print("10 Movie IDs:", genre_movie_ids)

In [None]:
# A synthetic user who gives the top rating (5) to every sampled movie.
new_user_ratings = dict.fromkeys(genre_movie_ids, 5)

print("New User Ratings:", new_user_ratings, sep="\n")

In [None]:
# Recommendations for the synthetic user built from the sampled genre movies.
recommended_movies = recommend_movies_for_new_user(
    new_user_ratings=new_user_ratings,
    ratings_matrix=ratings_matrix,
    movies_df=movies_df,
)

In [None]:
# Genres of each recommended movie, in the same order as `recommended_movies`.
recommended_genres = []

for movie_title in recommended_movies:
    # Genres of the rows in movies_df whose Title equals the recommendation.
    matching_genres = movies_df.loc[movies_df['Title'] == movie_title, 'Genres']
    # Use the first match, or None when the title is not in movies_df.
    recommended_genres.append(None if matching_genres.empty else matching_genres.iloc[0])

# Print each recommendation next to its genres.
for movie, genre in zip(recommended_movies, recommended_genres):
    print(f"{movie}: {genre}")
