In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

In [2]:
# To Load the user ratings dataset

dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/combined_data_1.txt', sep=",", header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])
dataset.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [3]:
# To count all the 'nan' values in the Ratings column in the 'ratings' dataset

df_nan = pd.DataFrame(pd.isnull(dataset.Rating))
df_nan.head()

Unnamed: 0,Rating
0,True
1,False
2,False
3,False
4,False


In [4]:
# To store the index of all the rows containing 'nan' values

df_nan = df_nan[df_nan['Rating'] == True]
df_nan.shape

(4499, 1)

In [5]:
# To reset the index of the dataframe

df_nan = df_nan.reset_index()
df_nan.head()

Unnamed: 0,index,Rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True


In [6]:
#To create a numpy array containing movie ids according the 'ratings' dataset

movie_lengths = np.diff(df_nan['index'])
total_length = len(dataset)

unique_movie_ids = np.arange(1, len(movie_lengths) + 2)

ratings_per_movie = np.repeat(unique_movie_ids[:-1], movie_lengths)
last_movie_id = unique_movie_ids[-1]
ratings_last_movie = np.full(total_length - df_nan.iloc[-1, 0] - 1, last_movie_id)

movie_np = np.concatenate([ratings_per_movie, ratings_last_movie])

desired_length = 24053764

movie_np = movie_np[:desired_length]

print(f'Movie numpy: {movie_np}')
print(f'Length: {len(movie_np)}')


Movie numpy: [   1    1    1 ... 4496 4496 4496]
Length: 24053764


In [7]:
#To append the above created array to the datset after removing the 'nan' rows

dataset = dataset[pd.notnull(dataset['Rating'])]
dataset['Movie_Id'] = movie_np.astype(int)
dataset['Cust_Id'] =dataset['Cust_Id'].astype(int)
dataset.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [8]:
# To load the movie_titles dataset

df_title = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/movie_titles.csv', encoding = "ISO-8859-1", header = None, sep=",", names=["Movie_Id", "Year", "Name"],  usecols=[0, 2])
print(df_title.head())

   Movie_Id                          Name
0         1               Dinosaur Planet
1         2    Isle of Man TT 2004 Review
2         3                     Character
3         4  Paula Abdul's Get Up & Dance
4         5      The Rise and Fall of ECW


### Recommendation Engine for each user

In [9]:
# Define the number of users and movies to consider (for resource management)
max_users = 1000
max_movies = 1000

# Filter data to limit the number of users and movies
ratings = dataset[dataset['Cust_Id'] <= max_users]
ratings = ratings.sample(frac=1).groupby('Cust_Id').head(max_movies)  # Randomly sample movies per user

# Merge datasets
data = pd.merge(ratings, df_title, left_on='Cust_Id', right_on='Movie_Id', how='inner')

# Data Preprocessing
# - Fill missing values
# - Normalize the 'Rating' column
data['Rating'].fillna(0, inplace=True)
scaler = MinMaxScaler()
data['Rating'] = scaler.fit_transform(data['Rating'].values.reshape(-1, 1))

# Create a user-item matrix
user_item_matrix = data.pivot_table(index='Cust_Id', columns='Name', values='Rating')

# Calculate cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix.fillna(0))

# Function to get movie recommendations for a user
def get_movie_recommendations(user_id, num_recommendations=10):
    user_ratings = user_item_matrix.loc[user_id]
    similar_users = user_similarity[user_id]
    recommendations = user_item_matrix.columns.to_frame().reset_index(drop=True)

    # Calculate weighted sum of ratings by similar users
    recommendations['Weighted_Rating'] = np.dot(similar_users, user_ratings.fillna(0))

    # Filter out already rated movies
    rated_movies = data[data['Cust_Id'] == user_id]['Name']
    recommendations = recommendations[~recommendations['Name'].isin(rated_movies)]

    # Sort by weighted rating in descending order
    recommendations = recommendations.sort_values(by='Weighted_Rating', ascending=False)

    return recommendations.head(num_recommendations)

# Get recommendations for a specific user (replace with a user ID)
user_id = 7

if user_id in user_item_matrix.index:
    recommended_movies = get_movie_recommendations(user_id)
    print("Recommended Movies for User", user_id)
    print(recommended_movies)
else:
    print(f"User {user_id} does not exist in the filtered dataset.")

Recommended Movies for User 7
                                                  Name  Weighted_Rating
1                                   A Hole in the Head              0.0
2                                      A Killer Within              0.0
113                                           Pressure              0.0
114                                       Regular Guys              0.0
115                                     Rhyme & Reason              0.0
116                                Running Out of Time              0.0
117                            SCTV Network 90: Vol. 4              0.0
118                                         Saint Jack              0.0
119  Satanis: The Devil's Mass / Sinthia: The Devil...              0.0
120        Saturday Night Live: The Best of Jon Lovitz              0.0


### Objective 1: Find the list of most popular and liked genres

In [10]:
# Calculate average ratings by movie
movie_ratings = dataset.groupby('Cust_Id')['Rating'].mean().reset_index()
movie_ratings = movie_ratings.rename(columns={'Rating': 'Avg_Rating'})

# Merge movie ratings with the movies dataset
movies_with_ratings = pd.merge(df_title, movie_ratings, left_on='Movie_Id', right_on='Cust_Id')

highest_rated_movie = movies_with_ratings[movies_with_ratings['Avg_Rating'] == movies_with_ratings['Avg_Rating'].max()]
print("Movie with the Highest Average Rating:")
print(highest_rated_movie[['Name', 'Avg_Rating']])

most_rated_movie = movies_with_ratings[movies_with_ratings['Avg_Rating'] == movies_with_ratings['Avg_Rating'].max()]
print("\nMovie with the Most Ratings:")
print(most_rated_movie[['Name', 'Avg_Rating']])

Movie with the Highest Average Rating:
                                    Name  Avg_Rating
73                              Elephant         5.0
89               The Passion of Ayn Rand         5.0
94                   Avia Vampire Hunter         5.0
184   Annie: Special Anniversary Edition         5.0
363             Endless Summer Revisited         5.0
...                                  ...         ...
3010                        If Lucy Fell         5.0
3044       Dave Matthews Band: The Gorge         5.0
3064                 Andromeda: Season 5         5.0
3102     Who's Afraid of Virginia Woolf?         5.0
3108                   Cops: Shots Fired         5.0

[80 rows x 2 columns]

Movie with the Most Ratings:
                                    Name  Avg_Rating
73                              Elephant         5.0
89               The Passion of Ayn Rand         5.0
94                   Avia Vampire Hunter         5.0
184   Annie: Special Anniversary Edition         5.0
363    

### Objective 2: Create a model to find the best-suited movie for one user in every genre

In [11]:
# Replace this with an actual user ID
user_id = 6

user_movie_ratings = dataset[dataset['Cust_Id'] == user_id]

user_movies = pd.merge(df_title, user_movie_ratings, left_on='Movie_Id', right_on='Cust_Id')

best_suited_movies = user_movies.groupby('Name')['Rating'].max().reset_index()
print("\nBest Suited Movies for User", user_id, "in Each Genre:")
print(best_suited_movies)


Best Suited Movies for User 6 in Each Genre:
   Name  Rating
0  Sick     5.0


### Objective 3: Find which genre movies have received the best and worst ratings based on user ratings

In [12]:
genre_ratings = movies_with_ratings.groupby('Name')['Avg_Rating'].mean().reset_index()
best_genre = genre_ratings[genre_ratings['Avg_Rating'] == genre_ratings['Avg_Rating'].max()]
worst_genre = genre_ratings[genre_ratings['Avg_Rating'] == genre_ratings['Avg_Rating'].min()]

print("\nGenre with the Best Average Rating:")
print(best_genre)

print("\nGenre with the Worst Average Rating:")
print(worst_genre)


Genre with the Best Average Rating:
                                                   Name  Avg_Rating
151                                 Andromeda: Season 5         5.0
158                  Annie: Special Anniversary Edition         5.0
184                                 Avia Vampire Hunter         5.0
205                                            Badlands         5.0
232   Battlestar Galactica: The Miniseries: Bonus Ma...         5.0
...                                                 ...         ...
3004                               WWE: Summerslam 2003         5.0
3054                    Who's Afraid of Virginia Woolf?         5.0
3063                                      Wilder Napalm         5.0
3095              Xena: Warrior Princess: Series Finale         5.0
3113                             Zatoichi Meets Yojimbo         5.0

[80 rows x 2 columns]

Genre with the Worst Average Rating:
                                                   Name  Avg_Rating
277               