# Data Collection

In [18]:
import pandas as pd

# Load the ratings data
ratings_df = pd.read_csv('ratings.dat', delimiter='::',
                         engine='python', names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load the movies data
movies_df = pd.read_csv('movies.dat', delimiter='::',
                        engine='python', names=['movie_id', 'title', 'genres'], encoding='ISO-8859-1')

# Load the users data
users_df = pd.read_csv('users.dat', delimiter='::',
                       engine='python', names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], encoding='ISO-8859-1')


# Preprocessing

In [39]:
# Merge ratings with movies
merged_df = pd.merge(ratings_df, movies_df, on='movie_id')
# Merge with users
merged_df = pd.merge(merged_df, users_df, on='user_id')

# Filter the merged dataframe to keep only relevant columns
merged_df = merged_df[['user_id', 'title', 'rating', 'gender', 'age', 'occupation']]

# Create a pivot table for user ratings
user_rating = merged_df.pivot_table(index='user_id', columns='title', values='rating').fillna(0)

# Optional: Drop movies with very few ratings
user_rating = user_rating.dropna(thresh=20, axis=1)

user_rating.head()

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Training (Collaborative Filtering)

In [4]:
# Calculate the similarity matrix for movies
similarity_table = user_rating.corr(method='pearson')

# Function to get similar movies based on user ratings
def get_similar_movies(movie_name, user_rating):
    similar_score = similarity_table[movie_name] * (user_rating - 2.5)
    similar_score = similar_score.sort_values(ascending=False)
    return similar_score


# Recommendation Generation

In [43]:
# Example user preferences
user_preferences = [('Ponette (1996)', 5), ('Funny Farm (1988)', 3), ('Indochine (1992)', 3.2)]

# Generate similar movies
similar_movies_list = []
for movie, rating in user_preferences:
    similar_movies_list.append(get_similar_movies(movie, rating))

# Combine and sort the recommendations
similar_movies = pd.concat(similar_movies_list, axis=1).sum(axis=1).sort_values(ascending=False)

# Exclude movies the user has already rated
top_10_recommendations = [x for x in similar_movies.index if x not in [movie for movie, _ in user_preferences]][:10]

# Display top 10 recommendations
top_10_recommendations


['Three Seasons (1999)',
 'Farewell My Concubine (1993)',
 'Tom & Viv (1994)',
 'Postino, Il (The Postman) (1994)',
 'Howards End (1992)',
 'Sweet Hereafter, The (1997)',
 "What's Eating Gilbert Grape (1993)",
 'Hilary and Jackie (1998)',
 'Passion Fish (1992)',
 'Mrs. Brown (Her Majesty, Mrs. Brown) (1997)']