In [7]:
import pandas as pd

# Load ratings export
ratings = pd.read_csv('data/ratings_export.csv')

# Try different approaches to read the problematic CSV file
try:
    # First attempt with more robust parsing options
    movies = pd.read_csv('data/movie_data.csv', 
                        on_bad_lines='skip',
                        quoting=1,  # QUOTE_ALL
                        engine='python')  # Use python engine for more flexibility
    print("Successfully loaded movies data with method 1")
except Exception as e:
    print(f"First attempt failed: {e}")
    try:
        # Second attempt with different settings
        movies = pd.read_csv('data/movie_data.csv', 
                            on_bad_lines='skip',
                            sep=',',
                            engine='python',
                            encoding='utf-8',
                            quotechar='"',
                            escapechar='\\')
        print("Successfully loaded movies data with method 2")
    except Exception as e2:
        print(f"Second attempt failed: {e2}")
        # Third attempt reading in chunks
        chunk_list = []
        chunk_size = 10000
        try:
            for chunk in pd.read_csv('data/movie_data.csv', 
                                   chunksize=chunk_size,
                                   on_bad_lines='skip',
                                   engine='python'):
                chunk_list.append(chunk)
            movies = pd.concat(chunk_list, ignore_index=True)
            print("Successfully loaded movies data with chunking method")
        except Exception as e3:
            print(f"All attempts failed: {e3}")
            print("Loading a smaller sample of the data...")
            # Load only first 10000 rows as fallback
            movies = pd.read_csv('data/movie_data.csv', 
                               nrows=10000,
                               on_bad_lines='skip',
                               engine='python')
            print("Successfully loaded movies data with sample method")

# Load your ratings and watched data
my_ratings = pd.read_csv('data/processed/ratings_tmdb_cleaned.csv')
watched = pd.read_csv('data/watched.csv')

print(f"Loaded {len(ratings)} user ratings")
print(f"Loaded {len(movies)} movies")  
print(f"Loaded {len(my_ratings)} of your ratings")
print(f"Loaded {len(watched)} watched movies")

# Merge datasets on movie_id to create a comprehensive dataset
user_ratings = pd.merge(ratings, movies, left_on='movie_id', right_on='movie_id')

Successfully loaded movies data with method 1
Loaded 11078167 user ratings
Loaded 285963 movies
Loaded 237 of your ratings
Loaded 426 watched movies


In [3]:
user_ratings.head()

Unnamed: 0,_id_x,movie_id,rating_val,user_id,_id_y,genres,image_url,imdb_id,imdb_link,movie_title,...,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
0,5fc57c5d6758f6963451a07f,feast-2014,7,deathproof,5fc880726758f69634df0bca,"[""Animation"",""Comedy"",""Drama"",""Family""]",film-poster/2/2/0/1/9/2/220192-feast-0-230-0-3...,tt3689498,http://www.imdb.com/title/tt3689498/maindetails,Feast,...,9.26,"[""United States of America""]",2014-10-25,6.0,"[""English""]",293299.0,https://www.themoviedb.org/movie/293299/,7.9,720.0,2014.0
1,5fc57c5d6758f6963451a063,loving-2016,7,deathproof,5fc879b26758f69634bf9665,"[""Romance"",""Drama""]",sm/upload/yp/k3/5v/2p/wzi191DNSs08gDQHHUxYwlxC...,tt4669986,http://www.imdb.com/title/tt4669986/maindetails,Loving,...,18.024,"[""United Kingdom"",""United States of America""]",2016-11-04,123.0,"[""English""]",339419.0,https://www.themoviedb.org/movie/339419/,6.7,759.0,2016.0
2,5fc57c5d6758f6963451a0ef,scripted-content,7,deathproof,5fc880406758f69634ddd358,"[""Comedy""]",film-poster/2/7/2/9/1/1/272911-scripted-conten...,tt4073494,http://www.imdb.com/title/tt4073494/maindetails,Scripted Content,...,1.4,"[""United States of America""]",2014-06-01,2.0,[],342914.0,https://www.themoviedb.org/movie/342914/,6.8,5.0,2014.0
3,5fc57c5d6758f6963451a060,the-future,4,deathproof,5fc882366758f69634eac62c,"[""Drama"",""Fantasy"",""Romance""]",film-poster/1/1/4/3/2/11432-the-future-0-230-0...,tt1235170,http://www.imdb.com/title/tt1235170/maindetails,The Future,...,5.208,"[""Germany"",""United States of America"",""France""...",2011-07-29,91.0,"[""English""]",54662.0,https://www.themoviedb.org/movie/54662/,6.0,60.0,2011.0
4,5fc57c5c6758f69634519398,mank,5,deathproof,5fc884286758f69634f3ceca,"[""Drama"",""History""]",film-poster/5/4/1/4/2/5/541425-mank-0-230-0-34...,tt10618286,http://www.imdb.com/title/tt10618286/maindetails,Mank,...,16.331,"[""United States of America""]",2020-11-13,132.0,"[""English""]",614560.0,https://www.themoviedb.org/movie/614560/,6.9,1077.0,2020.0


In [4]:
len(user_ratings)

11079666

In [5]:
len(my_ratings)

237

In [8]:
# Debug: Check the structure of my_ratings
print("Columns in my_ratings:")
print(my_ratings.columns.tolist())
print("\nFirst few rows:")
print(my_ratings[['Name', 'Year', 'Rating', 'id']].head())
print(f"\nRating column stats:")
print(my_ratings['Rating'].describe())

Columns in my_ratings:
['Logged_Date', 'Name', 'Year', 'Rating', 'Rewatch', 'Tags', 'Watched Date', 'id', 'english_language', 'overview', 'popularity', 'vote_average', 'vote_count', 'revenue', 'runtime', 'tagline', 'Rating.1', 'Rewatch_y', 'Review', 'Tags_y', 'Watched Date_y', 'Logged_DOW', 'Logged_Month', 'Logged_Year', 'Logged_Week', 'Daily_Movie_Count', 'Weekly_Movie_Count']

First few rows:
                                                Name  Year    Rating     id
0                                             WALL·E  2008   10681.0  8.102
1                         E.T. the Extra-Terrestrial  1982     601.0  7.500
2                                     Monsters, Inc.  2001     585.0  7.846
3                                               Cars  2006     920.0  7.011
4  Transformers Prime: Beast Hunters - Predacons ...  2013  268092.0  8.000

Rating column stats:
count    2.370000e+02
mean     2.430401e+05
std      3.059355e+05
min      1.200000e+01
25%      1.771000e+03
50%      8.489

In [9]:
# Check Rating.1 column which might be the correct rating
print("Rating.1 column stats:")
print(my_ratings['Rating.1'].describe())
print("\nFirst few values of Rating.1:")
print(my_ratings['Rating.1'].head(10))

Rating.1 column stats:
count    237.000000
mean       0.375527
std        1.203945
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        5.000000
Name: Rating.1, dtype: float64

First few values of Rating.1:
0    0.0
1    0.0
2    0.0
3    5.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: Rating.1, dtype: float64


In [10]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Fix the column mapping - it appears the columns got mixed up during cleaning
# The 'Rating' column actually contains TMDB IDs, and we need to use Rating.1 for actual ratings
my_ratings_corrected = my_ratings.copy()

# Use 'Rating' as the TMDB ID (which contains the actual movie IDs)
my_ratings_corrected['tmdb_id'] = my_ratings_corrected['Rating'].astype(int)

# Use 'Rating.1' as the actual rating, but only non-zero values
my_ratings_corrected = my_ratings_corrected[my_ratings_corrected['Rating.1'] > 0]

print(f"After filtering for non-zero ratings: {len(my_ratings_corrected)} ratings")

# Handle NaN values in movies tmdb_id and convert to int
movies_clean = movies.dropna(subset=['tmdb_id'])
movies_clean = movies_clean.copy()  # Avoid SettingWithCopyWarning
movies_clean['tmdb_id'] = movies_clean['tmdb_id'].astype(int)

print(f"Movies shape after cleaning: {movies_clean.shape}")

# Merge with movies data
merged = my_ratings_corrected.merge(movies_clean[['tmdb_id', 'movie_title', 'year_released']], 
                                   on='tmdb_id', 
                                   how='inner')

print(f"Successfully matched {len(merged)} of your ratings with the movie database")

if len(merged) > 0:
    # Create the final ratings dataset
    my_ratings_updated = merged[['tmdb_id', 'Rating.1']].copy()
    my_ratings_updated = my_ratings_updated.rename(columns={'Rating.1': 'Rating'})
    my_ratings_updated = my_ratings_updated.groupby('tmdb_id').agg({'Rating': 'mean'}).reset_index()
    my_ratings_updated['user_id'] = "brimell"

    print(f"Final ratings dataset: {len(my_ratings_updated)} unique movies")
    print("\nSample of your ratings:")
    sample_with_titles = my_ratings_updated.merge(movies_clean[['tmdb_id', 'movie_title']], on='tmdb_id')
    print(sample_with_titles[['movie_title', 'Rating']].head(10))
else:
    print("No matches found between your ratings and the movie database!")
    my_ratings_updated = pd.DataFrame(columns=['tmdb_id', 'Rating', 'user_id'])

After filtering for non-zero ratings: 22 ratings
Movies shape after cleaning: (279803, 19)
Successfully matched 17 of your ratings with the movie database
Final ratings dataset: 17 unique movies

Sample of your ratings:
                movie_title  Rating
0            Before Sunrise     5.0
1             Before Sunset     4.5
2    The Godfather: Part II     3.5
3  The Shawshank Redemption     4.5
4         Good Will Hunting     5.0
5                      Cars     5.0
6  The Pursuit of Happyness     2.0
7             Battle Royale     4.5
8                 Atonement     4.0
9   Kiki's Delivery Service     4.0


In [11]:
# Map movie IDs and user IDs to indices for creating a sparse matrix
tmdb_id_to_idx = {tmdb_id: i for i, tmdb_id in enumerate(user_ratings['tmdb_id'].unique())}
user_id_to_idx = {user_id: i + 1 for i, user_id in enumerate(user_ratings['user_id'].unique())} # add one to avoid 0 index so that we can use 0 for ourselves

# Add your ratings to the user ratings DataFrame
combined_ratings = pd.concat([user_ratings, my_ratings_updated.rename(columns={'Rating': 'rating_val'})])
# remove empty rows
combined_ratings = combined_ratings.dropna(subset=['user_id', 'rating_val'])

user_id_to_idx["brimell"] = 0  # Add ourselves to the mapping
user_id_to_idx

{'deathproof': 1,
 'kurstboy': 2,
 'davidehrlich': 3,
 'adrianbalboa': 4,
 'ingridgoeswest': 5,
 'silentdawn': 6,
 'colonelmortimer': 7,
 'jay': 8,
 'superpulse': 9,
 'thejoshl': 10,
 'dirkh': 11,
 'ianamurray': 12,
 'lilfilm': 13,
 'elihayes': 14,
 'holliehorror': 15,
 'juggernaut323': 16,
 'fuchsiadyke': 17,
 'childrenofmen': 18,
 'settingsun': 19,
 'iaiaiand': 20,
 'suspirliam': 21,
 'kun': 22,
 'gemko': 23,
 'darrencb': 24,
 'nevermore1985': 25,
 'russman': 26,
 'cantinaband': 27,
 'davidlsims': 28,
 'ihe': 29,
 'bratpitt': 30,
 'nycsubwayrat': 31,
 'tarantulini': 32,
 'mr_dulac': 33,
 'filipe_furtado': 34,
 'andredenervaux': 35,
 'cinemaclown': 36,
 'allisoncm': 37,
 'sopheyquinn': 38,
 'kaylafavia': 39,
 'enniomorricone': 40,
 'sonofjorel': 41,
 'zoltarak': 42,
 'schaffrillas': 43,
 'todd_gaines': 44,
 'davidfinchher': 45,
 'alexlawther': 46,
 'screeningnotes': 47,
 'sharktale': 48,
 'hammerbros94': 49,
 'blockbustedpod': 50,
 'punchdrunklizzy': 51,
 'uncutgems': 52,
 'arkhamoutl

In [13]:
# Get the movies rated by you
your_movies = combined_ratings[combined_ratings['user_id'] == "brimell"]
print(f"You've rated {len(your_movies)} movies in the combined dataset")

# Merge your_movies with combined_ratings to find common movies
common_movies = pd.merge(your_movies, combined_ratings, on='tmdb_id')

# Group by user_id and count the number of common movies
common_movies_count = common_movies.groupby('user_id_y').size()
print(f"Common movies distribution:")
print(common_movies_count.describe())

# Let's be less strict - find users who have rated at least 5 of the same movies as you
min_common_movies = 5
filtered_user_ids = common_movies_count[common_movies_count >= min_common_movies].index

print(f"Users who have rated at least {min_common_movies} movies in common: {len(filtered_user_ids)}")

# Filter combined_ratings for the users who have rated at least N of the same movies as you
filtered_combined_ratings = combined_ratings[combined_ratings['user_id'].isin(filtered_user_ids)]

print(f"Filtered dataset size: {len(filtered_combined_ratings)} ratings")
filtered_combined_ratings.head()

You've rated 17 movies in the combined dataset
Common movies distribution:
count    6905.000000
mean        6.589573
std         3.671925
min         1.000000
25%         4.000000
50%         6.000000
75%         9.000000
max        17.000000
dtype: float64
Users who have rated at least 5 movies in common: 4582
Common movies distribution:
count    6905.000000
mean        6.589573
std         3.671925
min         1.000000
25%         4.000000
50%         6.000000
75%         9.000000
max        17.000000
dtype: float64
Users who have rated at least 5 movies in common: 4582
Filtered dataset size: 9187461 ratings
Filtered dataset size: 9187461 ratings


Unnamed: 0,_id_x,movie_id,rating_val,user_id,_id_y,genres,image_url,imdb_id,imdb_link,movie_title,...,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
0,5fc57c5d6758f6963451a07f,feast-2014,7.0,deathproof,5fc880726758f69634df0bca,"[""Animation"",""Comedy"",""Drama"",""Family""]",film-poster/2/2/0/1/9/2/220192-feast-0-230-0-3...,tt3689498,http://www.imdb.com/title/tt3689498/maindetails,Feast,...,9.26,"[""United States of America""]",2014-10-25,6.0,"[""English""]",293299.0,https://www.themoviedb.org/movie/293299/,7.9,720.0,2014.0
1,5fc57c5d6758f6963451a063,loving-2016,7.0,deathproof,5fc879b26758f69634bf9665,"[""Romance"",""Drama""]",sm/upload/yp/k3/5v/2p/wzi191DNSs08gDQHHUxYwlxC...,tt4669986,http://www.imdb.com/title/tt4669986/maindetails,Loving,...,18.024,"[""United Kingdom"",""United States of America""]",2016-11-04,123.0,"[""English""]",339419.0,https://www.themoviedb.org/movie/339419/,6.7,759.0,2016.0
2,5fc57c5d6758f6963451a0ef,scripted-content,7.0,deathproof,5fc880406758f69634ddd358,"[""Comedy""]",film-poster/2/7/2/9/1/1/272911-scripted-conten...,tt4073494,http://www.imdb.com/title/tt4073494/maindetails,Scripted Content,...,1.4,"[""United States of America""]",2014-06-01,2.0,[],342914.0,https://www.themoviedb.org/movie/342914/,6.8,5.0,2014.0
3,5fc57c5d6758f6963451a060,the-future,4.0,deathproof,5fc882366758f69634eac62c,"[""Drama"",""Fantasy"",""Romance""]",film-poster/1/1/4/3/2/11432-the-future-0-230-0...,tt1235170,http://www.imdb.com/title/tt1235170/maindetails,The Future,...,5.208,"[""Germany"",""United States of America"",""France""...",2011-07-29,91.0,"[""English""]",54662.0,https://www.themoviedb.org/movie/54662/,6.0,60.0,2011.0
4,5fc57c5c6758f69634519398,mank,5.0,deathproof,5fc884286758f69634f3ceca,"[""Drama"",""History""]",film-poster/5/4/1/4/2/5/541425-mank-0-230-0-34...,tt10618286,http://www.imdb.com/title/tt10618286/maindetails,Mank,...,16.331,"[""United States of America""]",2020-11-13,132.0,"[""English""]",614560.0,https://www.themoviedb.org/movie/614560/,6.9,1077.0,2020.0


In [14]:

# Recreate the sparse matrix and compute user similarity based on the filtered data
rows = filtered_combined_ratings['user_id'].map(user_id_to_idx)
cols = filtered_combined_ratings['tmdb_id'].map(tmdb_id_to_idx)
data = filtered_combined_ratings['rating_val']
filtered_ratings_matrix = csr_matrix((data, (rows, cols)), shape=(len(user_id_to_idx), len(tmdb_id_to_idx)))
user_similarity = cosine_similarity(filtered_ratings_matrix)

In [15]:
# Find the most similar users to you
top_similar_users_indices = np.argsort(-user_similarity[0])[1:]  # Get indices of all similar users

# Create a reverse mapping from index to user_id
# user_id_to_idx maps user_id (string) to an integer index
# idx_to_user_id will map an integer index back to user_id (string)
idx_to_user_id = {v: k for k, v in user_id_to_idx.items()}

# Print the user_ids and similarities of the top 10 similar users
print("Top 10 similar users (after filtering for >20 common ratings):")
for idx in top_similar_users_indices[:10]:
    # idx is an integer index from the user_similarity matrix
    # We need to find the user_id string that corresponds to this integer index
    if idx in idx_to_user_id:
        user_id_val = idx_to_user_id[idx]
        similarity_score = user_similarity[0][idx]
        print(f"User ID: {user_id_val}, Similarity: {similarity_score:.4f}")
    else:
        # This case should ideally not happen if user_similarity matrix is consistent
        print(f"Warning: Index {idx} not found in user_id_to_idx mapping. Similarity: {user_similarity[0][idx]:.4f}")


Top 10 similar users (after filtering for >20 common ratings):
User ID: ajcohen223, Similarity: 0.2057
User ID: yvanis, Similarity: 0.1944
User ID: lilkancil, Similarity: 0.1879
User ID: egkubo, Similarity: 0.1860
User ID: yunganu, Similarity: 0.1748
User ID: japafw, Similarity: 0.1734
User ID: nudgeroni, Similarity: 0.1678
User ID: ecppineda, Similarity: 0.1666
User ID: sammythurst, Similarity: 0.1655
User ID: allebee, Similarity: 0.1624


In [None]:
# Use the filtered dataset instead of the full combined_ratings for better performance
my_rated_movies = set(my_ratings_updated['tmdb_id'])
print(f"You've rated {len(my_rated_movies)} movies")

# Initialize a dictionary to hold recommended movies and the users who recommended them
recommended_movies_details = {}

# Limit to top 100 similar users for performance
top_users = top_similar_users_indices[:100]

for user_index in top_users:
    user_id = list(user_id_to_idx.keys())[list(user_id_to_idx.values()).index(user_index)]
    
    # Use filtered dataset instead of full combined_ratings
    high_rated_movies_by_user = filtered_combined_ratings[(filtered_combined_ratings['user_id'] == user_id) & 
                                                         (filtered_combined_ratings['rating_val'] >= 7)]
    
    for _, row in high_rated_movies_by_user.iterrows():
        tmdb_id = row['tmdb_id']
        if tmdb_id not in my_rated_movies:
            if tmdb_id not in recommended_movies_details:
                recommended_movies_details[tmdb_id] = {'users': [user_id], 'ratings': [row['rating_val']]}
            else:
                recommended_movies_details[tmdb_id]['users'].append(user_id)
                recommended_movies_details[tmdb_id]['ratings'].append(row['rating_val'])

print(f"Found {len(recommended_movies_details)} potential movie recommendations")

# Limit to top 50 based on the number of users recommending the movie
recommended_movies_ids = sorted(recommended_movies_details, key=lambda x: len(recommended_movies_details[x]['users']), reverse=True)[:50]

# Fetch movie titles and stats
recommended_titles_and_stats = []
for tmdb_id in recommended_movies_ids:
    movie_matches = movies_clean[movies_clean['tmdb_id'] == tmdb_id]
    if len(movie_matches) > 0:
        movie_title = movie_matches['movie_title'].iloc[0]
        avg_rating = np.mean(recommended_movies_details[tmdb_id]['ratings'])
        num_users = len(recommended_movies_details[tmdb_id]['users'])
        recommended_titles_and_stats.append({
            'title': movie_title,
            'average_rating': avg_rating,
            'recommended_by_users_count': num_users,
        })

# Sort by average rating
recommended_titles_and_stats = sorted(recommended_titles_and_stats, key=lambda x: x['average_rating'], reverse=True)

# Display recommended movies along with stats
print(f"\nTop {len(recommended_titles_and_stats)} Movie Recommendations:")
for i, movie in enumerate(recommended_titles_and_stats, 1):
    print(f"{i:2d}. {movie['title']} - Avg Rating: {movie['average_rating']:.2f} (recommended by {movie['recommended_by_users_count']} users)")

You've rated 17 movies


In [None]:
# output to file

with open('data/recommended_movies.txt', 'w') as f:
    for movie in recommended_titles_and_stats:
        f.write(f"Title: {movie['title']}, Avg Rating: {movie['average_rating']:.2f}, Recommended by {movie['recommended_by_users_count']} Users, User IDs: {movie['recommended_by_user_ids']}\n")