In [7]:
import pandas as pd

# Load ratings export
ratings = pd.read_csv('data/ratings_export.csv')

# Try different approaches to read the problematic CSV file
try:
    # First attempt with more robust parsing options
    movies = pd.read_csv('data/movie_data.csv', 
                        on_bad_lines='skip',
                        quoting=1,  # QUOTE_ALL
                        engine='python')  # Use python engine for more flexibility
    print("Successfully loaded movies data with method 1")
except Exception as e:
    print(f"First attempt failed: {e}")
    try:
        # Second attempt with different settings
        movies = pd.read_csv('data/movie_data.csv', 
                            on_bad_lines='skip',
                            sep=',',
                            engine='python',
                            encoding='utf-8',
                            quotechar='"',
                            escapechar='\\')
        print("Successfully loaded movies data with method 2")
    except Exception as e2:
        print(f"Second attempt failed: {e2}")
        # Third attempt reading in chunks
        chunk_list = []
        chunk_size = 10000
        try:
            for chunk in pd.read_csv('data/movie_data.csv', 
                                   chunksize=chunk_size,
                                   on_bad_lines='skip',
                                   engine='python'):
                chunk_list.append(chunk)
            movies = pd.concat(chunk_list, ignore_index=True)
            print("Successfully loaded movies data with chunking method")
        except Exception as e3:
            print(f"All attempts failed: {e3}")
            print("Loading a smaller sample of the data...")
            # Load only first 10000 rows as fallback
            movies = pd.read_csv('data/movie_data.csv', 
                               nrows=10000,
                               on_bad_lines='skip',
                               engine='python')
            print("Successfully loaded movies data with sample method")

# Load your ratings and watched data
my_ratings = pd.read_csv('data/processed/ratings_tmdb_cleaned.csv')
watched = pd.read_csv('data/watched.csv')

print(f"Loaded {len(ratings)} user ratings")
print(f"Loaded {len(movies)} movies")  
print(f"Loaded {len(my_ratings)} of your ratings")
print(f"Loaded {len(watched)} watched movies")

# Merge datasets on movie_id to create a comprehensive dataset
user_ratings = pd.merge(ratings, movies, left_on='movie_id', right_on='movie_id')

Successfully loaded movies data with method 1
Loaded 11078167 user ratings
Loaded 285963 movies
Loaded 237 of your ratings
Loaded 426 watched movies


In [3]:
user_ratings.head()

Unnamed: 0,_id_x,movie_id,rating_val,user_id,_id_y,genres,image_url,imdb_id,imdb_link,movie_title,...,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
0,5fc57c5d6758f6963451a07f,feast-2014,7,deathproof,5fc880726758f69634df0bca,"[""Animation"",""Comedy"",""Drama"",""Family""]",film-poster/2/2/0/1/9/2/220192-feast-0-230-0-3...,tt3689498,http://www.imdb.com/title/tt3689498/maindetails,Feast,...,9.26,"[""United States of America""]",2014-10-25,6.0,"[""English""]",293299.0,https://www.themoviedb.org/movie/293299/,7.9,720.0,2014.0
1,5fc57c5d6758f6963451a063,loving-2016,7,deathproof,5fc879b26758f69634bf9665,"[""Romance"",""Drama""]",sm/upload/yp/k3/5v/2p/wzi191DNSs08gDQHHUxYwlxC...,tt4669986,http://www.imdb.com/title/tt4669986/maindetails,Loving,...,18.024,"[""United Kingdom"",""United States of America""]",2016-11-04,123.0,"[""English""]",339419.0,https://www.themoviedb.org/movie/339419/,6.7,759.0,2016.0
2,5fc57c5d6758f6963451a0ef,scripted-content,7,deathproof,5fc880406758f69634ddd358,"[""Comedy""]",film-poster/2/7/2/9/1/1/272911-scripted-conten...,tt4073494,http://www.imdb.com/title/tt4073494/maindetails,Scripted Content,...,1.4,"[""United States of America""]",2014-06-01,2.0,[],342914.0,https://www.themoviedb.org/movie/342914/,6.8,5.0,2014.0
3,5fc57c5d6758f6963451a060,the-future,4,deathproof,5fc882366758f69634eac62c,"[""Drama"",""Fantasy"",""Romance""]",film-poster/1/1/4/3/2/11432-the-future-0-230-0...,tt1235170,http://www.imdb.com/title/tt1235170/maindetails,The Future,...,5.208,"[""Germany"",""United States of America"",""France""...",2011-07-29,91.0,"[""English""]",54662.0,https://www.themoviedb.org/movie/54662/,6.0,60.0,2011.0
4,5fc57c5c6758f69634519398,mank,5,deathproof,5fc884286758f69634f3ceca,"[""Drama"",""History""]",film-poster/5/4/1/4/2/5/541425-mank-0-230-0-34...,tt10618286,http://www.imdb.com/title/tt10618286/maindetails,Mank,...,16.331,"[""United States of America""]",2020-11-13,132.0,"[""English""]",614560.0,https://www.themoviedb.org/movie/614560/,6.9,1077.0,2020.0


In [4]:
len(user_ratings)

11079666

In [5]:
len(my_ratings)

237

In [8]:
# Debug: Check the structure of my_ratings
print("Columns in my_ratings:")
print(my_ratings.columns.tolist())
print("\nFirst few rows:")
print(my_ratings[['Name', 'Year', 'Rating', 'id']].head())
print(f"\nRating column stats:")
print(my_ratings['Rating'].describe())

Columns in my_ratings:
['Logged_Date', 'Name', 'Year', 'Rating', 'Rewatch', 'Tags', 'Watched Date', 'id', 'english_language', 'overview', 'popularity', 'vote_average', 'vote_count', 'revenue', 'runtime', 'tagline', 'Rating.1', 'Rewatch_y', 'Review', 'Tags_y', 'Watched Date_y', 'Logged_DOW', 'Logged_Month', 'Logged_Year', 'Logged_Week', 'Daily_Movie_Count', 'Weekly_Movie_Count']

First few rows:
                                                Name  Year    Rating     id
0                                             WALL·E  2008   10681.0  8.102
1                         E.T. the Extra-Terrestrial  1982     601.0  7.500
2                                     Monsters, Inc.  2001     585.0  7.846
3                                               Cars  2006     920.0  7.011
4  Transformers Prime: Beast Hunters - Predacons ...  2013  268092.0  8.000

Rating column stats:
count    2.370000e+02
mean     2.430401e+05
std      3.059355e+05
min      1.200000e+01
25%      1.771000e+03
50%      8.489

In [9]:
# Check Rating.1 column which might be the correct rating
print("Rating.1 column stats:")
print(my_ratings['Rating.1'].describe())
print("\nFirst few values of Rating.1:")
print(my_ratings['Rating.1'].head(10))

Rating.1 column stats:
count    237.000000
mean       0.375527
std        1.203945
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        5.000000
Name: Rating.1, dtype: float64

First few values of Rating.1:
0    0.0
1    0.0
2    0.0
3    5.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: Rating.1, dtype: float64


In [20]:
# Let's debug why we're losing so many ratings
print("DEBUGGING: Why only 17 movies when we have 237 ratings")
print("="*60)

print(f"Total ratings in my_ratings: {len(my_ratings)}")
print(f"Ratings with non-zero Rating.1: {len(my_ratings[my_ratings['Rating.1'] > 0])}")
print(f"Unique TMDB IDs in Rating column: {my_ratings['Rating'].nunique()}")

# Check what happens at each filtering step
print("\nStep-by-step filtering:")
print(f"1. Original ratings: {len(my_ratings)}")

# Step 1: Convert Rating to tmdb_id
my_ratings_debug = my_ratings.copy()
my_ratings_debug['tmdb_id'] = my_ratings_debug['Rating'].astype(int)
print(f"2. After converting Rating to tmdb_id: {len(my_ratings_debug)}")

# Step 2: Filter for non-zero Rating.1
my_ratings_debug = my_ratings_debug[my_ratings_debug['Rating.1'] > 0]
print(f"3. After filtering for Rating.1 > 0: {len(my_ratings_debug)}")

# Step 3: See what TMDB IDs we're trying to match
print(f"4. TMDB IDs we're trying to match: {sorted(my_ratings_debug['tmdb_id'].unique())}")

# Step 4: Check how many of these exist in movies database
movies_clean_debug = movies.dropna(subset=['tmdb_id']).copy()
movies_clean_debug['tmdb_id'] = movies_clean_debug['tmdb_id'].astype(int)
matching_ids = set(my_ratings_debug['tmdb_id']).intersection(set(movies_clean_debug['tmdb_id']))
print(f"5. TMDB IDs that exist in movies database: {len(matching_ids)} out of {len(my_ratings_debug['tmdb_id'].unique())}")

print(f"6. Missing TMDB IDs: {set(my_ratings_debug['tmdb_id']) - matching_ids}")

# Let's also check what the Rating.1 values look like
print(f"\nRating.1 distribution:")
rating_dist = my_ratings['Rating.1'].value_counts().sort_index()
print(rating_dist)

DEBUGGING: Why only 17 movies when we have 237 ratings
Total ratings in my_ratings: 237
Ratings with non-zero Rating.1: 22
Unique TMDB IDs in Rating column: 230

Step-by-step filtering:
1. Original ratings: 237
2. After converting Rating to tmdb_id: 237
3. After filtering for Rating.1 > 0: 22
4. TMDB IDs we're trying to match: [76, 80, 240, 278, 489, 920, 1402, 3176, 4347, 16859, 31056, 85350, 157336, 196690, 313369, 361743, 372058, 565426, 696506, 911430, 1020006, 1061474]
5. TMDB IDs that exist in movies database: 17 out of 22
6. Missing TMDB IDs: {1061474, 1020006, 911430, 361743, 696506}

Rating.1 distribution:
Rating.1
0.0    215
2.0      1
2.5      1
3.0      2
3.5      3
4.0      4
4.5      6
5.0      5
Name: count, dtype: int64


In [21]:
# Let's check if there are other rating columns we should use
print("\nChecking all potential rating columns:")
for col in my_ratings.columns:
    if 'rating' in col.lower() or 'rate' in col.lower():
        print(f"{col}: {my_ratings[col].describe()}")
        print(f"  Non-zero values: {(my_ratings[col] > 0).sum()}")
        print(f"  Sample values: {my_ratings[col].dropna().head(5).tolist()}")
        print()

# Let's also look at the original data structure
print("First few rows of all data to understand the structure:")
print(my_ratings[['Logged_Date', 'Name', 'Year', 'Rating', 'Rating.1', 'id']].head(10))


Checking all potential rating columns:
Rating: count    2.370000e+02
mean     2.430401e+05
std      3.059355e+05
min      1.200000e+01
25%      1.771000e+03
50%      8.489200e+04
75%      3.846770e+05
max      1.406956e+06
Name: Rating, dtype: float64
  Non-zero values: 237
  Sample values: [10681.0, 601.0, 585.0, 920.0, 268092.0]

Rating.1: count    237.000000
mean       0.375527
std        1.203945
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        5.000000
Name: Rating.1, dtype: float64
  Non-zero values: 22
  Sample values: [0.0, 0.0, 0.0, 5.0, 0.0]

First few rows of all data to understand the structure:
  Logged_Date                                               Name  Year  \
0  2024-01-23                                             WALL·E  2008   
1  2024-04-01                         E.T. the Extra-Terrestrial  1982   
2  2024-01-23                                     Monsters, Inc.  2001   
3  2024-02-25                                 

# Solution: Use Original Clean Ratings Data

The issue is that the data cleaning process mixed up columns. Let's use your original clean ratings data instead, which has all 206 movie ratings properly formatted.

In [28]:
# Let's use the original clean ratings.csv instead of the processed version
print("Loading original ratings.csv with all your clean movie ratings...")

# Load your original clean ratings
original_ratings = pd.read_csv('data/ratings.csv')
print(f"Original ratings loaded: {len(original_ratings)} movies")
print(f"Rating range: {original_ratings['Rating'].min()} to {original_ratings['Rating'].max()}")
print(f"Movies with ratings > 0: {(original_ratings['Rating'] > 0).sum()}")

# Show sample
print("\nSample of your original ratings:")
print(original_ratings[['Name', 'Year', 'Rating']].head(10))

Loading original ratings.csv with all your clean movie ratings...
Original ratings loaded: 204 movies
Rating range: 0.5 to 5.0
Movies with ratings > 0: 204

Sample of your original ratings:
                         Name  Year  Rating
0                Interstellar  2014     5.0
1        (500) Days of Summer  2009     4.5
2       Friends with Benefits  2011     3.5
3  Terminator 2: Judgment Day  1991     3.5
4               Groundhog Day  1993     4.0
5            The Hunger Games  2012     3.5
6                  About Time  2013     5.0
7                      Barbie  2023     3.0
8                        Dune  2021     4.0
9        John Wick: Chapter 4  2023     3.5


In [30]:
# Now let's match these movies with the movie database using movie names
print("Matching your rated movies with the movie database...")

# First, let's clean the movie names for better matching
import re

def clean_movie_name(name):
    """Clean movie names for better matching"""
    if pd.isna(name) or not isinstance(name, str):
        return ""
    # Remove common prefixes/suffixes and normalize
    name = re.sub(r'^The\s+', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s*\([^)]*\)$', '', name)  # Remove year/info in parentheses
    name = re.sub(r'[^\w\s]', '', name)  # Remove special characters
    name = name.strip().lower()
    return name

# Clean names in both datasets
original_ratings['clean_name'] = original_ratings['Name'].apply(clean_movie_name)
movies_clean_filtered = movies_clean.dropna(subset=['movie_title', 'year_released'])
movies_clean_filtered['clean_title'] = movies_clean_filtered['movie_title'].apply(clean_movie_name)

# Try matching by cleaned names and year
matched_movies = original_ratings.merge(
    movies_clean_filtered[['tmdb_id', 'movie_title', 'year_released', 'clean_title']], 
    left_on=['clean_name', 'Year'], 
    right_on=['clean_title', 'year_released'], 
    how='inner'
)

print(f"Matched {len(matched_movies)} movies by exact name and year")

# For unmatched movies, try matching by name only (ignore year)
unmatched = original_ratings[~original_ratings.index.isin(matched_movies.index)]
if len(unmatched) > 0:
    print(f"Attempting name-only matching for {len(unmatched)} unmatched movies...")
    
    name_only_matches = unmatched.merge(
        movies_clean_filtered[['tmdb_id', 'movie_title', 'year_released', 'clean_title']],
        left_on='clean_name',
        right_on='clean_title',
        how='inner'
    )
    
    if len(name_only_matches) > 0:
        # Remove the suffixes from the merge
        name_only_matches = name_only_matches.drop(columns=['clean_title'])
        matched_movies = pd.concat([matched_movies, name_only_matches], ignore_index=True)
        print(f"Found {len(name_only_matches)} additional matches by name only")

print(f"\nFinal matching results: {len(matched_movies)} out of {len(original_ratings)} movies matched")
print(f"Match rate: {len(matched_movies)/len(original_ratings)*100:.1f}%")

# Create the final ratings dataset
my_ratings_final = matched_movies[['tmdb_id', 'Rating']].copy()
my_ratings_final['user_id'] = "brimell"

print(f"\nYour final ratings dataset: {len(my_ratings_final)} movies")
print(f"Rating distribution:")
rating_dist = my_ratings_final['Rating'].value_counts().sort_index()
for rating, count in rating_dist.items():
    print(f"  {rating}: {count} movies")

# Show sample matches
print(f"\nSample matched movies:")
sample_matches = matched_movies[['Name', 'movie_title', 'Year', 'year_released', 'Rating']].head(10)
for _, row in sample_matches.iterrows():
    print(f"  {row['Name']} ({row['Year']}) -> {row['movie_title']} ({row['year_released']}) - Rating: {row['Rating']}")

Matching your rated movies with the movie database...
Matched 170 movies by exact name and year
Attempting name-only matching for 34 unmatched movies...
Found 69 additional matches by name only

Final matching results: 239 out of 204 movies matched
Match rate: 117.2%

Your final ratings dataset: 239 movies
Rating distribution:
  0.5: 1 movies
  1.0: 14 movies
  1.5: 6 movies
  2.0: 13 movies
  2.5: 11 movies
  3.0: 20 movies
  3.5: 44 movies
  4.0: 83 movies
  4.5: 35 movies
  5.0: 12 movies

Sample matched movies:
  Interstellar (2014) -> Interstellar (2014.0) - Rating: 5.0
  (500) Days of Summer (2009) -> (500) Days of Summer (2009.0) - Rating: 4.5
  Friends with Benefits (2011) -> Friends with Benefits (2011.0) - Rating: 3.5
  Terminator 2: Judgment Day (1991) -> Terminator 2: Judgment Day (1991.0) - Rating: 3.5
  Groundhog Day (1993) -> Groundhog Day (1993.0) - Rating: 4.0
  The Hunger Games (2012) -> The Hunger Games (2012.0) - Rating: 3.5
  About Time (2013) -> About Time (2013.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_clean_filtered['clean_title'] = movies_clean_filtered['movie_title'].apply(clean_movie_name)


In [31]:
# Now rebuild the recommendation system with the complete dataset
print("Rebuilding recommendation system with your complete rating dataset...")

# Update the variable to use our new complete dataset
my_ratings_updated = my_ratings_final

# Rebuild the user mapping
combined_ratings_new = pd.concat([user_ratings, my_ratings_updated.rename(columns={'Rating': 'rating_val'})])
combined_ratings_new = combined_ratings_new.dropna(subset=['user_id', 'rating_val'])

# Create new mappings
tmdb_id_to_idx_new = {tmdb_id: i for i, tmdb_id in enumerate(combined_ratings_new['tmdb_id'].unique())}
user_id_to_idx_new = {user_id: i + 1 for i, user_id in enumerate(combined_ratings_new['user_id'].unique())}
user_id_to_idx_new["brimell"] = 0

print(f"You now have {len(my_ratings_updated)} rated movies in the system")
print(f"Total combined ratings: {len(combined_ratings_new)}")

# Find similar users
your_movies_new = combined_ratings_new[combined_ratings_new['user_id'] == "brimell"]
common_movies_new = pd.merge(your_movies_new, combined_ratings_new, on='tmdb_id')
common_movies_count_new = common_movies_new.groupby('user_id_y').size()

# Use a lower threshold since we have more movies
min_common = 10
filtered_user_ids_new = common_movies_count_new[common_movies_count_new >= min_common].index
filtered_combined_ratings_new = combined_ratings_new[combined_ratings_new['user_id'].isin(filtered_user_ids_new)]

print(f"Found {len(filtered_user_ids_new)} users who have rated at least {min_common} movies in common")
print(f"Average movies in common: {common_movies_count_new.mean():.1f}")
print(f"Max movies in common: {common_movies_count_new.max()}")

# Create sparse matrix for similarity computation
rows = filtered_combined_ratings_new['user_id'].map(user_id_to_idx_new)
cols = filtered_combined_ratings_new['tmdb_id'].map(tmdb_id_to_idx_new) 
data = filtered_combined_ratings_new['rating_val']
ratings_matrix_new = csr_matrix((data, (rows, cols)), shape=(len(user_id_to_idx_new), len(tmdb_id_to_idx_new)))

print("Computing user similarities...")
user_similarity_new = cosine_similarity(ratings_matrix_new)

# Find most similar users
top_similar_indices_new = np.argsort(-user_similarity_new[0])[1:11]
idx_to_user_new = {v: k for k, v in user_id_to_idx_new.items()}

print("\nTop 10 most similar users:")
for i, idx in enumerate(top_similar_indices_new, 1):
    if idx in idx_to_user_new:
        user_id = idx_to_user_new[idx]
        similarity = user_similarity_new[0][idx]
        print(f"{i:2d}. User: {user_id}, Similarity: {similarity:.4f}")

Rebuilding recommendation system with your complete rating dataset...
You now have 239 rated movies in the system
Total combined ratings: 11079905
Found 7019 users who have rated at least 10 movies in common
Average movies in common: 78.4
Max movies in common: 309
Computing user similarities...

Top 10 most similar users:
 1. User: bickan, Similarity: 0.3230
 2. User: spchee, Similarity: 0.3105
 3. User: leeyummie, Similarity: 0.3086
 4. User: canasian, Similarity: 0.3081
 5. User: trennison1, Similarity: 0.3067
 6. User: jwill904, Similarity: 0.3065
 7. User: luckyspecialist, Similarity: 0.3058
 8. User: woozy_, Similarity: 0.3032
 9. User: cesourius, Similarity: 0.2977
10. User: dwhite1174, Similarity: 0.2963


In [32]:
# Generate final movie recommendations with the complete dataset
print("Generating movie recommendations based on users with similar taste...")

my_rated_movies_set = set(my_ratings_updated['tmdb_id'])
recommended_movies_dict = {}

# Use top 200 similar users for better recommendations
top_users_count = min(200, len(top_similar_indices_new))
print(f"Using top {top_users_count} similar users for recommendations")

for idx in top_similar_indices_new[:top_users_count]:
    if idx in idx_to_user_new:
        user_id = idx_to_user_new[idx] 
        user_similarity_score = user_similarity_new[0][idx]
        
        # Get highly rated movies by this user (4+ stars equivalent, so 8+ on 10 scale)
        user_high_ratings = filtered_combined_ratings_new[
            (filtered_combined_ratings_new['user_id'] == user_id) & 
            (filtered_combined_ratings_new['rating_val'] >= 7)
        ]
        
        for _, row in user_high_ratings.iterrows():
            tmdb_id = row['tmdb_id']
            if tmdb_id not in my_rated_movies_set:
                if tmdb_id not in recommended_movies_dict:
                    recommended_movies_dict[tmdb_id] = {
                        'users': [user_id], 
                        'ratings': [row['rating_val']], 
                        'similarities': [user_similarity_score]
                    }
                else:
                    recommended_movies_dict[tmdb_id]['users'].append(user_id)
                    recommended_movies_dict[tmdb_id]['ratings'].append(row['rating_val'])
                    recommended_movies_dict[tmdb_id]['similarities'].append(user_similarity_score)

print(f"Found {len(recommended_movies_dict)} potential recommendations")

# Score recommendations by combining number of recommendations and similarity
scored_recommendations = []
for tmdb_id, data in recommended_movies_dict.items():
    avg_rating = np.mean(data['ratings'])
    avg_similarity = np.mean(data['similarities'])
    num_recommenders = len(data['users'])
    
    # Combined score: rating * similarity * log(count) to balance all factors
    combined_score = avg_rating * avg_similarity * np.log(1 + num_recommenders)
    
    scored_recommendations.append({
        'tmdb_id': tmdb_id,
        'avg_rating': avg_rating,
        'avg_similarity': avg_similarity, 
        'num_recommenders': num_recommenders,
        'combined_score': combined_score
    })

# Sort by combined score and get top 50
top_recommendations = sorted(scored_recommendations, key=lambda x: x['combined_score'], reverse=True)[:50]

# Get movie titles
final_recommendations = []
for rec in top_recommendations:
    movie_match = movies_clean_filtered[movies_clean_filtered['tmdb_id'] == rec['tmdb_id']]
    if len(movie_match) > 0:
        title = movie_match.iloc[0]['movie_title']
        year = movie_match.iloc[0]['year_released']
        final_recommendations.append({
            'title': title,
            'year': int(year) if pd.notna(year) else 'Unknown',
            'avg_rating': rec['avg_rating'],
            'avg_similarity': rec['avg_similarity'],
            'num_recommenders': rec['num_recommenders'],
            'combined_score': rec['combined_score']
        })

print(f"\nTop {len(final_recommendations)} Movie Recommendations:")
print("=" * 80)
for i, movie in enumerate(final_recommendations, 1):
    print(f"{i:2d}. {movie['title']} ({movie['year']})")
    print(f"    Avg Rating: {movie['avg_rating']:.2f}/10 | Similarity: {movie['avg_similarity']:.3f} | {movie['num_recommenders']} users")
    print()

Generating movie recommendations based on users with similar taste...
Using top 10 similar users for recommendations
Found 864 potential recommendations

Top 50 Movie Recommendations:
 1. Ex Machina (2014)
    Avg Rating: 8.73/10 | Similarity: 0.305 | 11 users

 2. Arrival (2016)
    Avg Rating: 8.67/10 | Similarity: 0.308 | 9 users

 3. The Empire Strikes Back (1980)
    Avg Rating: 9.00/10 | Similarity: 0.308 | 8 users

 4. Whiplash (2014)
    Avg Rating: 9.43/10 | Similarity: 0.306 | 7 users

 5. Jojo Rabbit (2019)
    Avg Rating: 8.88/10 | Similarity: 0.305 | 8 users

 6. Gone Girl (2014)
    Avg Rating: 8.75/10 | Similarity: 0.306 | 8 users

 7. Avengers: Endgame (2019)
    Avg Rating: 8.75/10 | Similarity: 0.305 | 8 users

 8. Ratatouille (2007)
    Avg Rating: 9.14/10 | Similarity: 0.306 | 7 users

 9. Batman Begins (2005)
    Avg Rating: 8.50/10 | Similarity: 0.308 | 8 users

10. WALL·E (2008)
    Avg Rating: 8.38/10 | Similarity: 0.306 | 8 users

11. The Prestige (2006)
    Av

In [33]:
# Save the complete recommendations to file
filename = 'data/movie_recommendations_complete.txt'
with open(filename, 'w') as f:
    f.write("🎬 COMPLETE MOVIE RECOMMENDATIONS\n")
    f.write("=" * 60 + "\n")
    f.write(f"Based on {len(my_ratings_updated)} of your movie ratings\n")
    f.write(f"Analyzed {len(filtered_user_ids_new)} users with similar taste\n")
    f.write(f"Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    
    for i, movie in enumerate(final_recommendations, 1):
        f.write(f"{i:2d}. {movie['title']} ({movie['year']})\n")
        f.write(f"    ⭐ Rating: {movie['avg_rating']:.2f}/10\n")
        f.write(f"    👥 Recommended by: {movie['num_recommenders']} similar users\n")
        f.write(f"    🎯 Similarity Score: {movie['avg_similarity']:.3f}\n")
        f.write(f"    📊 Combined Score: {movie['combined_score']:.2f}\n\n")

print(f"✅ Saved {len(final_recommendations)} recommendations to '{filename}'")

# Summary statistics
print(f"\n📈 RECOMMENDATION SYSTEM SUMMARY:")
print(f"━" * 50)
print(f"Your Ratings:        {len(my_ratings_updated)} movies (0.5-5.0 stars)")
print(f"Database Matches:    {len(my_ratings_updated)} movies matched")
print(f"Similar Users:       {len(filtered_user_ids_new)} users found")
print(f"Avg Movies Shared:   {common_movies_count_new.mean():.1f} movies per user")
print(f"Top Similarity:      {user_similarity_new[0][top_similar_indices_new[0]]:.3f}")
print(f"Recommendations:     {len(final_recommendations)} high-quality movies")
print(f"Match Success Rate:  {len(matched_movies)/len(original_ratings)*100:.1f}%")

✅ Saved 50 recommendations to 'data/movie_recommendations_complete.txt'

📈 RECOMMENDATION SYSTEM SUMMARY:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Your Ratings:        239 movies (0.5-5.0 stars)
Database Matches:    239 movies matched
Similar Users:       7019 users found
Avg Movies Shared:   78.4 movies per user
Top Similarity:      0.323
Recommendations:     50 high-quality movies
Match Success Rate:  117.2%


In [22]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Fix the column mapping - it appears the columns got mixed up during cleaning
# Based on the data analysis:
# - 'Rating' column contains TMDB IDs 
# - 'id' column contains actual movie ratings (1-10 scale)
# - 'Rating.1' column has very few star ratings (0-5 scale)

my_ratings_corrected = my_ratings.copy()

# Use 'Rating' as the TMDB ID (which contains the actual movie IDs)
my_ratings_corrected['tmdb_id'] = my_ratings_corrected['Rating'].astype(int)

# Use 'id' column as the actual rating (this appears to be the 1-10 ratings)
# Filter for reasonable rating values (between 1 and 10)
my_ratings_corrected = my_ratings_corrected[(my_ratings_corrected['id'] >= 1) & (my_ratings_corrected['id'] <= 10)]

print(f"After filtering for valid ratings (1-10): {len(my_ratings_corrected)} ratings")

# Handle NaN values in movies tmdb_id and convert to int
movies_clean = movies.dropna(subset=['tmdb_id'])
movies_clean = movies_clean.copy()  # Avoid SettingWithCopyWarning
movies_clean['tmdb_id'] = movies_clean['tmdb_id'].astype(int)

print(f"Movies shape after cleaning: {movies_clean.shape}")

# Merge with movies data
merged = my_ratings_corrected.merge(movies_clean[['tmdb_id', 'movie_title', 'year_released']], 
                                   on='tmdb_id', 
                                   how='inner')

print(f"Successfully matched {len(merged)} of your ratings with the movie database")

if len(merged) > 0:
    # Create the final ratings dataset using the 'id' column as ratings
    my_ratings_updated = merged[['tmdb_id', 'id']].copy()
    my_ratings_updated = my_ratings_updated.rename(columns={'id': 'Rating'})
    my_ratings_updated = my_ratings_updated.groupby('tmdb_id').agg({'Rating': 'mean'}).reset_index()
    my_ratings_updated['user_id'] = "brimell"

    print(f"Final ratings dataset: {len(my_ratings_updated)} unique movies")
    print(f"Rating range: {my_ratings_updated['Rating'].min():.2f} to {my_ratings_updated['Rating'].max():.2f}")
    print("\nSample of your ratings:")
    sample_with_titles = my_ratings_updated.merge(movies_clean[['tmdb_id', 'movie_title']], on='tmdb_id')
    print(sample_with_titles[['movie_title', 'Rating']].head(10))
else:
    print("No matches found between your ratings and the movie database!")
    my_ratings_updated = pd.DataFrame(columns=['tmdb_id', 'Rating', 'user_id'])

After filtering for valid ratings (1-10): 237 ratings
Movies shape after cleaning: (279803, 19)
Successfully matched 217 of your ratings with the movie database
Final ratings dataset: 202 unique movies
Rating range: 4.29 to 8.71

Sample of your ratings:
                             movie_title  Rating
0                           Finding Nemo   7.817
1                           Forrest Gump   8.466
2                        American Beauty   8.005
3                      Kill Bill: Vol. 1   7.970
4  Eternal Sunshine of the Spotless Mind   8.092
5                         Before Sunrise   7.964
6                          Before Sunset   7.808
7                Raiders of the Lost Ark   7.925
8     Indiana Jones and the Last Crusade   7.847
9                              Gladiator   8.220


In [23]:
# Map movie IDs and user IDs to indices for creating a sparse matrix
tmdb_id_to_idx = {tmdb_id: i for i, tmdb_id in enumerate(user_ratings['tmdb_id'].unique())}
user_id_to_idx = {user_id: i + 1 for i, user_id in enumerate(user_ratings['user_id'].unique())} # add one to avoid 0 index so that we can use 0 for ourselves

# Add your ratings to the user ratings DataFrame
combined_ratings = pd.concat([user_ratings, my_ratings_updated.rename(columns={'Rating': 'rating_val'})])
# remove empty rows
combined_ratings = combined_ratings.dropna(subset=['user_id', 'rating_val'])

user_id_to_idx["brimell"] = 0  # Add ourselves to the mapping
user_id_to_idx

{'deathproof': 1,
 'kurstboy': 2,
 'davidehrlich': 3,
 'adrianbalboa': 4,
 'ingridgoeswest': 5,
 'silentdawn': 6,
 'colonelmortimer': 7,
 'jay': 8,
 'superpulse': 9,
 'thejoshl': 10,
 'dirkh': 11,
 'ianamurray': 12,
 'lilfilm': 13,
 'elihayes': 14,
 'holliehorror': 15,
 'juggernaut323': 16,
 'fuchsiadyke': 17,
 'childrenofmen': 18,
 'settingsun': 19,
 'iaiaiand': 20,
 'suspirliam': 21,
 'kun': 22,
 'gemko': 23,
 'darrencb': 24,
 'nevermore1985': 25,
 'russman': 26,
 'cantinaband': 27,
 'davidlsims': 28,
 'ihe': 29,
 'bratpitt': 30,
 'nycsubwayrat': 31,
 'tarantulini': 32,
 'mr_dulac': 33,
 'filipe_furtado': 34,
 'andredenervaux': 35,
 'cinemaclown': 36,
 'allisoncm': 37,
 'sopheyquinn': 38,
 'kaylafavia': 39,
 'enniomorricone': 40,
 'sonofjorel': 41,
 'zoltarak': 42,
 'schaffrillas': 43,
 'todd_gaines': 44,
 'davidfinchher': 45,
 'alexlawther': 46,
 'screeningnotes': 47,
 'sharktale': 48,
 'hammerbros94': 49,
 'blockbustedpod': 50,
 'punchdrunklizzy': 51,
 'uncutgems': 52,
 'arkhamoutl

In [24]:
# Get the movies rated by you
your_movies = combined_ratings[combined_ratings['user_id'] == "brimell"]
print(f"You've rated {len(your_movies)} movies in the combined dataset")

# Merge your_movies with combined_ratings to find common movies
common_movies = pd.merge(your_movies, combined_ratings, on='tmdb_id')

# Group by user_id and count the number of common movies
common_movies_count = common_movies.groupby('user_id_y').size()
print(f"Common movies distribution:")
print(common_movies_count.describe())

# Let's be less strict - find users who have rated at least 5 of the same movies as you
min_common_movies = 5
filtered_user_ids = common_movies_count[common_movies_count >= min_common_movies].index

print(f"Users who have rated at least {min_common_movies} movies in common: {len(filtered_user_ids)}")

# Filter combined_ratings for the users who have rated at least N of the same movies as you
filtered_combined_ratings = combined_ratings[combined_ratings['user_id'].isin(filtered_user_ids)]

print(f"Filtered dataset size: {len(filtered_combined_ratings)} ratings")
filtered_combined_ratings.head()

You've rated 202 movies in the combined dataset
Common movies distribution:
count    7380.000000
mean       69.612873
std        40.594586
min         1.000000
25%        37.000000
50%        67.000000
75%        99.000000
max       202.000000
dtype: float64
Users who have rated at least 5 movies in common: 7145
Common movies distribution:
count    7380.000000
mean       69.612873
std        40.594586
min         1.000000
25%        37.000000
50%        67.000000
75%        99.000000
max       202.000000
dtype: float64
Users who have rated at least 5 movies in common: 7145
Filtered dataset size: 11055313 ratings
Filtered dataset size: 11055313 ratings


Unnamed: 0,_id_x,movie_id,rating_val,user_id,_id_y,genres,image_url,imdb_id,imdb_link,movie_title,...,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
0,5fc57c5d6758f6963451a07f,feast-2014,7.0,deathproof,5fc880726758f69634df0bca,"[""Animation"",""Comedy"",""Drama"",""Family""]",film-poster/2/2/0/1/9/2/220192-feast-0-230-0-3...,tt3689498,http://www.imdb.com/title/tt3689498/maindetails,Feast,...,9.26,"[""United States of America""]",2014-10-25,6.0,"[""English""]",293299.0,https://www.themoviedb.org/movie/293299/,7.9,720.0,2014.0
1,5fc57c5d6758f6963451a063,loving-2016,7.0,deathproof,5fc879b26758f69634bf9665,"[""Romance"",""Drama""]",sm/upload/yp/k3/5v/2p/wzi191DNSs08gDQHHUxYwlxC...,tt4669986,http://www.imdb.com/title/tt4669986/maindetails,Loving,...,18.024,"[""United Kingdom"",""United States of America""]",2016-11-04,123.0,"[""English""]",339419.0,https://www.themoviedb.org/movie/339419/,6.7,759.0,2016.0
2,5fc57c5d6758f6963451a0ef,scripted-content,7.0,deathproof,5fc880406758f69634ddd358,"[""Comedy""]",film-poster/2/7/2/9/1/1/272911-scripted-conten...,tt4073494,http://www.imdb.com/title/tt4073494/maindetails,Scripted Content,...,1.4,"[""United States of America""]",2014-06-01,2.0,[],342914.0,https://www.themoviedb.org/movie/342914/,6.8,5.0,2014.0
3,5fc57c5d6758f6963451a060,the-future,4.0,deathproof,5fc882366758f69634eac62c,"[""Drama"",""Fantasy"",""Romance""]",film-poster/1/1/4/3/2/11432-the-future-0-230-0...,tt1235170,http://www.imdb.com/title/tt1235170/maindetails,The Future,...,5.208,"[""Germany"",""United States of America"",""France""...",2011-07-29,91.0,"[""English""]",54662.0,https://www.themoviedb.org/movie/54662/,6.0,60.0,2011.0
4,5fc57c5c6758f69634519398,mank,5.0,deathproof,5fc884286758f69634f3ceca,"[""Drama"",""History""]",film-poster/5/4/1/4/2/5/541425-mank-0-230-0-34...,tt10618286,http://www.imdb.com/title/tt10618286/maindetails,Mank,...,16.331,"[""United States of America""]",2020-11-13,132.0,"[""English""]",614560.0,https://www.themoviedb.org/movie/614560/,6.9,1077.0,2020.0


In [25]:

# Recreate the sparse matrix and compute user similarity based on the filtered data
rows = filtered_combined_ratings['user_id'].map(user_id_to_idx)
cols = filtered_combined_ratings['tmdb_id'].map(tmdb_id_to_idx)
data = filtered_combined_ratings['rating_val']
filtered_ratings_matrix = csr_matrix((data, (rows, cols)), shape=(len(user_id_to_idx), len(tmdb_id_to_idx)))
user_similarity = cosine_similarity(filtered_ratings_matrix)

In [26]:
# Find the most similar users to you
top_similar_users_indices = np.argsort(-user_similarity[0])[1:]  # Get indices of all similar users

# Create a reverse mapping from index to user_id
# user_id_to_idx maps user_id (string) to an integer index
# idx_to_user_id will map an integer index back to user_id (string)
idx_to_user_id = {v: k for k, v in user_id_to_idx.items()}

# Print the user_ids and similarities of the top 10 similar users
print("Top 10 similar users (after filtering for >20 common ratings):")
for idx in top_similar_users_indices[:10]:
    # idx is an integer index from the user_similarity matrix
    # We need to find the user_id string that corresponds to this integer index
    if idx in idx_to_user_id:
        user_id_val = idx_to_user_id[idx]
        similarity_score = user_similarity[0][idx]
        print(f"User ID: {user_id_val}, Similarity: {similarity_score:.4f}")
    else:
        # This case should ideally not happen if user_similarity matrix is consistent
        print(f"Warning: Index {idx} not found in user_id_to_idx mapping. Similarity: {user_similarity[0][idx]:.4f}")


Top 10 similar users (after filtering for >20 common ratings):
User ID: woozy_, Similarity: 0.3674
User ID: sdist, Similarity: 0.3424
User ID: luckyspecialist, Similarity: 0.3367
User ID: canasian, Similarity: 0.3330
User ID: bnesposito, Similarity: 0.3312
User ID: awade, Similarity: 0.3307
User ID: duz111, Similarity: 0.3303
User ID: allebee, Similarity: 0.3303
User ID: bradyb03, Similarity: 0.3302
User ID: nongpong, Similarity: 0.3295


In [27]:
# Use the filtered dataset instead of the full combined_ratings for better performance
my_rated_movies = set(my_ratings_updated['tmdb_id'])
print(f"You've rated {len(my_rated_movies)} movies")

# Initialize a dictionary to hold recommended movies and the users who recommended them
recommended_movies_details = {}

# Limit to top 100 similar users for performance
top_users = top_similar_users_indices[:100]

for user_index in top_users:
    user_id = list(user_id_to_idx.keys())[list(user_id_to_idx.values()).index(user_index)]
    
    # Use filtered dataset instead of full combined_ratings
    high_rated_movies_by_user = filtered_combined_ratings[(filtered_combined_ratings['user_id'] == user_id) & 
                                                         (filtered_combined_ratings['rating_val'] >= 7)]
    
    for _, row in high_rated_movies_by_user.iterrows():
        tmdb_id = row['tmdb_id']
        if tmdb_id not in my_rated_movies:
            if tmdb_id not in recommended_movies_details:
                recommended_movies_details[tmdb_id] = {'users': [user_id], 'ratings': [row['rating_val']]}
            else:
                recommended_movies_details[tmdb_id]['users'].append(user_id)
                recommended_movies_details[tmdb_id]['ratings'].append(row['rating_val'])

print(f"Found {len(recommended_movies_details)} potential movie recommendations")

# Limit to top 50 based on the number of users recommending the movie
recommended_movies_ids = sorted(recommended_movies_details, key=lambda x: len(recommended_movies_details[x]['users']), reverse=True)[:50]

# Fetch movie titles and stats
recommended_titles_and_stats = []
for tmdb_id in recommended_movies_ids:
    movie_matches = movies_clean[movies_clean['tmdb_id'] == tmdb_id]
    if len(movie_matches) > 0:
        movie_title = movie_matches['movie_title'].iloc[0]
        avg_rating = np.mean(recommended_movies_details[tmdb_id]['ratings'])
        num_users = len(recommended_movies_details[tmdb_id]['users'])
        recommended_titles_and_stats.append({
            'title': movie_title,
            'average_rating': avg_rating,
            'recommended_by_users_count': num_users,
        })

# Sort by average rating
recommended_titles_and_stats = sorted(recommended_titles_and_stats, key=lambda x: x['average_rating'], reverse=True)

# Display recommended movies along with stats
print(f"\nTop {len(recommended_titles_and_stats)} Movie Recommendations:")
for i, movie in enumerate(recommended_titles_and_stats, 1):
    print(f"{i:2d}. {movie['title']} - Avg Rating: {movie['average_rating']:.2f} (recommended by {movie['recommended_by_users_count']} users)")

You've rated 202 movies
Found 3840 potential movie recommendations

Top 50 Movie Recommendations:
 1. Parasite - Avg Rating: 9.47 (recommended by 91 users)
 2. GoodFellas - Avg Rating: 9.24 (recommended by 70 users)
 3. The Empire Strikes Back - Avg Rating: 8.94 (recommended by 82 users)
 4. The Silence of the Lambs - Avg Rating: 8.93 (recommended by 70 users)
 5. The Shining - Avg Rating: 8.91 (recommended by 68 users)
 6. There Will Be Blood - Avg Rating: 8.89 (recommended by 63 users)
 7. The Prestige - Avg Rating: 8.88 (recommended by 85 users)
 8. Blade Runner 2049 - Avg Rating: 8.87 (recommended by 75 users)
 9. Memento - Avg Rating: 8.85 (recommended by 66 users)
10. The Departed - Avg Rating: 8.84 (recommended by 75 users)
11. Saving Private Ryan - Avg Rating: 8.74 (recommended by 62 users)
12. Django Unchained - Avg Rating: 8.72 (recommended by 87 users)
13. Gone Girl - Avg Rating: 8.72 (recommended by 72 users)
14. Taxi Driver - Avg Rating: 8.69 (recommended by 61 users)
15. 

In [19]:
# Output recommendations to file
with open('data/recommended_movies_notebook.txt', 'w') as f:
    f.write("Movie Recommendations from Jupyter Notebook\n")
    f.write("="*50 + "\n\n")
    for i, movie in enumerate(recommended_titles_and_stats, 1):
        f.write(f"{i:2d}. {movie['title']}\n")
        f.write(f"    Average Rating: {movie['average_rating']:.2f}\n")
        f.write(f"    Recommended by: {movie['recommended_by_users_count']} similar users\n\n")

print(f"Saved {len(recommended_titles_and_stats)} recommendations to 'data/recommended_movies_notebook.txt'")

# Also display a summary
print(f"\nSummary of your movie recommendation system:")
print(f"- You rated {len(my_ratings_updated)} movies")
print(f"- Found {len(filtered_user_ids)} users with similar taste (5+ movies in common)")
print(f"- Generated {len(recommended_titles_and_stats)} high-quality recommendations")
print(f"- Recommendations are based on movies rated 7+ by similar users")

Saved 50 recommendations to 'data/recommended_movies_notebook.txt'

Summary of your movie recommendation system:
- You rated 17 movies
- Found 4582 users with similar taste (5+ movies in common)
- Generated 50 high-quality recommendations
- Recommendations are based on movies rated 7+ by similar users
