In [3]:
import pandas as pd

# Load ratings export
ratings = pd.read_csv('data/ratings_export.csv')

# Try different approaches to read the problematic CSV file
try:
    # First attempt with more robust parsing options
    movies = pd.read_csv('data/movie_data.csv', 
                        on_bad_lines='skip',
                        quoting=1,  # QUOTE_ALL
                        engine='python')  # Use python engine for more flexibility
    print("Successfully loaded movies data with method 1")
except Exception as e:
    print(f"First attempt failed: {e}")
    try:
        # Second attempt with different settings
        movies = pd.read_csv('data/movie_data.csv', 
                            on_bad_lines='skip',
                            sep=',',
                            engine='python',
                            encoding='utf-8',
                            quotechar='"',
                            escapechar='\\')
        print("Successfully loaded movies data with method 2")
    except Exception as e2:
        print(f"Second attempt failed: {e2}")
        # Third attempt reading in chunks
        chunk_list = []
        chunk_size = 10000
        try:
            for chunk in pd.read_csv('data/movie_data.csv', 
                                   chunksize=chunk_size,
                                   on_bad_lines='skip',
                                   engine='python'):
                chunk_list.append(chunk)
            movies = pd.concat(chunk_list, ignore_index=True)
            print("Successfully loaded movies data with chunking method")
        except Exception as e3:
            print(f"All attempts failed: {e3}")
            print("Loading a smaller sample of the data...")
            # Load only first 10000 rows as fallback
            movies = pd.read_csv('data/movie_data.csv', 
                               nrows=10000,
                               on_bad_lines='skip',
                               engine='python')
            print("Successfully loaded movies data with sample method")

# Load your ratings (this should work now that we've created the cleaned file)
my_ratings = pd.read_csv('data/processed/ratings_tmdb_cleaned.csv')

print(f"Loaded {len(ratings)} user ratings")
print(f"Loaded {len(movies)} movies")  
print(f"Loaded {len(my_ratings)} of your ratings")

# Merge datasets on movie_id to create a comprehensive dataset
user_ratings = pd.merge(ratings, movies, left_on='movie_id', right_on='movie_id')

Successfully loaded movies data with method 1
Loaded 11078167 user ratings
Loaded 285963 movies
Loaded 237 of your ratings


In [4]:
user_ratings.head()

Unnamed: 0,_id_x,movie_id,rating_val,user_id,_id_y,genres,image_url,imdb_id,imdb_link,movie_title,...,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
0,5fc57c5d6758f6963451a07f,feast-2014,7,deathproof,5fc880726758f69634df0bca,"[""Animation"",""Comedy"",""Drama"",""Family""]",film-poster/2/2/0/1/9/2/220192-feast-0-230-0-3...,tt3689498,http://www.imdb.com/title/tt3689498/maindetails,Feast,...,9.26,"[""United States of America""]",2014-10-25,6.0,"[""English""]",293299.0,https://www.themoviedb.org/movie/293299/,7.9,720.0,2014.0
1,5fc57c5d6758f6963451a063,loving-2016,7,deathproof,5fc879b26758f69634bf9665,"[""Romance"",""Drama""]",sm/upload/yp/k3/5v/2p/wzi191DNSs08gDQHHUxYwlxC...,tt4669986,http://www.imdb.com/title/tt4669986/maindetails,Loving,...,18.024,"[""United Kingdom"",""United States of America""]",2016-11-04,123.0,"[""English""]",339419.0,https://www.themoviedb.org/movie/339419/,6.7,759.0,2016.0
2,5fc57c5d6758f6963451a0ef,scripted-content,7,deathproof,5fc880406758f69634ddd358,"[""Comedy""]",film-poster/2/7/2/9/1/1/272911-scripted-conten...,tt4073494,http://www.imdb.com/title/tt4073494/maindetails,Scripted Content,...,1.4,"[""United States of America""]",2014-06-01,2.0,[],342914.0,https://www.themoviedb.org/movie/342914/,6.8,5.0,2014.0
3,5fc57c5d6758f6963451a060,the-future,4,deathproof,5fc882366758f69634eac62c,"[""Drama"",""Fantasy"",""Romance""]",film-poster/1/1/4/3/2/11432-the-future-0-230-0...,tt1235170,http://www.imdb.com/title/tt1235170/maindetails,The Future,...,5.208,"[""Germany"",""United States of America"",""France""...",2011-07-29,91.0,"[""English""]",54662.0,https://www.themoviedb.org/movie/54662/,6.0,60.0,2011.0
4,5fc57c5c6758f69634519398,mank,5,deathproof,5fc884286758f69634f3ceca,"[""Drama"",""History""]",film-poster/5/4/1/4/2/5/541425-mank-0-230-0-34...,tt10618286,http://www.imdb.com/title/tt10618286/maindetails,Mank,...,16.331,"[""United States of America""]",2020-11-13,132.0,"[""English""]",614560.0,https://www.themoviedb.org/movie/614560/,6.9,1077.0,2020.0


In [5]:
len(user_ratings)

11079666

In [6]:
len(my_ratings)

237

In [7]:
# Filter users who have rated Interstellar 10/10
interstellar_10 = user_ratings[(user_ratings['movie_title'] == 'Interstellar') & (user_ratings['rating_val'] == 10)]

# Filter users who have rated La La Land 10/10
la_la_land_10 = user_ratings[(user_ratings['movie_title'] == 'La La Land') & (user_ratings['rating_val'] == 10)]

about_time_10 = user_ratings[(user_ratings['movie_title'] == 'About Time') & (user_ratings['rating_val'] == 10)]

# Get user_ids who rated Interstellar 10/10
interstellar_users_10 = interstellar_10['user_id']

# Get user_ids who rated La La Land 10/10
la_la_land_users_10 = la_la_land_10['user_id']

# Find common users who rated both movies 10/10
common_users_10 = interstellar_users_10[interstellar_users_10.isin(la_la_land_users_10)]
common_users_10 = common_users_10[common_users_10.isin(about_time_10['user_id'])]

print(common_users_10)

161739            niceguys
216331            mskiesha
265022               milez
461751            ianjonas
726352      kennedylizrose
                 ...      
9893907            jnawsty
10604879           cami123
10663692           erfrost
10735452            joelca
10821098             etiam
Name: user_id, Length: 67, dtype: object
