<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/IMDb_Director_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

# Replace this with your actual folder path
folder_path = '/content/drive/MyDrive/'

# IMDb dataset paths
basics_path = folder_path + 'title.basics.tsv'
ratings_path = folder_path + 'title.ratings.tsv'
crew_path = folder_path + 'title.crew.tsv'
names_path = folder_path + 'name.basics.tsv'

In [4]:
import os

# Check if the folder path exists
if not os.path.exists(folder_path):
    print(f"Error: The folder path '{folder_path}' does not exist. Please check the path.")
else:
    try:
        # Load title.basics and filter for movies
        basics = pd.read_csv(basics_path, sep='\t', dtype=str, na_values='\\N')
        movies = basics[basics['titleType'] == 'movie']

        # Load title.ratings
        ratings = pd.read_csv(ratings_path, sep='\t', dtype={'averageRating': float, 'numVotes': int}, na_values='\\N')

        # Merge movies with ratings
        movies_with_ratings = pd.merge(movies, ratings, on='tconst')

        # Optional: filter out low-vote movies
        movies_with_ratings = movies_with_ratings[movies_with_ratings['numVotes'] >= 30000]

    except FileNotFoundError as e:
        print(f"Error loading data: {e}. Please ensure the files exist at the specified paths.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [5]:
# Load crew data
crew = pd.read_csv(crew_path, sep='\t', dtype=str, na_values='\\N')

# Merge to get directors
movies_directed = pd.merge(movies_with_ratings, crew[['tconst', 'directors']], on='tconst')
movies_directed = movies_directed.dropna(subset=['directors'])

# Handle multiple directors
movies_directed['directors'] = movies_directed['directors'].str.split(',')
movies_directed = movies_directed.explode('directors')

In [6]:
# Load names
names = pd.read_csv(names_path, sep='\t', dtype=str, na_values='\\N')

# Merge to get director names
directors_named = pd.merge(movies_directed, names[['nconst', 'primaryName']], left_on='directors', right_on='nconst')

In [7]:
# Group by director and compute average rating
director_stats = directors_named.groupby('primaryName').agg(
    avg_rating=('averageRating', 'mean'),
    num_movies=('tconst', 'count')
).reset_index()

# Optional: only include directors with 3+ movies
director_stats = director_stats[director_stats['num_movies'] >= 3]

# Sort by average rating
top_directors = director_stats.sort_values(by='avg_rating', ascending=False).head(20)

# Display result
top_directors

Unnamed: 0,primaryName,avg_rating,num_movies
404,Charles Chaplin,8.3,6
2329,Sergio Leone,8.2,6
1530,Lee Unkrich,8.18,5
808,Frank Capra,8.175,4
828,Fritz Lang,8.166667,3
469,Christopher Nolan,8.166667,12
1984,Pete Docter,8.125,4
54,Akira Kurosawa,8.092308,13
602,David Lean,8.075,4
2387,Stanley Kubrick,8.063636,11
