<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/IMDb_Producer_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# STEP 1: Make Google Drive visible to the Colab runtime
from google.colab import drive

# Directory inside the Colab VM at which Drive gets attached
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# STEP 2: Import libraries and configure the data location
import pandas as pd

# Folder holding the IMDb .tsv files; change this path to wherever you've
# stored them. Keep the trailing slash: the loading cells build file paths by
# plain string concatenation (e.g. BASE_PATH + "name.basics.tsv").
BASE_PATH = "/content/drive/MyDrive/"

In [4]:
# STEP 3: Load only the necessary columns to reduce memory usage

# List of target producers. Matching below is an exact, case-sensitive
# comparison against IMDb's primaryName, so spelling must agree with IMDb.
producer_names = [
    "Jon Landau", "Mark Nielsen", "Jon Favreau", "Jonas Rivera", "J.K. Rowling",
    "Kevin Feige", "Zane Weiner", "James Cameron", "Janet Healy", "Tom DeSanto",
    "Darla Anderson", "Bryan Burk", "Carolynne Cunningham", "Christopher McQuarrie",
    "Christopher Nolan", "Michael Fottrell", "Simon Emanuel", "Melissa Cobb"
]

# name.basics.tsv - to get nconst for each name
names = pd.read_csv(
    BASE_PATH + "name.basics.tsv",
    sep="\t",
    usecols=["nconst", "primaryName"],
    dtype=str,
    na_values="\\N"
)
matched_names = names[names['primaryName'].isin(producer_names)]

# A requested name with no exact match would otherwise vanish from every later
# step without any signal, so report it here.
unmatched_names = sorted(set(producer_names) - set(matched_names["primaryName"]))
if unmatched_names:
    print(f"WARNING: no exact primaryName match in name.basics.tsv for: {unmatched_names}")

# The lookup is by display name, not by nconst, so one requested name can match
# several different people. Report those names so pooled results are not a surprise.
nconsts_per_name = matched_names["primaryName"].value_counts()
shared_names = nconsts_per_name[nconsts_per_name > 1]
if not shared_names.empty:
    print("NOTE: these names match more than one nconst (count shown):")
    print(shared_names.to_string())

In [5]:
# STEP 4: Stream title.principals.tsv and keep producer credits for our people

# Set chunk size (you can tune this if needed)
CHUNK_SIZE = 100_000

# Keep track of rows that match our producers
filtered_producer_rows = []

# Convert matched nconsts to a set for fast lookup
target_nconsts = set(matched_names['nconst'])

# Path to principals file
principals_path = BASE_PATH + "title.principals.tsv"

# Read in chunks so the whole file never has to fit in memory at once
with pd.read_csv(
    principals_path,
    sep="\t",
    usecols=["tconst", "nconst", "category"],
    dtype=str,
    na_values="\\N",
    chunksize=CHUNK_SIZE
) as reader:
    for chunk in reader:
        # Keep rows for a target nconst AND whose category is "producer"
        # (case-insensitive; a missing category compares as False)
        is_target = chunk["nconst"].isin(target_nconsts)
        is_producer = chunk["category"].str.lower() == "producer"
        filtered = chunk[is_target & is_producer]
        # Most chunks contain no match; don't accumulate empty frames
        if not filtered.empty:
            filtered_producer_rows.append(filtered)

# Concatenate all filtered rows. pd.concat raises on an empty list, so fall
# back to an empty frame with the same columns when nothing matched.
if filtered_producer_rows:
    producers = pd.concat(filtered_producer_rows, ignore_index=True)
else:
    producers = pd.DataFrame(columns=["tconst", "nconst", "category"], dtype=str)

# Rows in title.principals are keyed by (tconst, ordering), so the same person
# may be listed more than once for one title. Keep a single row per
# (title, person) so a film is not counted twice in the per-producer average.
producers = (
    producers
    .drop_duplicates(subset=["tconst", "nconst"])
    .reset_index(drop=True)
)

In [6]:
# STEP 5: Load title.basics.tsv and filter to movies only

# Read just the two columns needed, then keep rows whose titleType is exactly
# "movie" in the same expression so the unfiltered table is never bound to a name.
titles = pd.read_csv(
    BASE_PATH + "title.basics.tsv",
    sep="\t",
    usecols=["tconst", "titleType"],
    dtype=str,
    na_values="\\N",
).loc[lambda basics: basics["titleType"] == "movie"]

In [7]:
# STEP 6: Join producers with movies
# Inner join on tconst: producer credits with no matching row in `titles`
# (which holds movies only) are dropped.
producer_movies = pd.merge(producers, titles, how="inner", on="tconst")

In [8]:
# STEP 7: Load title.ratings.tsv and join ratings
ratings = pd.read_csv(
    BASE_PATH + "title.ratings.tsv",
    sep="\t",
    usecols=["tconst", "averageRating"],
    # Parse the rating as a number up front so it can be averaged later
    dtype={"tconst": str, "averageRating": float},
    na_values="\\N",
)

# Inner join on tconst: movies without a ratings row are dropped here
producer_movies_rated = pd.merge(
    producer_movies,
    ratings,
    how="inner",
    on="tconst",
)

In [9]:
# STEP 8: Add back producer names and calculate average scores
producer_movies_named = pd.merge(
    producer_movies_rated,
    matched_names,
    how="inner",
    on="nconst",
)

# Mean rating per name, highest first.
# NOTE: the grouping key is primaryName, so if several nconsts share a name
# their movies are averaged together.
average_scores = (
    producer_movies_named
    .groupby("primaryName")["averageRating"]
    .mean()
    .reset_index()
    .sort_values(by="averageRating", ascending=False)
)

# Display
print(average_scores)

              primaryName  averageRating
3       Christopher Nolan       8.130000
9            Jonas Rivera       8.000000
11           Mark Nielsen       7.550000
1    Carolynne Cunningham       7.450000
16            Zane Weiner       7.450000
2   Christopher McQuarrie       7.366667
12           Melissa Cobb       7.333333
5           James Cameron       7.200000
10            Kevin Feige       7.188889
4            J.K. Rowling       7.140000
8              Jon Landau       7.133333
14          Simon Emanuel       7.066667
0              Bryan Burk       6.977778
7             Jon Favreau       6.925000
6             Janet Healy       6.683333
13       Michael Fottrell       6.642857
15            Tom DeSanto       6.300000
