<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/IMDb_Story_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the IMDb TSV files can be read.
drive.mount('/content/drive')

# Drive folder that holds the IMDb dataset files. Later cells build file paths
# by plain string concatenation (imdb_path + "<file>.tsv"), so the trailing
# slash is required.
imdb_path = "/content/drive/MyDrive/"

Mounted at /content/drive


In [2]:
import csv

import pandas as pd

# People whose credited titles are scored in the rest of the notebook.
people_of_interest = [
    "Rick Jaffa", "Amanda Silver", "Jennifer Lee", "Brenda Chapman", "James Cameron",
    "Steve Ditko", "J.R.R. Tolkien", "J.K. Rowling", "Stan Lee", "Jack Kirby",
    "Gary Scott Thompson", "Andrew Stanton", "Bob Peterson", "Rich Moore", "Jim Reardon",
    "Robert Wade", "Neal Purvis", "Suzanne Collins", "David S. Goyer", "Larry Lieber"
]

# Load only the id and name columns of name.basics (the file is read in a single
# pass, not in chunks). The IMDb exports are plain tab-separated text with no
# quoting convention, so quote handling is switched off: with the default
# setting a field containing a stray '"' can make the parser merge fields/rows.
name_file = imdb_path + "name.basics.tsv"
name_df = pd.read_csv(
    name_file,
    sep="\t",
    usecols=["nconst", "primaryName"],
    dtype=str,
    quoting=csv.QUOTE_NONE,
)

# Keep the rows whose name matches one of the targets exactly and collect their ids.
# NOTE(review): matching is by name only; if several IMDb people share one of
# these names, all of them are selected and will be pooled under that name in
# the final averages -- confirm, or pin explicit nconst ids instead.
selected_people = name_df[name_df["primaryName"].isin(people_of_interest)]
nconst_set = set(selected_people["nconst"])

In [5]:
# Stream title.principals line by line (rather than loading the whole file) and
# keep only the writing credits that belong to the selected people.
principal_file = imdb_path + "title.principals.tsv"
# NOTE(review): "story" is carried over from the original filter; confirm it is
# a value that actually occurs in the `category` column (it may only ever
# appear in the `job` column, in which case it never matches here).
wanted_categories = {"writer", "story"}
filtered_rows = []

# Explicit encoding so the result does not depend on the platform default.
with open(principal_file, 'rt', encoding="utf-8") as f:
    # Strip only the line terminator: a bare strip() would also eat leading
    # whitespace and trailing tabs, shifting or dropping empty fields.
    header = next(f).rstrip("\r\n").split('\t')
    # Locate the columns by name instead of by hard-coded position; a missing
    # column raises ValueError here rather than silently reading the wrong field.
    nconst_idx = header.index("nconst")
    category_idx = header.index("category")
    min_fields = max(nconst_idx, category_idx) + 1
    for line in f:
        parts = line.rstrip("\r\n").split('\t')
        if len(parts) < min_fields:
            # Blank or truncated line: nothing to match on.
            continue
        nconst, category = parts[nconst_idx], parts[category_idx]
        if nconst in nconst_set and category in wanted_categories:
            filtered_rows.append(dict(zip(header, parts)))

# Passing the header keeps the expected columns even when nothing matched, so
# the later merge on "tconst" does not fail on a column-less frame.
story_df = pd.DataFrame(filtered_rows, columns=header)

In [6]:
# Read the ratings table, keeping just the title id and its average rating.
# The same mapping drives both the column selection and the dtypes.
ratings_columns = {"tconst": str, "averageRating": float}
ratings_file = imdb_path + "title.ratings.tsv"
ratings_df = pd.read_csv(
    ratings_file,
    sep="\t",
    usecols=list(ratings_columns),
    dtype=ratings_columns,
)

In [7]:
story_with_ratings = (
    story_df
    # Attach each credited title's rating; the inner join drops titles that
    # have no row in the ratings table.
    .merge(ratings_df, on="tconst", how="inner")
    # If a person has more than one matching credit row for the same title,
    # keep a single (title, person) pair so that title is not weighted more
    # than once in the per-person mean computed later.
    .drop_duplicates(subset=["tconst", "nconst"])
    # Add the display name for each nconst.
    .merge(selected_people, on="nconst", how="left")
)

In [8]:
# Load only the id and type columns of title.basics. Quote handling is disabled
# because the file is plain tab-separated text: with the default quoting, a
# title that starts with '"' opens a quoted field and the parser can swallow
# the following fields and rows, silently losing titles.
title_file = imdb_path + "title.basics.tsv"
movie_ids = pd.read_csv(
    title_file,
    sep="\t",
    usecols=["tconst", "titleType"],
    dtype=str,
    quoting=csv.QUOTE_NONE,
)
# Keep the ids of feature films only.
movie_ids = movie_ids.loc[movie_ids["titleType"] == "movie", ["tconst"]]

# Filter story titles to movies only (inner join on the movie id list).
story_with_ratings = story_with_ratings.merge(movie_ids, on="tconst", how="inner")

In [9]:
# Mean rating of the retained titles for each person, highest first.
ratings_by_person = story_with_ratings.groupby("primaryName")["averageRating"]
mean_rating = ratings_by_person.mean()
ranked_rating = mean_rating.sort_values(ascending=False)

# Turn the name index back into a column so the result is a two-column table.
average_scores = ranked_rating.reset_index()

print(average_scores)

            primaryName  averageRating
0           Jim Reardon       7.775000
1        Andrew Stanton       7.744444
2          Bob Peterson       7.733333
3            Rich Moore       7.566667
4        Brenda Chapman       7.550000
5          J.K. Rowling       7.409091
6         James Cameron       6.993333
7        J.R.R. Tolkien       6.975000
8            Rick Jaffa       6.966667
9         Amanda Silver       6.940000
10          Steve Ditko       6.927273
11         Larry Lieber       6.750000
12           Jack Kirby       6.725000
13          Neal Purvis       6.684615
14          Robert Wade       6.684615
15             Stan Lee       6.618750
16      Suzanne Collins       6.550000
17         Jennifer Lee       6.500000
18       David S. Goyer       6.322222
19  Gary Scott Thompson       5.887500
