<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/IMDb_Actor_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Step 2: Import necessary libraries
import pandas as pd
import gzip
import os
from tqdm import tqdm

In [3]:
# Step 3: Define file paths (adjust if your folder structure differs)
imdb_dir = "/content/drive/MyDrive/"  # Replace if different
csv_path = "/content/drive/MyDrive/Actors - Sheet1.csv"  # Adjust path if needed

# IMDb dataset files
name_basics_path = imdb_dir + "name.basics.tsv"
title_basics_path = imdb_dir + "title.basics.tsv"
title_principals_path = imdb_dir + "title.principals.tsv"
title_ratings_path = imdb_dir + "title.ratings.tsv"

In [4]:
# Step 4: Load actor names
actor_df = pd.read_csv(csv_path)
actor_names = set(actor_df['Name'].dropna().unique())

In [6]:
# Step 5: Load name.basics.tsv to get nconst IDs for actors
def load_nconsts_for_actors():
    nconsts = {}
    with open(name_basics_path, 'rt', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if parts[0] == 'nconst':
                continue
            nconst, name = parts[0], parts[1]
            if name in actor_names:
                nconsts[name] = nconst
    return nconsts

actor_nconsts = load_nconsts_for_actors()

In [7]:
# Step 6: Load title.principals.tsv and filter for actor roles
actor_roles = {'actor', 'actress'}

def get_actor_titles(nconst_list):
    actor_titles = {n: [] for n in nconst_list}
    with open(title_principals_path, 'rt', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if parts[0] == 'tconst':
                continue
            tconst, ordering, nconst, category = parts[0], parts[1], parts[2], parts[3]
            if nconst in actor_titles and category.lower() in actor_roles:
                actor_titles[nconst].append(tconst)
    return actor_titles

actor_title_map = get_actor_titles(set(actor_nconsts.values()))

In [8]:
# Step 7: Filter title.basics.tsv for feature films only
def get_feature_films():
    film_titles = set()
    with open(title_basics_path, 'rt', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if parts[0] == 'tconst':
                continue
            tconst, titleType = parts[0], parts[1]
            if titleType == 'movie':
                film_titles.add(tconst)
    return film_titles

feature_films = get_feature_films()

In [9]:
# Step 8: Load title.ratings.tsv into DataFrame
ratings_df = pd.read_csv(title_ratings_path, sep='\t')

In [10]:
# Step 9: Calculate average rating for each actor
actor_avg_scores = {}

for name, nconst in tqdm(actor_nconsts.items()):
    titles = actor_title_map.get(nconst, [])
    film_titles = [t for t in titles if t in feature_films]
    ratings = ratings_df[ratings_df['tconst'].isin(film_titles)]
    if not ratings.empty:
        avg_score = ratings['averageRating'].mean()
        actor_avg_scores[name] = avg_score
    else:
        actor_avg_scores[name] = None

100%|██████████| 136/136 [00:08<00:00, 16.70it/s]


In [11]:
# Step 10: Create result DataFrame and save to CSV
result_df = pd.DataFrame.from_dict(actor_avg_scores, orient='index', columns=['averageRating']).reset_index()
result_df.rename(columns={'index': 'Name'}, inplace=True)

# Save to Drive
result_df.to_csv("/content/drive/MyDrive/actor_avg_imdb_scores.csv", index=False)

In [12]:
display(result_df)

Unnamed: 0,Name,averageRating
0,Julie Andrews,
1,Anthony Daniels,
2,Suzy Amis,6.012500
3,Warwick Davis,
4,Joseph Mazzello,6.593333
...,...,...
131,Josh Cowdery,
132,Zendaya,7.300000
133,Winston Duke,6.750000
134,Antony Acheampong,6.050000
