<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/Director_IMDb_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os

# Update this path if your folder is named differently
imdb_path = "/content/drive/MyDrive/"

files = {
    'name_basics': os.path.join(imdb_path, 'name.basics.tsv'),
    'title_crew': os.path.join(imdb_path, 'title.crew.tsv'),
    'title_basics': os.path.join(imdb_path, 'title.basics.tsv'),
    'title_ratings': os.path.join(imdb_path, 'title.ratings.tsv')
}

In [6]:
import pandas as pd

target_directors = [
    "Kyle Balda", "Chris Buck", "Anthony Russo", "Pierre Coffin",
    "Joss Whedon", "Chris Renaud", "Joe Russo", "J.J. Abrams",
    "Jon Watts", "David Yates", "James Cameron", "Lee Unkrich"
]

# Only load the columns we need
name_df = pd.read_csv(files['name_basics'], sep='\t', usecols=['nconst', 'primaryName'], na_values='\\N', dtype=str)
directors_df = name_df[name_df['primaryName'].isin(target_directors)]

In [7]:
from tqdm import tqdm

def get_titles_for_directors_in_chunks(crew_path, director_ids, chunk_size=100000):
    result_rows = []
    chunks = pd.read_csv(crew_path, sep='\t', na_values='\\N', dtype=str, chunksize=chunk_size)

    for chunk in tqdm(chunks, desc="Processing title.crew.tsv"):
        chunk = chunk.dropna(subset=['directors'])
        chunk['directors'] = chunk['directors'].str.split(',')
        exploded = chunk.explode('directors')
        filtered = exploded[exploded['directors'].isin(director_ids)]
        result_rows.append(filtered[['tconst', 'directors']])

    return pd.concat(result_rows, ignore_index=True)

title_directors_df = get_titles_for_directors_in_chunks(files['title_crew'], directors_df['nconst'].tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['directors'] = chunk['directors'].str.split(',')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['directors'] = chunk['directors'].str.split(',')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['directors'] = chunk['directors'].str.split(',')
A value is trying to be set on a copy of

In [8]:
def get_movies_by_ids(basics_path, tconst_list, chunk_size=100000):
    matched = []
    chunks = pd.read_csv(basics_path, sep='\t', usecols=['tconst', 'titleType'], na_values='\\N', dtype=str, chunksize=chunk_size)

    for chunk in tqdm(chunks, desc="Filtering title.basics.tsv"):
        movies = chunk[chunk['titleType'] == 'movie']
        filtered = movies[movies['tconst'].isin(tconst_list)]
        matched.append(filtered)

    return pd.concat(matched, ignore_index=True)

movie_titles_df = get_movies_by_ids(files['title_basics'], title_directors_df['tconst'].unique().tolist())

Filtering title.basics.tsv: 117it [00:39,  2.95it/s]


In [9]:
ratings_df = pd.read_csv(files['title_ratings'], sep='\t', na_values='\\N', dtype={'tconst': str, 'averageRating': float, 'numVotes': int})

# Join: keep only rated movies directed by our targets
merged = title_directors_df.merge(directors_df, left_on='directors', right_on='nconst') \
                           .merge(movie_titles_df, on='tconst') \
                           .merge(ratings_df, on='tconst')

In [10]:
# Average IMDb rating per director
average_ratings = merged.groupby('primaryName')['averageRating'].mean().reset_index()
average_ratings = average_ratings.sort_values(by='averageRating', ascending=False)

print(average_ratings)

      primaryName  averageRating
10    Lee Unkrich       8.180000
8     Joss Whedon       7.525000
5   James Cameron       7.281818
7       Jon Watts       7.137500
3     David Yates       7.063636
4     J.J. Abrams       6.942857
0   Anthony Russo       6.760000
11  Pierre Coffin       6.750000
2    Chris Renaud       6.614286
6       Joe Russo       6.500000
9      Kyle Balda       6.400000
1      Chris Buck       6.100000
