<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/IMDb_Writer_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

folder_path = '/content/drive/MyDrive/'

# File paths
basics_path = folder_path + 'title.basics.tsv'
ratings_path = folder_path + 'title.ratings.tsv'
crew_path = folder_path + 'title.crew.tsv'
names_path = folder_path + 'name.basics.tsv'

Mounted at /content/drive


In [2]:
import pandas as pd

# Load and filter for movies
basics = pd.read_csv(basics_path, sep='\t', dtype=str, na_values='\\N')
movies = basics[basics['titleType'] == 'movie']

# Load ratings
ratings = pd.read_csv(ratings_path, sep='\t', dtype={'averageRating': float, 'numVotes': int}, na_values='\\N')

# Merge to get movies with ratings
movies_with_ratings = pd.merge(movies, ratings, on='tconst')

# Optional: filter out movies with low vote counts
movies_with_ratings = movies_with_ratings[movies_with_ratings['numVotes'] >= 30000]

In [3]:
# Load crew data
crew = pd.read_csv(crew_path, sep='\t', dtype=str, na_values='\\N')

# Merge to get writers
movies_with_writers = pd.merge(movies_with_ratings, crew[['tconst', 'writers']], on='tconst')
movies_with_writers = movies_with_writers.dropna(subset=['writers'])

# Expand multiple writers into separate rows
movies_with_writers['writers'] = movies_with_writers['writers'].str.split(',')
movies_with_writers = movies_with_writers.explode('writers')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_with_writers['writers'] = movies_with_writers['writers'].str.split(',')


In [4]:
# Load name data
names = pd.read_csv(names_path, sep='\t', dtype=str, na_values='\\N')

# Merge to get writer names
writers_named = pd.merge(movies_with_writers, names[['nconst', 'primaryName']], left_on='writers', right_on='nconst')

In [5]:
# Group by writer name
writer_stats = writers_named.groupby('primaryName').agg(
    avg_rating=('averageRating', 'mean'),
    num_movies=('tconst', 'count')
).reset_index()

# Filter for writers with at least 3 credited movies
writer_stats = writer_stats[writer_stats['num_movies'] >= 3]

# Sort and get top 20
top_writers = writer_stats.sort_values(by='avg_rating', ascending=False).head(50)

top_writers

Unnamed: 0,primaryName,avg_rating,num_movies
3944,Jonathan Nolan,8.6,5
6650,Sanjay Upadhyay,8.466667,3
6455,Ron Fricke,8.366667,3
7632,Vidhu Vinod Chopra,8.3,4
3408,Jeethu Joseph,8.3,4
1863,Denis Villeneuve,8.266667,3
5249,Michael Wilson,8.25,4
991,Bráulio Mantovani,8.2,3
1132,Charles Chaplin,8.185714,7
1320,Christopher Nolan,8.158333,12
