In [2]:
import pandas as pd

In [5]:
# Load the CSV file with low_memory=False and without specifying dtypes initially
df = pd.read_csv('raw_movie_data.csv', low_memory=False)

# Convert relevant columns to numeric, coercing errors into NaN
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Budget'] = pd.to_numeric(df['Budget'], errors='coerce')
df['Revenue'] = pd.to_numeric(df['Revenue'], errors='coerce')
df['Ratings'] = pd.to_numeric(df['Ratings'], errors='coerce')
df['Vote Count'] = pd.to_numeric(df['Vote Count'], errors='coerce')
df['Popularity'] = pd.to_numeric(df['Popularity'], errors='coerce')
df['Movie ID'] = pd.to_numeric(df['Movie ID'], errors='coerce')

# Convert 'Adult' column to boolean (it may have values like "True/False" or 0/1)
df['Adult'] = df['Adult'].astype(bool)

In [6]:
df.head()

Unnamed: 0,Title,Year,Director,Producer,Genres,Summary,Duration,Budget,Revenue,Ratings,Vote Count,Popularity,Content Rating,Original Language,Production Companies,Production Countries,Spoken Languages,Tagline,Adult,Movie ID
0,The Quest for Tom Sawyer's Gold,2023.0,Kirk Harris,Jerome Reygner-Kalfon,"Adventure, Family",Agatha Armstrong and her trusty sidekick Mrs. ...,88.0,0.0,107398.0,4.65,10.0,363.302,PG,en,"Ace Entertainment Films, Grindstone Entertainm...",United States of America,English,The enigma of the lost treasure.,True,966576.0
1,The Super Mario Bros. Movie,2023.0,Aaron Horvath,Chris Meledandri,"Animation, Family, Adventure, Fantasy, Comedy","While working underground to fix a water main,...",93.0,100000000.0,1362000000.0,7.654,8932.0,313.758,G,en,"Universal Pictures, Illumination, Nintendo","Japan, United States of America",English,Not all heroes wear capes. Some wear overalls.,True,502356.0
2,Transformers: Rise of the Beasts,2023.0,Steven Caple Jr.,Mark Vahradian,"Science Fiction, Adventure, Action",When a new threat capable of destroying the en...,127.0,195000000.0,439000000.0,7.307,4484.0,267.861,12A,en,"Skydance Media, Paramount Pictures, di Bonaven...",United States of America,", Espa√±ol, English",Unite or fall.,True,667538.0
3,Mayhem!,2023.0,Xavier Gens,Vincent Roget,"Action, Crime, Drama",Sam is a professional boxer about to get relea...,99.0,4920000.0,1500000.0,6.7,153.0,266.5,R,fr,"France 2 Cin√©ma, Same Player, The Ink Connect...",France,"English, Fran√ßais, ‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢",This ends one way.,True,959092.0
4,Migration,2023.0,Benjamin Renner,Chris Meledandri,"Animation, Action, Adventure, Comedy, Family",After a migrating duck family alights on their...,83.0,72000000.0,298776100.0,7.446,1712.0,250.783,U,en,"Universal Pictures, Illumination",United States of America,English,Odd ducks welcome.,True,940551.0


In [8]:
df.dtypes

Title                    object
Year                    float64
Director                 object
Producer                 object
Genres                   object
Summary                  object
Duration                float64
Budget                  float64
Revenue                 float64
Ratings                 float64
Vote Count              float64
Popularity              float64
Content Rating           object
Original Language        object
Production Companies     object
Production Countries     object
Spoken Languages         object
Tagline                  object
Adult                      bool
Movie ID                float64
dtype: object

In [9]:
# Convert all object columns to string type
df['Title'] = df['Title'].astype('string')
df['Director'] = df['Director'].astype('string')
df['Producer'] = df['Producer'].astype('string')
df['Genres'] = df['Genres'].astype('string')
df['Summary'] = df['Summary'].astype('string')
df['Content Rating'] = df['Content Rating'].astype('string')
df['Original Language'] = df['Original Language'].astype('string')
df['Production Companies'] = df['Production Companies'].astype('string')
df['Production Countries'] = df['Production Countries'].astype('string')
df['Spoken Languages'] = df['Spoken Languages'].astype('string')
df['Tagline'] = df['Tagline'].astype('string')

In [11]:
df.dtypes

Title                   string[python]
Year                           float64
Director                string[python]
Producer                string[python]
Genres                  string[python]
Summary                 string[python]
Duration                       float64
Budget                         float64
Revenue                        float64
Ratings                        float64
Vote Count                     float64
Popularity                     float64
Content Rating          string[python]
Original Language       string[python]
Production Companies    string[python]
Production Countries    string[python]
Spoken Languages        string[python]
Tagline                 string[python]
Adult                             bool
Movie ID                       float64
dtype: object

In [17]:
# Disable scientific notation for floats
pd.options.display.float_format = '{:,.2f}'.format

In [18]:
# Count the number of movies for each director
director_counts = df['Director'].value_counts()

# Filter directors with at least 5 movie credits
directors_with_5_credits = director_counts[director_counts >= 5].index

# Filter the DataFrame to include only movies from directors with 5+ credits
df_directors_filtered = df[df['Director'].isin(directors_with_5_credits)]

In [19]:
# Count the number of movies for each producer
producer_counts = df['Producer'].value_counts()

# Filter producers with at least 5 movie credits
producers_with_5_credits = producer_counts[producer_counts >= 5].index

# Filter the DataFrame to include only movies from producers with 5+ credits
df_producers_filtered = df[df['Producer'].isin(producers_with_5_credits)]


In [20]:
# Average rating for directors with at least 5 movies
top_directors = df_directors_filtered.groupby('Director')['Ratings'].mean().sort_values(ascending=False)
print(top_directors.head(10))  # Top 10 directors by average rating with 5+ movie credits

Director
Tony Oppedisano     10.00
YAGOO                9.72
Ken Brewer           9.10
Park Jun-soo         8.63
Eric Notarnicola     8.38
Andy Mikita          8.13
Stanley Kubrick      8.01
Seth Breedlove       8.00
Hayao Miyazaki       7.92
Christopher Nolan    7.86
Name: Ratings, dtype: float64


In [21]:
# Total revenue for producers with at least 5 movies
top_producers = df_producers_filtered.groupby('Producer')['Revenue'].sum().sort_values(ascending=False)
print(top_producers.head(10))  # Top 10 producers by total revenue with 5+ movie credits

Producer
Kevin Feige         25,194,584,521.00
Jerry Bruckheimer    9,944,401,049.00
David Heyman         7,679,766,836.00
Chris Meledandri     7,240,074,144.00
Neal H. Moritz       6,989,542,060.00
James Cameron        6,129,909,250.00
Frank Marshall       5,632,186,229.00
Christopher Nolan    5,370,972,156.00
Barbara Broccoli     4,871,883,560.00
Kathleen Kennedy     4,469,607,547.00
Name: Revenue, dtype: float64
