In [1]:
import pandas as pd

In [2]:
# Read the raw movie export. No dtypes are declared up front; low_memory=False asks
# pandas not to process the file in chunks, and the types are fixed up explicitly below.
df = pd.read_csv('raw_movie_data.csv', low_memory=False)

# Columns that should hold numbers. errors='coerce' turns any value that cannot be
# parsed into NaN instead of raising.
numeric_columns = [
    'Year',
    'Duration',
    'Budget',
    'Revenue',
    'Ratings',
    'Vote Count',
    'Popularity',
    'Movie ID',
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Cast each free-text column to pandas' dedicated 'string' dtype
text_columns = [
    'Title',
    'Director',
    'Producer',
    'Genres',
    'Summary',
    'Content Rating',
    'Original Language',
    'Production Companies',
    'Production Countries',
    'Spoken Languages',
    'Tagline',
]
for column in text_columns:
    df[column] = df[column].astype('string')

# Convert 'Adult' to a real boolean column (source values may look like
# "True"/"False" or 0/1).
# A plain .astype(bool) only tests truthiness: every non-empty string -- including
# "False" and "0" -- and every NaN would come out as True. Instead, normalise each
# value to lower-case text and accept only explicit "true" spellings.
# Missing or unrecognised values are treated as False (not adult), which keeps the
# column as plain `bool` dtype.
adult_text = df['Adult'].astype('string').str.strip().str.lower()
df['Adult'] = adult_text.isin(['true', '1', '1.0'])

In [3]:
# Render floats in fixed-point form with thousands separators and two decimals
# (e.g. 1,234.50) rather than scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)

In [4]:
# Show the dtype of every column to check the result of the conversions in the loading cell
df.dtypes

Title                   string[python]
Year                           float64
Director                string[python]
Producer                string[python]
Genres                  string[python]
Summary                 string[python]
Duration                       float64
Budget                         float64
Revenue                        float64
Ratings                        float64
Vote Count                     float64
Popularity                     float64
Content Rating          string[python]
Original Language       string[python]
Production Companies    string[python]
Production Countries    string[python]
Spoken Languages        string[python]
Tagline                 string[python]
Adult                             bool
Movie ID                       float64
dtype: object

In [5]:
# Row count before dropping films without genres
pre_genre_clean = len(df)

# Keep only films whose 'Genres' value is present and is not the empty string
has_genres = df['Genres'].notna() & df['Genres'].ne('')
df = df[has_genres]

# Row count after the drop
post_genre_clean = len(df)

# Report how many films were discarded
removed_no_genre_count = pre_genre_clean - post_genre_clean
print(f"Removed {removed_no_genre_count} films with no genres.")

Removed 1519 films with no genres.


In [6]:
def split_genres(value):
    """Return the genres contained in ``value`` as a list of names.

    * ``str``  -- split on commas, strip surrounding whitespace from each piece and
      drop pieces that are empty (e.g. from a trailing or doubled comma).
    * ``list`` -- returned unchanged, so re-running this cell after 'Genres' has
      already been split does not replace every row with an empty list.
    * anything else (such as a missing value) -- an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return [name.strip() for name in value.split(',') if name.strip()]
    return []


# Turn the comma-separated 'Genres' text into a list of genre names per film
# (a film with a single genre becomes a one-element list)
df['Genres'] = df['Genres'].apply(split_genres)

# Genre combination that this notebook treats as the signature of stand-up comedy
standup_genres = ['Comedy', 'Documentary']


def is_standup(genres):
    """Return True when ``genres`` is exactly the stand-up genre combination.

    Both sides are sorted before being compared, so the order in which the genres
    are listed does not matter; any extra, missing or repeated genre yields False.
    """
    actual = sorted(genres)
    expected = sorted(standup_genres)
    return actual == expected

# Row count before the stand-up filter
pre_genre_clean = len(df)

# Flag every film whose genre list matches the stand-up combination,
# then keep only the films that are not flagged
standup_mask = df['Genres'].apply(is_standup)
df = df[~standup_mask]

# Row count after the stand-up filter
post_genre_clean = len(df)

# Report how many films were discarded
removed_standup_count = pre_genre_clean - post_genre_clean
print(f"Removed {removed_standup_count} thought as stand-up comedy.")

Removed 136 thought as stand-up comedy.


In [7]:
# Row count before dropping films that lack a director or a producer
pre_nan_clean = len(df)

# Drop every film where 'Director' or 'Producer' is missing
df = df.dropna(subset=['Director', 'Producer'])

# Row count after the drop
post_nan_clean = len(df)

# Report how many films were discarded
removed_dir_or_pro = pre_nan_clean - post_nan_clean
print(f"Removed {removed_dir_or_pro} as they had no director or producer.")

Removed 11435 as they had no director or producer.


In [8]:
# Step 1: tally how many films each director has in the cleaned data
director_counts = df['Director'].value_counts()

# Step 2: keep only the directors credited on five or more films
directors_with_5_plus_movies = director_counts.loc[lambda counts: counts >= 5].index

# Step 3: restrict the data to those directors (despite its name, `average_ratings`
# is the filtered frame), then average 'Ratings' per director, highest first
average_ratings = df[df['Director'].isin(directors_with_5_plus_movies)]
top_directors = (
    average_ratings
    .groupby('Director')['Ratings']
    .mean()
    .sort_values(ascending=False)
)

# Show the ten best-rated directors among those with 5+ film credits
print(top_directors.head(10))

Director
YAGOO               9.72
Park Jun-soo        8.40
David Mallet        8.09
Stanley Kubrick     8.01
Hayao Miyazaki      7.92
Paul Dugdale        7.91
Christopher Nolan   7.86
Quentin Tarantino   7.84
Makoto Shinkai      7.79
Akira Kurosawa      7.73
Name: Ratings, dtype: float64
