Oscar Nominees Deep Dive Analysis

Problem Statement:
This script performs a comprehensive analysis of Oscar nominees and their movies in order to:
1. Identify winning patterns by actor and movie characteristics
2. Analyze genre performance across different award categories
3. Calculate age at nomination for demographic insights
4. Determine nomination frequency and win ratios

Enhanced Data Analysis:
- Movie success rate by nominee's primary genre
- Age when nominated (using birthday and nomination year)
- Win percentage by category type
- Most frequent movie genres among winners
- Actor nomination streaks across years

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, sum, year, date_diff, months_between, to_date, round

# Create the Spark session used by every DataFrame below, or reuse one that
# is already active.
spark = (
    SparkSession.builder
    .appName("OscarWinsAnalysis")
    .getOrCreate()
)

# Sample rows for nominee_information.
# Tuple layout: (name, amg_person_id, top_genre, birthday, id), with the
# birthday stored as a "yyyy-MM-dd" string.
nominee_data = [
    ("Jennifer Lawrence", "P562566", "Drama", "1990-08-15", 755),
    ("Jonah Hill", "P418718", "Comedy", "1983-12-20", 747),
    ("Anne Hathaway", "P292630", "Drama", "1982-11-12", 744),
    ("Jennifer Hudson", "P454405", "Drama", "1981-09-12", 742),
    ("Rinko Kikuchi", "P475244", "Drama", "1981-01-06", 739),
]

# Sample rows for oscar_nominees.
# Tuple layout: (year, category, nominee, movie, winner, id); winner is 1 for
# a win and 0 otherwise. The nominee string must match a nominee_information
# name exactly, because the analysis joins the two tables on that value.
oscar_data = [
    (2008, 'actress in a leading role', 'Anne Hathaway', 'Rachel Getting Married', 0, 77),
    # This row previously read ('Anne HathawayLes', 'Mis_rables'): the first
    # word of the title had slid into the nominee field, so the row never
    # matched its nominee record.
    (2012, 'actress in a supporting role', 'Anne Hathaway', 'Les Misérables', 1, 78),
    (2006, 'actress in a supporting role', 'Jennifer Hudson', 'Dreamgirls', 1, 711),
    (2010, 'actress in a leading role', 'Jennifer Lawrence', 'Winters Bone', 1, 717),
    (2012, 'actress in a leading role', 'Jennifer Lawrence', 'Silver Linings Playbook', 1, 718),
    (2011, 'actor in a supporting role', 'Jonah Hill', 'Moneyball', 0, 799),
    (2006, 'actress in a supporting role', 'Rinko Kikuchi', 'Babel', 0, 1253),
]

# Column names for nominee_information, in the same order as the values in
# each nominee_data tuple.
columns_nominee = [
    "name",
    "amg_person_id",
    "top_genre",
    "birthday",
    "id",
]

# Column names for oscar_nominees, in the same order as the values in each
# oscar_data tuple.
columns_oscar = [
    "year",
    "category",
    "nominee",
    "movie",
    "winner",
    "id",
]

# Build one DataFrame per sample table. Only column names are supplied, so
# the column types are left for Spark to infer from the Python values.
df_nominee = spark.createDataFrame(nominee_data, schema=columns_nominee)
df_oscar = spark.createDataFrame(oscar_data, schema=columns_oscar)

# Preview both inputs before running the analysis.
for _preview in (df_nominee, df_oscar):
    _preview.show()

# 1. Calculate age at nomination (in years)
# The left join keeps every nomination, even when no nominee row matches; the
# nominee columns (name, top_genre, birthday) are null for such rows.
nominations_with_profile = df_oscar.join(
    df_nominee, df_oscar.nominee == df_nominee.name, "left"
)

# Only the nomination year is known, so the date is parsed from the bare year.
# NOTE(review): this presumably resolves to January 1st of that year, making
# the age "as of the start of the nomination year" — confirm against Spark's
# datetime pattern documentation.
nomination_date = to_date(col("year").cast("string"), "yyyy")
birth_date = to_date(col("birthday"), "yyyy-MM-dd")

analysis_df = (
    nominations_with_profile
    .withColumn("nomination_year", year(nomination_date))
    .withColumn(
        "age_at_nomination",
        # Whole and fractional months between the two dates, expressed in
        # years and rounded to one decimal place.
        round(months_between(nomination_date, birth_date) / 12, 1),
    )
)

def _win_summary(df, group_col):
    """Summarise nominations and wins for each distinct value of *group_col*.

    ``count``, ``sum`` and ``round`` here are the ``pyspark.sql.functions``
    versions imported at the top of the script, not the Python builtins.

    Args:
        df: DataFrame holding one row per nomination, with a numeric
            ``winner`` column (1 for a win, 0 otherwise).
        group_col: Name of the column to group by.

    Returns:
        A DataFrame with the columns ``group_col``, ``total_nominations``,
        ``total_wins`` and ``win_percentage`` (wins per 100 nominations,
        rounded to two decimal places).
    """
    return df.groupBy(group_col).agg(
        count("*").alias("total_nominations"),
        sum("winner").alias("total_wins"),
        round(sum("winner") / count("*") * 100, 2).alias("win_percentage"),
    )


# 2. Win percentage by nominee
# NOTE: analysis_df comes from a left join, so nominations without a matching
# nominee row have a null name and are pooled into a single group.
win_stats = _win_summary(analysis_df, "name").orderBy(desc("total_wins"))

# 3. Genre performance analysis
genre_stats = _win_summary(analysis_df, "top_genre").orderBy(desc("win_percentage"))

# 4. Category performance analysis
category_stats = _win_summary(analysis_df, "category").orderBy(desc("total_nominations"))

# 5. Age performance analysis
age_stats = _win_summary(analysis_df, "age_at_nomination").orderBy(desc("age_at_nomination"))

# Show results
# Each entry pairs a heading with the DataFrame printed beneath it; the
# entries are listed in the order they are displayed.
result_sections = [
    (
        "\n1. Nominees Age analysis:",
        analysis_df.select("name", "birthday", "age_at_nomination"),
    ),
    ("\n2. Win percentage by Nominee:", win_stats),
    ("\n3. Genre performance analysis:", genre_stats),
    ("\n4. Category Statistics:", category_stats),
    ("\n5. Age performance analysis:", age_stats),
]
for section_heading, section_frame in result_sections:
    print(section_heading)
    section_frame.show()

# Stop the Spark session
spark.stop()