# Bussiness Questions

Bussiness Questions (BQ) answered with the generated tables: 


**BQ9.** Most profitable actors and directors.  

In [1]:
# Libraries used
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

In [2]:
# Create a Spark Session
spark = SparkSession.builder.appName('Challenge').getOrCreate()

# Read Parquet tables into Spark Data Frames

### t_profits

In [3]:
# Read Parquet file into Data Frame
t_profits = spark.read.option('header','true').parquet('master/t_profits.parquet')

# Manually define Schema
t_profits = t_profits.select(
    t_profits.movie_id.cast('string'),
    t_profits.title.cast('string'),
    t_profits.year.cast('string'),
    t_profits.box_office_usd.cast('int'),
    t_profits.budget_usd.cast('int'),
    t_profits.inflation_factor_2023.cast('double')
)

t_profits.printSchema()
t_profits.show(5)

root
 |-- movie_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- box_office_usd: integer (nullable = true)
 |-- budget_usd: integer (nullable = true)
 |-- inflation_factor_2023: double (nullable = true)

+--------+--------------------+----+--------------+----------+---------------------+
|movie_id|               title|year|box_office_usd|budget_usd|inflation_factor_2023|
+--------+--------------------+----+--------------+----------+---------------------+
|       1|The Shawshank Red...|1994|      28884504|  25000000|             2.030414|
|       2|       The Godfather|1972|     250341816|   6000000|             7.183692|
|       3|     The Dark Knight|2008|    1006234167| 185000000|             1.395895|
|       4|The Godfather Par...|1974|      47961919|  13000000|             6.105467|
|       5|        12 Angry Men|1957|           955|    350000|             10.64304|
+--------+--------------------+----+--------------+-------

In [4]:
# t_actors
t_actors = spark.read.parquet('master/t_actors.parquet', header=True)

t_actors.printSchema()
t_actors.show(5)

root
 |-- movie_id: long (nullable = true)
 |-- actor: string (nullable = true)

+--------+--------------+
|movie_id|         actor|
+--------+--------------+
|       1|   Tim Robbins|
|       1|Morgan Freeman|
|       1|    Bob Gunton|
|       1|William Sadler|
|       1|  Clancy Brown|
+--------+--------------+
only showing top 5 rows



In [5]:
# t_directors
t_directors = spark.read.parquet('master/t_directors.parquet', header=True)

t_directors.printSchema()
t_directors.show(5)

root
 |-- movie_id: long (nullable = true)
 |-- director: string (nullable = true)

+--------+--------------------+
|movie_id|            director|
+--------+--------------------+
|       1|      Frank Darabont|
|       2|Francis Ford Coppola|
|       3|   Christopher Nolan|
|       4|Francis Ford Coppola|
|       5|        Sidney Lumet|
+--------+--------------------+
only showing top 5 rows



In [6]:
# Create views of the tables to run SQL queries
t_profits.createOrReplaceTempView('t_profits') 
t_actors.createOrReplaceTempView('t_actors') 
t_directors.createOrReplaceTempView('t_directors')

## Join t_profits & t_actors.

In [7]:
profits_actors = spark.sql("""
    SELECT 
        p.movie_id,
        p.title,
        p.year,
        p.box_office_usd,
        p.budget_usd,
        p.inflation_factor_2023,
        a.actor
    FROM t_profits p
    INNER JOIN t_actors a
    ON p.movie_id = a.movie_id
""")

# Display
print(f'\n{profits_actors.count()} rows in total')
profits_actors.show(5)


3944 rows in total
+--------+--------------------+----+--------------+----------+---------------------+--------------+
|movie_id|               title|year|box_office_usd|budget_usd|inflation_factor_2023|         actor|
+--------+--------------------+----+--------------+----------+---------------------+--------------+
|       1|The Shawshank Red...|1994|      28884504|  25000000|             2.030414|   Tim Robbins|
|       1|The Shawshank Red...|1994|      28884504|  25000000|             2.030414|Morgan Freeman|
|       1|The Shawshank Red...|1994|      28884504|  25000000|             2.030414|    Bob Gunton|
|       1|The Shawshank Red...|1994|      28884504|  25000000|             2.030414|William Sadler|
|       1|The Shawshank Red...|1994|      28884504|  25000000|             2.030414|  Clancy Brown|
+--------+--------------------+----+--------------+----------+---------------------+--------------+
only showing top 5 rows



In [8]:
# Create temporary view of join to run queries
profits_actors.createOrReplaceTempView('profits_actors')  

## BQ9. [Part 1] Most profitable actors.
### By average profit, and at least 3 appearances in the Top-250 IMDb movies.

In [9]:
# Most profitable actors, by average profit and at least 3 appearances in the Top-250 IMDb movies
print('\n*In millions of dollars\n')
spark.sql("""
    SELECT 
        actor, 
        CAST((AVG(box_office_usd) - AVG(budget_usd)) / 1000000 AS DECIMAL(5,1)) AS avg_profit,
        CAST(AVG(box_office_usd) / 1000000 AS DECIMAL(5,1)) AS avg_box_office_usd,
        CAST(AVG(budget_usd) / 1000000 AS DECIMAL(5,1)) AS avg_budget_usd,
        COUNT(title) AS movie_count
    FROM profits_actors
    GROUP BY actor
    HAVING movie_count >= 3
    ORDER BY avg_profit DESC
""").show(10)


*In millions of dollars

+---------------+----------+------------------+--------------+-----------+
|          actor|avg_profit|avg_box_office_usd|avg_budget_usd|movie_count|
+---------------+----------+------------------+--------------+-----------+
| Cate Blanchett|     903.9|             997.5|          93.7|          3|
|     Sean Astin|     903.9|             997.5|          93.7|          3|
|     Billy Boyd|     903.9|             997.5|          93.7|          3|
|   Ian McKellen|     903.9|             997.5|          93.7|          3|
|Viggo Mortensen|     752.6|             828.6|          76.0|          4|
|    Elijah Wood|     691.4|             766.7|          75.3|          4|
|   Willem Dafoe|     686.3|             763.8|          77.5|          4|
|  Orlando Bloom|     677.4|             748.3|          70.9|          4|
|   Bob Peterson|     579.2|             723.9|         144.7|          3|
|   David Prowse|     575.8|             596.3|          20.5|          3|

### Conclusion 1
We can see that members from the cast of the Lord of the Rings saga dominate this ranking!

## Adjust amounts for inflation

In [10]:
# Most profitable actors, by average profit and at least 3 appearances in the Top-250 IMDb movies
print('\n*In millions of dollars')
profits_actors_i = spark.sql("""
    SELECT 
        movie_id,
        title,
        year,
        CAST(box_office_usd * inflation_factor_2023 AS int) AS box_office_usd_i,
        CAST(budget_usd * inflation_factor_2023 AS int) AS budget_usd_i,
        actor
    FROM profits_actors
""")
# Display
print(f'\n{profits_actors_i.count()} rows in total')
profits_actors_i.show(5)


*In millions of dollars

3944 rows in total
+--------+--------------------+----+----------------+------------+--------------+
|movie_id|               title|year|box_office_usd_i|budget_usd_i|         actor|
+--------+--------------------+----+----------------+------------+--------------+
|       1|The Shawshank Red...|1994|        58647501|    50760350|   Tim Robbins|
|       1|The Shawshank Red...|1994|        58647501|    50760350|Morgan Freeman|
|       1|The Shawshank Red...|1994|        58647501|    50760350|    Bob Gunton|
|       1|The Shawshank Red...|1994|        58647501|    50760350|William Sadler|
|       1|The Shawshank Red...|1994|        58647501|    50760350|  Clancy Brown|
+--------+--------------------+----+----------------+------------+--------------+
only showing top 5 rows



In [11]:
# Create temporary view of join to run queries
profits_actors_i.createOrReplaceTempView('profits_actors_i')  

## BQ9. [Part 2] Most profitable actors (adjusted for inflation)
### By average profit, and at least 3 appearances in the Top-250 IMDb movies.

In [12]:
# Most profitable actors, by average profit and at least 3 appearances in the Top-250 IMDb movies
print('\n*In millions of dollars\n')
spark.sql("""
    SELECT 
        actor, 
        CAST((AVG(box_office_usd_i) - AVG(budget_usd_i)) / 1000000 AS DECIMAL(5,1)) AS avg_profit_i,
        CAST(AVG(box_office_usd_i) / 1000000 AS DECIMAL(5,1)) AS avg_box_office_usd_i,
        CAST(AVG(budget_usd_i) / 1000000 AS DECIMAL(5,1)) AS avg_budget_usd_i,
        COUNT(title) AS movie_count
    FROM profits_actors_i
    GROUP BY actor
    HAVING movie_count >= 3
    ORDER BY avg_profit_i DESC
""").show(10)


*In millions of dollars

+---------------+------------+--------------------+----------------+-----------+
|          actor|avg_profit_i|avg_box_office_usd_i|avg_budget_usd_i|movie_count|
+---------------+------------+--------------------+----------------+-----------+
|   Peter Mayhew|      1777.9|              1850.8|            72.9|          3|
|   David Prowse|      1777.9|              1850.8|            72.9|          3|
|    Mark Hamill|      1777.9|              1850.8|            72.9|          3|
|  Carrie Fisher|      1777.9|              1850.8|            72.9|          3|
|Anthony Daniels|      1777.9|              1850.8|            72.9|          3|
|     Billy Boyd|      1503.9|              1660.0|           156.1|          3|
|     Sean Astin|      1503.9|              1660.0|           156.1|          3|
|   Ian McKellen|      1503.9|              1660.0|           156.1|          3|
| Cate Blanchett|      1503.9|              1660.0|           156.1|          3|
| 

### Conclusion 2
We can see that in contrast to the previous approach, now members from the cast of the Star Wars saga dominate the ranking! Followed by the members from the cast of the Lord of the Rings saga.

## Join t_profits & t_directors.

In [13]:
profits_directors = spark.sql("""
    SELECT 
        p.movie_id,
        p.title,
        p.year,
        p.box_office_usd,
        p.budget_usd,
        p.inflation_factor_2023,
        d.director
    FROM t_profits p
    INNER JOIN t_directors d
    ON p.movie_id = d.movie_id
""")

# Display
print(f'\n{profits_directors.count()} rows in total')
profits_directors.show(5)


251 rows in total
+--------+--------------------+----+--------------+----------+---------------------+--------------------+
|movie_id|               title|year|box_office_usd|budget_usd|inflation_factor_2023|            director|
+--------+--------------------+----+--------------+----------+---------------------+--------------------+
|       1|The Shawshank Red...|1994|      28884504|  25000000|             2.030414|      Frank Darabont|
|       2|       The Godfather|1972|     250341816|   6000000|             7.183692|Francis Ford Coppola|
|       3|     The Dark Knight|2008|    1006234167| 185000000|             1.395895|   Christopher Nolan|
|       4|The Godfather Par...|1974|      47961919|  13000000|             6.105467|Francis Ford Coppola|
|       5|        12 Angry Men|1957|           955|    350000|             10.64304|        Sidney Lumet|
+--------+--------------------+----+--------------+----------+---------------------+--------------------+
only showing top 5 rows



In [14]:
# Create temporary view of join to run queries
profits_directors.createOrReplaceTempView('profits_directors')  

## BQ9. [Part 3] Most profitable directors.
### By average profit, and at least 3 appearances in the Top-250 IMDb movies.

In [15]:
# Most profitable actors, by average profit and at least 3 appearances in the Top-250 IMDb movies
print('\n*In millions of dollars\n')
spark.sql("""
    SELECT 
        director, 
        CAST((AVG(box_office_usd) - AVG(budget_usd)) / 1000000 AS DECIMAL(5,1)) AS avg_profit,
        CAST(AVG(box_office_usd) / 1000000 AS DECIMAL(5,1)) AS avg_box_office_usd,
        CAST(AVG(budget_usd) / 1000000 AS DECIMAL(5,1)) AS avg_budget_usd,
        COUNT(title) AS movie_count
    FROM profits_directors
    GROUP BY director
    HAVING movie_count >= 3
    ORDER BY avg_profit DESC
""").show(10)


*In millions of dollars

+-----------------+----------+------------------+--------------+-----------+
|         director|avg_profit|avg_box_office_usd|avg_budget_usd|movie_count|
+-----------------+----------+------------------+--------------+-----------+
|    Peter Jackson|     903.9|             997.5|          93.7|          3|
|      Lee Unkrich|     671.0|             829.5|         158.5|          4|
|Christopher Nolan|     466.0|             603.0|         137.0|          7|
| Steven Spielberg|     435.6|             484.1|          48.5|          7|
|    James Cameron|     351.2|             423.7|          72.5|          3|
|      Pete Docter|     296.2|             410.0|         113.8|          3|
|   Hayao Miyazaki|     189.6|             207.7|          18.1|          4|
|Quentin Tarantino|     187.2|             229.1|          41.8|          5|
|   Clint Eastwood|     166.2|             190.6|          24.5|          3|
|     Ridley Scott|     165.4|             203.5| 

### Conclusion 3
Peter Jackson from the Lord of the Rings saga holds an impressive return of investment! Followed by animated film director Lee Unkrich.

## Adjust amounts for inflation

In [16]:
# Most profitable directors, by average profit and at least 3 appearances in the Top-250 IMDb movies
print('\n*In millions of dollars')
profits_directors_i = spark.sql("""
    SELECT 
        movie_id,
        title,
        year,
        CAST(box_office_usd * inflation_factor_2023 AS int) AS box_office_usd_i,
        CAST(budget_usd * inflation_factor_2023 AS int) AS budget_usd_i,
        director
    FROM profits_directors
""")

# Display
print(f'\n{profits_directors_i.count()} rows in total')
profits_directors_i.show(5)


*In millions of dollars

251 rows in total
+--------+--------------------+----+----------------+------------+--------------------+
|movie_id|               title|year|box_office_usd_i|budget_usd_i|            director|
+--------+--------------------+----+----------------+------------+--------------------+
|       1|The Shawshank Red...|1994|        58647501|    50760350|      Frank Darabont|
|       2|       The Godfather|1972|      1798378500|    43102152|Francis Ford Coppola|
|       3|     The Dark Knight|2008|      1404597242|   258240575|   Christopher Nolan|
|       4|The Godfather Par...|1974|       292829913|    79371071|Francis Ford Coppola|
|       5|        12 Angry Men|1957|           10164|     3725063|        Sidney Lumet|
+--------+--------------------+----+----------------+------------+--------------------+
only showing top 5 rows



In [17]:
# Create temporary view of join to run queries
profits_directors_i.createOrReplaceTempView('profits_directors_i')  

## BQ9. [Part 4] Most profitable directors (adjusted for inflation)
### By average profit, and at least 3 appearances in the Top-250 IMDb movies.

In [18]:
# Most profitable directors, by average profit and at least 3 appearances in the Top-250 IMDb movies
print('\n*In millions of dollars\n')
spark.sql("""
    SELECT 
        director, 
        CAST((AVG(box_office_usd_i) - AVG(budget_usd_i)) / 1000000 AS DECIMAL(5,1)) AS avg_profit_i,
        CAST(AVG(box_office_usd_i) / 1000000 AS DECIMAL(5,1)) AS avg_box_office_usd_i,
        CAST(AVG(budget_usd_i) / 1000000 AS DECIMAL(5,1)) AS avg_budget_usd_i,
        COUNT(title) AS movie_count
    FROM profits_directors_i
    GROUP BY director
    HAVING movie_count >= 3
    ORDER BY avg_profit_i DESC
""").show(10)


*In millions of dollars

+--------------------+------------+--------------------+----------------+-----------+
|            director|avg_profit_i|avg_box_office_usd_i|avg_budget_usd_i|movie_count|
+--------------------+------------+--------------------+----------------+-----------+
|       Peter Jackson|      1503.9|              1660.0|           156.1|          3|
|         Lee Unkrich|       952.0|              1169.0|           217.0|          4|
|    Steven Spielberg|       932.2|              1030.4|            98.2|          7|
|Francis Ford Coppola|       757.0|               841.1|            84.1|          3|
|   Christopher Nolan|       632.5|               820.8|           188.3|          7|
|       James Cameron|       626.7|               758.7|           132.1|          3|
|         Frank Capra|       515.8|               559.3|            43.5|          3|
|         Pete Docter|       406.0|               567.9|           161.9|          3|
|        Ridley Scott|      

### Conclusion 4
Peter Jackson from the Lord of the Rings saga still holds an impressive return of investment when adjusting figures for inflation! Increasing his lead with respect to animated film director Lee Unkrich.  

Notably, we can see a jump of Steven Spielberg in the ranking, which speaks about his historical influence in cinema! 

In [19]:
spark.stop()