# EDA

In [55]:
import duckdb
import pandas as pd
from pathlib import Path

# I use a path handling to show where the data and sql files are located
duckdb_path = "../data/sakila.duckdb"
sql_load_path = "../sql/load_sakila.sql"

# Removes existing datafile (starts fresh each time)
Path(duckdb_path).unlink(missing_ok=True)


with duckdb.connect(duckdb_path) as conn, open(sql_load_path, "r") as ingest_script:
    # Runs the sql script to load the sakila data into duckdb
    conn.execute(ingest_script.read())

    # loads each table into a pandas dataframe
    print("Loading tables:")
    film = conn.sql("SELECT * FROM film").df()
    actor = conn.sql("SELECT * FROM actor").df()
    film_actor = conn.sql("SELECT * FROM film_actor").df()
    category = conn.sql("SELECT * FROM category").df()
    film_category = conn.sql("SELECT * FROM film_category").df() 
    inventory = conn.sql("SELECT * FROM inventory").df()
    rental = conn.sql("SELECT * FROM rental").df()
    store = conn.sql("SELECT * FROM store").df()
    city = conn.sql("SELECT * FROM city").df()
    address = conn.sql("SELECT * FROM address").df()
    customer = conn.sql("SELECT * FROM customer").df()
    staff = conn.sql("SELECT * FROM staff").df()
    payment = conn.sql("SELECT * FROM payment").df()   
    
    # test prints to verify data loaded correctly
print(f"1st Actor: {actor.iloc[0]['first_name']} {actor.iloc[0]['last_name']}")
print(f"2nd Actor: {actor.iloc[1]['first_name']} {actor.iloc[1]['last_name']}")
print("DATA LOAD SUCCESSFUL")

Loading tables:
1st Actor: PENELOPE GUINESS
2nd Actor: NICK WAHLBERG
DATA LOAD SUCCESSFUL


# Tasks

In [50]:
# 1a

print("MOVIES LONGER THEN 180 MINUTES/3 HOURS")
print("10 MOVIES listed from my query that is longer 180 minutes/3 hours")
print("List descending by Length:")

movie_180min_plus = duckdb.sql("""
    SELECT title, length
    FROM film
    WHERE length > 180
    ORDER BY length DESC
    LIMIT 20
""").df()
movie_180min_plus.index = movie_180min_plus.index + 1 
display(movie_180min_plus)

MOVIES LONGER THEN 180 MINUTES/3 HOURS
10 MOVIES listed from my query that is longer 180 minutes/3 hours
List descending by Length:


Unnamed: 0,title,length
1,CONTROL ANTHEM,185
2,GANGS PRIDE,185
3,MUSCLE BRIGHT,185
4,SWEET BROTHERHOOD,185
5,HOME PITY,185
6,DARN FORRESTER,185
7,CHICAGO NORTH,185
8,POND SEATTLE,185
9,SOLDIERS EVOLUTION,185
10,WORST BANGER,185


In [None]:
# 1b 
# I did similar to 1a but now searching for the word love in the title instead
print("Here is the movies that have the word love in the title:")
movie_title_love = duckdb.sql("""
    SELECT title, rating, length, description
    FROM film 
    WHERE title LIKE '%LOVE%'
    """).df()
movie_title_love.index = movie_title_love.index + 1
display(movie_title_love)


Here is the movies that have the word love in the title:


Unnamed: 0,title,rating,length,description
1,GRAFFITI LOVE,PG,117,A Unbelieveable Epistle of a Sumo Wrestler And...
2,IDAHO LOVE,PG-13,172,A Fast-Paced Drama of a Student And a Crocodil...
3,IDENTITY LOVER,PG-13,119,A Boring Tale of a Composer And a Mad Cow who ...
4,INDIAN LOVE,NC-17,135,A Insightful Saga of a Mad Scientist And a Mad...
5,LAWRENCE LOVE,NC-17,175,A Fanciful Yarn of a Database Administrator An...
6,LOVE SUICIDES,R,181,A Brilliant Panorama of a Hunter And a Explore...
7,LOVELY JINGLE,PG,65,A Fanciful Yarn of a Crocodile And a Forensic ...
8,LOVER TRUMAN,G,75,A Emotional Yarn of a Robot And a Boy who must...
9,LOVERBOY ATTACKS,PG-13,162,A Boring Story of a Car And a Butler who must ...
10,STRANGELOVE DESIRE,NC-17,103,A Awe-Inspiring Panorama of a Lumberjack And a...


In [96]:
# 1c

print("Calculation of the length: Longest, shortest, median and average movie length:")
# My calculation for the printed statistics above

movie_length_calc = duckdb.sql("""
    SELECT
        MAX(length) as L_movie_stats,
        MIN(length) as S_movie_stats,
        MEDIAN(length) as M_length,
        AVG(length) as Avg_movie_stats
    FROM film
    """).df()
movie_length_calc.index = movie_length_calc.index + 1
display(round(movie_length_calc,))
# i added the round function to remove the decimals from the average value for cleaner look

Calculation of the length: Longest, shortest, median and average movie length:


Unnamed: 0,L_movie_stats,S_movie_stats,M_length,Avg_movie_stats
1,185,46,114.0,115.0


In [None]:
# 1d

print(" This is the TOP 10 most expensive movies to rent per day (rental Rates)")
# I select all the needed columns and divide the rental rate with the rental duration, 

most_expensive_movies = duckdb.sql("""
    SELECT  
    title,
    rental_rate,
    rental_duration,
    (rental_rate / rental_duration) as cost_per_day
FROM film 
ORDER BY cost_per_day DESC
LIMIT 10

                                   
    
     """).df()

most_expensive_movies.index = most_expensive_movies.index + 1
display(most_expensive_movies)

 This is the TOP 10 most expensive movies to rent per day (rental Rates)


Unnamed: 0,title,rental_rate,rental_duration,cost_per_day
1,AMERICAN CIRCUS,4.99,3,1.663333
2,BACKLASH UNDEFEATED,4.99,3,1.663333
3,BILKO ANONYMOUS,4.99,3,1.663333
4,BEAST HUNCHBACK,4.99,3,1.663333
5,CARIBBEAN LIBERTY,4.99,3,1.663333
6,AUTUMN CROW,4.99,3,1.663333
7,CASPER DRAGONFLY,4.99,3,1.663333
8,ACE GOLDFINGER,4.99,3,1.663333
9,BEHAVIOR RUNAWAY,4.99,3,1.663333
10,CASUALTIES ENCINO,4.99,3,1.663333


In [None]:
# 1e
print(" This displays the actors that have played in most movies, showing the top 10")

actors_most_movies = duckdb.sql("""
SELECT 
    a.first_name,
    a.last_name,
    COUNT(fa.film_id) as movie_counter
    FROM actor a
    JOIN film_actor fa ON a.actor_id = fa.actor_id
    GROUP BY a.actor_id, a.first_name, a.last_name
    ORDER BY movie_counter DESC
    LIMIT 10
""").df()

# First of all i pick the first name and last name from the actor table. And i name it (a) for actor)
# And after that i used a join and this is so i can put the specific actor with the right movie. 
# # Then i use group by to count the right amount per actor

actors_most_movies.index = actors_most_movies.index + 1
display(actors_most_movies)