# EDA of Agoda's flights and hotel dataset using SQL

## Set Up

In [4]:
# Set up necessary imports
import pandas as pd
from sqlalchemy import create_engine

# Create an in-memory SQLite database engine
engine = create_engine('sqlite:///:memory:')

# Load the CSV data into a DataFrame
flights = pd.read_csv("data/flights.csv")
hotels = pd.read_csv("data/hotels.csv")
users = pd.read_csv("data/users.csv")

# Push the DataFrame to the SQLite database (creating the table)
flights.to_sql("flights", con=engine, if_exists="replace", index=False)
hotels.to_sql("hotels", con=engine, if_exists="replace", index=False)
users.to_sql("users", con=engine, if_exists="replace", index=False)

1340

## 1. Distribution of flight bookings by demographic factors

In [None]:
# Query the flight bookings by gender
query = """
SELECT 
    gender, 
    COUNT(*) AS number_of_bookings, 
    COUNT(*) * 100.0 / (SELECT COUNT(*) FROM flights) AS percentage
FROM flights 
INNER JOIN users 
    ON flights.userCode = users.code
GROUP BY gender
"""

result = pd.read_sql(query, con=engine)
result


Unnamed: 0,gender,number_of_bookings,percentage
0,female,91580,33.682987
1,male,91248,33.560878
2,none,89060,32.756135


In [10]:
# Query the flight bookings by age group
query = """
SELECT 
    floor(age/10)*10 AS bin_floor, 
    COUNT(*) AS number_of_bookings, 
    COUNT(*)*100/(SELECT COUNT(*) FROM flights) AS percentage
FROM flights 
INNER JOIN users 
ON flights.userCode = users.code
GROUP BY floor(age/10)*10
ORDER BY 1
"""

result = pd.read_sql(query, con=engine)
result


Unnamed: 0,bin_floor,number_of_bookings,percentage
0,20,52956,19
1,30,64670,23
2,40,61548,22
3,50,57260,21
4,60,35454,13


In [None]:
# Query the average flight price by age group
query = """
SELECT floor(age/10)*10 AS bin_floor, ROUND(AVG(price),2) AS avg_price
FROM flights 
INNER JOIN users 
ON flights.userCode = users.code
GROUP BY floor(age/10)*10
ORDER BY 1
"""

result = pd.read_sql(query, con=engine)
result


Unnamed: 0,bin_floor,avg_price
0,20,963.53
1,30,962.19
2,40,953.59
3,50,957.44
4,60,945.87


In [None]:
# Query the flight bookings and average price by flight type
query = """
SELECT flightType, COUNT(*) AS number_of_bookings, ROUND(AVG(price),2) AS avg_price
FROM flights
GROUP BY flightType
"""

result = pd.read_sql(query, con=engine)
result


Unnamed: 0,flightType,number_of_bookings,avg_price
0,economic,77466,658.44
1,firstClass,116418,1181.07
2,premium,78004,920.39


In [7]:
with open("query/1a_flightbookingsbygender.sql", "r", encoding="utf-8-sig") as file:
    query = file.read()

result = pd.read_sql(query, con=engine)
result


Unnamed: 0,gender,number_of_bookings,COUNT(*)*100/(SELECT COUNT(*) FROM flights)
0,female,91580,33
1,male,91248,33
2,none,89060,32
