<a target="_blank" href="https://colab.research.google.com/github/ZHAW-ZAV/TSO-FS25-students/blob/main/01_python_basics/01_11_exercise_airports.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import sys
import os

# Setup cell: downloads the three data files from Google Drive and defines the
# variables 'path_to_airports', 'path_to_countries' and 'path_to_population'.

# True when the 'google.colab' module is already loaded (presumably only the case
# when the notebook runs inside Google Colab).
IN_COLAB = "google.colab" in sys.modules

# Google Drive file IDs of the datasets; they are inserted into the download URLs below.
file_id_airports = "1htJ9M2E6uiJSyQq0vvxIvUqIefwa-L73"
file_id_countries = "1me9ab-d8k1H0RVIq72U85Lr0p4Q1yhWn"
file_id_population = "1eWJVsEqM52xG-pLffdvkEy880oRsaTyz"

if IN_COLAB:
    # In Colab the files are stored under /content/data and fetched with the 'gdown'
    # command-line tool. Lines starting with '!' are IPython shell escapes, and the
    # '{...}' placeholders are filled in from the Python variables of the same name.
    path_to_airports = "/content/data/airports_europe.csv"
    # Create the target directory if it is missing; exist_ok=True avoids an error if it already exists.
    os.makedirs(os.path.dirname(path_to_airports), exist_ok=True)
    !gdown "https://drive.google.com/uc?id={file_id_airports}" -O "{path_to_airports}"

    path_to_countries = "/content/data/iso_country_codes.json"
    os.makedirs(os.path.dirname(path_to_countries), exist_ok=True)
    !gdown "https://drive.google.com/uc?id={file_id_countries}" -O "{path_to_countries}"

    path_to_population = "/content/data/world_population.csv"
    os.makedirs(os.path.dirname(path_to_population), exist_ok=True)
    !gdown "https://drive.google.com/uc?id={file_id_population}" -O "{path_to_population}"
else:
    # Outside Colab the files are stored in a 'data' directory relative to the current
    # working directory and fetched with the 'gdown' Python package.
    import gdown

    url = f"https://drive.google.com/uc?id={file_id_airports}"
    path_to_airports = "data/airports_europe.csv"
    os.makedirs(os.path.dirname(path_to_airports), exist_ok=True)
    gdown.download(url, path_to_airports, quiet=False)

    url = f"https://drive.google.com/uc?id={file_id_countries}"
    path_to_countries = "data/iso_country_codes.json"
    os.makedirs(os.path.dirname(path_to_countries), exist_ok=True)
    gdown.download(url, path_to_countries, quiet=False)

    url = f"https://drive.google.com/uc?id={file_id_population}"
    path_to_population = "data/world_population.csv"
    os.makedirs(os.path.dirname(path_to_population), exist_ok=True)
    gdown.download(url, path_to_population, quiet=False)


The code above downloads the data files used in this exercise. Do not modify it.

---------------

***Notebook starts here***

# Analyzing European Airports

In this exercise we use three publicly available datasets:
- **airports_europe.csv**: A dataset containing information about European airports. The data is provided by [OurAirports](https://ourairports.com/data/) and limited to European airports.
- **iso_country_codes.json**: Mapping of two- and three-letter ISO country codes to country names and other information. The data is provided by [a user on GitHub](https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/tree/master).
- **world_population.csv**: World population by country and year provided by the [World Bank](https://data.worldbank.org/indicator/SP.POP.TOTL).

## Tasks
This exercise is guided and you will be asked to implement the analysis step by step. The tasks are:
1. Load the data
2. Familiarize yourself with the data
3. Analyze the types of airports per region
4. Analyze the airports per resident in each country

## Reading Data

In [None]:
# Import the pandas library
import pandas as pd

In [None]:
# Load the airport CSV file into a pandas dataframe.
# 'path_to_airports' (defined in the setup cell) holds the location of the file.
airports = pd.read_csv(filepath_or_buffer=path_to_airports)

In [None]:
# The country information is stored as a .json file; load it into its own dataframe.
# 'path_to_countries' (defined in the setup cell) holds the location of the file.
countries = pd.read_json(path_or_buf=path_to_countries)

## Familiarize yourself with the data

In [None]:
# How many airports are in the dataframe?
# shape is (number of rows, number of columns), so shape[0] is the row count.
print(f"The dataframe has {airports.shape[0]} rows.")

In [None]:
# Show the first 5 rows of the airports dataframe
airports.head(n=5)

In [None]:
# Show the last 5 rows of the airports dataframe
airports.tail(n=5)

In [None]:
# Which distinct ISO country codes occur in the airports dataframe?
pd.unique(airports["iso_country"])

In [None]:
# How many countries are in the airports dataframe?
# dropna=False keeps a missing country code (if any) in the count, exactly like len(unique()).
print(
    f"We have data for {airports['iso_country'].nunique(dropna=False)} countries in the dataframe."
)

In [None]:
# Which distinct values does the "type" column of the airports dataframe contain?
pd.unique(airports["type"])

In [None]:
# Let's check out something a bit niche:
# - In which countries are there 'balloonport' entries, and how many?
# - In which countries are there 'seaplane_base' entries, and how many?

# Both questions are answered the same way, so loop over (heading, airport type) pairs.
for heading, airport_type in [
    ("Balloonports:", "balloonport"),
    ("Seaplane bases:", "seaplane_base"),
]:
    print(heading)
    of_this_type = airports[airports["type"] == airport_type]
    per_country = of_this_type.groupby("iso_country")["id"].count()
    print(per_country.sort_values(ascending=False))
# Note: count() counts the non-missing (not N/A) values of the selected column. We use the column 'id' here,
# but any other column that has no missing values would give the same result.

# Alternatively, you can use the value_counts() method to get the same result:
# print("Balloonports:")
# print(
#     airports.query("type == 'balloonport'")["iso_country"].value_counts()
# )
# print("Seaplane bases:")
# print(
#     airports.query("type == 'seaplane_base'")["iso_country"].value_counts()
# )

## Analysis of airports per region

In [None]:
# Take a quick look at the countries dataframe:
# show its first 5 rows.
countries.head(n=5)

In [None]:
# Join the airports and countries dataframes so we can do some analysis on the region of the airports.
# Two notes:
# - Make sure that the resulting dataframe has the same number of rows as the airports dataframe. You can
#   specify the join type (with the function argument 'how'); choose the type that keeps all the airports in the
#   resulting dataframe.
# - We only want to add the columns 'sub-region' and 'alpha-3' from the countries dataframe to the airports dataframe.
#   Make sure you select only those columns (plus 'alpha-2', which is needed as the join key).
#   You can drop the column 'alpha-2' from the resulting dataframe with the function 'drop'.
airports_with_country = airports.merge(
    countries[["alpha-2", "sub-region", "alpha-3"]],
    left_on="iso_country",
    right_on="alpha-2",
    how="left",  # a left join keeps every row of 'airports', even without a matching country
)
# 'alpha-2' was only needed as the join key; it duplicates 'iso_country'.
airports_with_country = airports_with_country.drop(columns=["alpha-2"])
# len() of a dataframe is its number of rows, so this compares row counts before and after the join.
print(
    f"The airports dataframe has {len(airports)} rows and the joined dataframe has {len(airports_with_country)} rows."
)
airports_with_country.head()

In [None]:
# Number of airports per region, largest region first.
(
    airports_with_country
    .groupby("sub-region")["id"]
    .count()
    .sort_values(ascending=False)
)

# Alternative
# airports_with_country["sub-region"].value_counts()


In [None]:
# How many airports of each type are there in each region?
#
# Hint: groupby() also accepts a list of columns to group by several columns at once, for example:
# df.groupby(["col1", "col2"]).count()

airports_with_country.groupby(by=["sub-region", "type"])["id"].count()

## Airports per resident in each country

In [None]:
# Now for a different task: we want to analyze the number of airports per resident in each country.
#
# This takes a couple of steps. First, we need to load a new dataset with the population of each country.
#
# Read the world population CSV file into a pandas dataframe.
# 'path_to_population' (defined in the setup cell) holds the location of the file.
#
# Hint: Before you load the file, have a look at it. You can see that the first couple of rows are not part of the data.
# With the argument 'header' you tell read_csv() which row contains the column names; the rows before it are skipped.

pop = pd.read_csv(filepath_or_buffer=path_to_population, header=2)

In [None]:
# Show the first 5 rows of the population dataframe
pop.head(n=5)

In [None]:
# The population dataframe has many columns that we don't need.
# Keep only 'Country Name', 'Country Code', and the column for the year 2023.

pop = pop.loc[:, ["Country Name", "Country Code", "2023"]]
pop.head(n=5)

In [None]:
# Next, we need to aggregate the number of airports per country in a new dataframe.
# Do the following:
# 1. Group the 'airports_with_country' dataframe by 'alpha-3' and count the number of airports.
# 2. Call 'reset_index()' on the result to turn it into a new dataframe.
# 3. Rename the column with the number of airports per country to 'n_airports'. You can do that with the function
#    'rename(columns={"old_name": "new_name"})', where 'old_name' is the current column name and 'new_name' is the new
#    column name.

# All three steps chained into a single expression:
n_airports_per_country = (
    airports_with_country.groupby("alpha-3")["id"]
    .count()  # 1.
    .reset_index()  # 2.
    .rename(columns={"id": "n_airports"})  # 3.
)

# You could also do this one step at a time:
# n_airports_per_country = airports_with_country.groupby("alpha-3")["id"].count()
# n_airports_per_country = n_airports_per_country.reset_index()
# n_airports_per_country = n_airports_per_country.rename(columns={"id": "n_airports"})

In [None]:
# Show the first 5 rows of the new dataframe
n_airports_per_country.head(n=5)

In [None]:
# Now, we need to combine the population dataframe with the number-of-airports dataframe.
# Do the following:
# 1. Merge the population dataframe with the number of airports dataframe.
# 2. Drop the column 'alpha-3' from the resulting dataframe.
# 3. Rename the column '2023' to 'population'.

n_airports_per_country = (
    n_airports_per_country.merge(
        pop,
        how="inner",  # merge()'s default: keep only countries present in both dataframes
        left_on="alpha-3",
        right_on="Country Code",
    )
    .drop(columns=["alpha-3"])
    .rename(columns={"2023": "population"})
)

In [None]:
# Show the first 5 rows of the merged dataframe
n_airports_per_country.head(n=5)

In [None]:
# Add a new column to the dataframe that contains the number of airports per resident.
# Since the number of airports per resident is a very small number, we multiply it by 1,000,000 to get a more
# readable number (i.e. the column holds airports per one million residents).

n_airports_per_country["airports_per_resident"] = (
    n_airports_per_country["n_airports"]
    .div(n_airports_per_country["population"])
    .mul(1_000_000)
)

In [None]:
# Nice, almost done.
# As a last step, sort the dataframe by the number of airports per resident in descending order and display it.
#
# Hint: 'sort_values()' sorts the dataframe by the column you pass to it; the argument
# 'ascending=False' makes it sort in descending order.

n_airports_per_country = n_airports_per_country.sort_values(
    by="airports_per_resident", ascending=False
)
n_airports_per_country