# **Reporting Data**

In [1]:
import numpy as np
import pandas as pd

### **Create Fake Information**

In [16]:
# Build a small synthetic flights table used by every example below.
# A seeded generator makes "Restart Kernel -> Run All" regenerate the same rows.
SEED = 42
rng = np.random.default_rng(SEED)

# Values the synthetic DataFrame is sampled from
airlines = ['LATAM', 'Avianca', 'Gol', 'Aerolineas Argentinas', 'United', 'American']
# South American airport codes plus a few connecting hubs
airports = ['GRU', 'EZE', 'BOG', 'LIM', 'SCL', 'GIG', 'JFK', 'MIA']
statuses = ['On Time', 'Delayed', 'Cancelled']
num_observations = 35

# Guarantee origin != destination without a retry loop: draw the origin index
# uniformly, then shift it by a non-zero offset modulo the number of airports.
# Every airport other than the origin stays equally likely as the destination.
airport_codes = np.array(airports)
origin_idx = rng.integers(0, len(airports), size=num_observations)
hop = rng.integers(1, len(airports), size=num_observations)
destination_idx = (origin_idx + hop) % len(airports)

data = {
    'flight_id': [f'FL{i:03}' for i in range(1, num_observations + 1)],
    'airline': rng.choice(airlines, num_observations),
    'origin_airport': airport_codes[origin_idx],
    'destination_airport': airport_codes[destination_idx],
    # Upper bounds are exclusive, matching the ranges of the previous randint calls
    'passengers': rng.integers(100, 250, size=num_observations),
    'status': rng.choice(statuses, num_observations, p=[0.7, 0.2, 0.1]),
    'price': rng.integers(200, 1500, size=num_observations),
    # date_range already returns a DatetimeIndex, so no extra to_datetime is needed
    'flight_date': pd.date_range(start='2025-01-01', periods=num_observations, freq='D'),
}

df = pd.DataFrame(data)

# Sanity check on the invariant enforced above
assert (df['origin_airport'] != df['destination_airport']).all()

df.head()

Unnamed: 0,flight_id,airline,origin_airport,destination_airport,passengers,status,price,flight_date
0,FL001,LATAM,GIG,EZE,214,On Time,597,2025-01-01
1,FL002,United,GIG,GRU,175,On Time,543,2025-01-02
2,FL003,United,GRU,EZE,160,On Time,701,2025-01-03
3,FL004,United,LIM,BOG,230,Delayed,1305,2025-01-04
4,FL005,Avianca,EZE,MIA,115,On Time,1386,2025-01-05


## **Group By**

**See the average price by airline**

In [31]:
# Mean ticket price per airline: group the price column by the airline column.
avg_price_airline = df["price"].groupby(df["airline"]).mean()
avg_price_airline

airline
Aerolineas Argentinas     696.333333
American                 1076.000000
Avianca                   886.000000
Gol                      1007.333333
LATAM                     648.666667
United                    815.571429
Name: price, dtype: float64

**See the total number of passengers by airline and flight status**

In [34]:
# Total passengers for every (airline, status) combination present in the data.
num_passengers = (
    df.groupby(by=["airline", "status"])["passengers"]
    .agg("sum")
)
num_passengers

airline                status   
Aerolineas Argentinas  Delayed       129
                       On Time       282
American               Delayed       155
                       On Time      1027
Avianca                Delayed       204
                       On Time       856
Gol                    Cancelled     138
                       Delayed       454
                       On Time       267
LATAM                  Cancelled     310
                       Delayed       189
                       On Time       496
United                 Delayed       409
                       On Time       956
Name: passengers, dtype: int32

**Report the average number of passengers and the average price by airline**

In [36]:
# Per-airline means of passenger count and ticket price.
# Only the needed columns are selected before grouping.
summary = (
    df[["airline", "passengers", "price"]]
    .groupby("airline")
    .mean()
)
summary

Unnamed: 0_level_0,passengers,price
airline,Unnamed: 1_level_1,Unnamed: 2_level_1
Aerolineas Argentinas,137.0,696.333333
American,168.857143,1076.0
Avianca,176.666667,886.0
Gol,143.166667,1007.333333
LATAM,165.833333,648.666667
United,195.0,815.571429


In [40]:
# Per-origin-airport report with a different aggregation for each column:
# mean of price, sum of passengers.
summary1 = (
    df.groupby("origin_airport")[["price", "passengers"]]
    .agg({"price": "mean", "passengers": "sum"})
)

summary1

Unnamed: 0_level_0,price,passengers
origin_airport,Unnamed: 1_level_1,Unnamed: 2_level_1
BOG,947.833333,1119
EZE,888.75,613
GIG,772.125,1350
GRU,873.0,677
JFK,614.0,308
LIM,1105.333333,601
MIA,808.75,669
SCL,1006.333333,535


In [41]:
# Named aggregation: each keyword becomes an output column, built from the
# given source column and aggregation function.
summary1 = df.groupby("origin_airport").agg(
    avg_price=pd.NamedAgg(column="price", aggfunc="mean"),
    total_passengers=pd.NamedAgg(column="passengers", aggfunc="sum"),
)

summary1

Unnamed: 0_level_0,avg_price,total_passengers
origin_airport,Unnamed: 1_level_1,Unnamed: 2_level_1
BOG,947.833333,1119
EZE,888.75,613
GIG,772.125,1350
GRU,873.0,677
JFK,614.0,308
LIM,1105.333333,601
MIA,808.75,669
SCL,1006.333333,535


**How many flights does each airline have per origin airport?**

In [43]:
# Count the rows in each (airline, origin_airport) group.
# The resulting single column keeps the name "origin_airport" and holds the counts.
summary2 = (
    df.groupby(["airline", "origin_airport"])["origin_airport"]
    .count()
    .to_frame()
)

summary2

Unnamed: 0_level_0,Unnamed: 1_level_0,origin_airport
airline,origin_airport,Unnamed: 2_level_1
Aerolineas Argentinas,GIG,1
Aerolineas Argentinas,GRU,2
American,BOG,1
American,EZE,1
American,GIG,1
American,JFK,1
American,MIA,1
American,SCL,2
Avianca,BOG,1
Avianca,EZE,1


## **Pivot Tables**

In [45]:
# Pivot-table version of the per-airline mean price (one row per airline).
avg_price_by_airline = pd.pivot_table(
    df,
    values="price",
    index="airline",
    aggfunc="mean",
)

avg_price_by_airline

Unnamed: 0_level_0,price
airline,Unnamed: 1_level_1
Aerolineas Argentinas,696.333333
American,1076.0
Avianca,886.0
Gol,1007.333333
LATAM,648.666667
United,815.571429


In [47]:
# Mean price for each (airline, origin_airport) pair; the two keys form a
# two-level row index.
avg_price_by_route = pd.pivot_table(
    df,
    values="price",
    index=["airline", "origin_airport"],
    aggfunc="mean",
)

avg_price_by_route

Unnamed: 0_level_0,Unnamed: 1_level_0,price
airline,origin_airport,Unnamed: 2_level_1
Aerolineas Argentinas,GIG,336.0
Aerolineas Argentinas,GRU,876.5
American,BOG,1059.0
American,EZE,1232.0
American,GIG,1290.0
American,JFK,966.0
American,MIA,791.0
American,SCL,1097.0
Avianca,BOG,1325.0
Avianca,EZE,1386.0


In [49]:
# Apply two aggregations (mean and sum) to the passengers column per airline.
passenger_summary = pd.pivot_table(
    df,
    values="passengers",
    index="airline",
    aggfunc=["mean", "sum"],
)

passenger_summary

Unnamed: 0_level_0,mean,sum
Unnamed: 0_level_1,passengers,passengers
airline,Unnamed: 1_level_2,Unnamed: 2_level_2
Aerolineas Argentinas,137.0,411
American,168.857143,1182
Avianca,176.666667,1060
Gol,143.166667,859
LATAM,165.833333,995
United,195.0,1365


In [53]:
# A different aggregation per column: total passengers and mean price by airline.
passenger_summary = pd.pivot_table(
    df,
    values=["passengers", "price"],
    index="airline",
    aggfunc={"passengers": "sum", "price": "mean"},
)
passenger_summary

Unnamed: 0_level_0,passengers,price
airline,Unnamed: 1_level_1,Unnamed: 2_level_1
Aerolineas Argentinas,411,696.333333
American,1182,1076.0
Avianca,1060,886.0
Gol,859,1007.333333
LATAM,995,648.666667
United,1365,815.571429


---