# 2024 week 2: Average Price Analysis

https://preppindata.blogspot.com/2024/01/2024-week-2-average-price-analysis.html

## Solution

In [1]:
import pandas as pd
# Turn on pandas' copy-on-write mode for the whole notebook session
pd.set_option("mode.copy_on_write", True)

In [2]:
# Load the Flow Card "yes" extract; its Date column is written day-first (dd/mm/yyyy)
flow_card_yes = pd.read_csv(
    "data/input_flow_card_yes.csv",
    parse_dates=["Date"],
    date_format="%d/%m/%Y",
)
flow_card_yes.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,2024-07-22,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free
1,2024-04-20,PA002,New York,London,Economy,3490.0,Yes,1,Vegan
2,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian
3,2024-06-05,PA006,Tokyo,London,First Class,618.0,Yes,3,Vegan
4,2024-03-30,PA004,Perth,London,First Class,446.0,Yes,1,Nut Free


In [3]:
# Load the Flow Card "no" extract; its Date column is written day-first (dd/mm/yyyy)
flow_card_no = pd.read_csv(
    "data/input_flow_card_no.csv",
    parse_dates=["Date"],
    date_format="%d/%m/%Y",
)
flow_card_no.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,2024-09-28,PA008,Perth,New York,Economy,1855.0,No,2,Vegetarian
1,2024-10-01,PA008,Perth,New York,Business Class,634.8,No,0,Vegetarian
2,2024-03-04,PA007,New York,Perth,Business Class,458.4,No,3,Nut Free
3,2024-02-25,PA010,Tokyo,New York,Premium Economy,1435.0,No,0,
4,2024-03-29,PA004,Perth,London,Economy,2730.0,No,2,Vegan


In [4]:
# Union the two input files. Each input carries its own 0-based row labels,
# so ignore_index=True renumbers the stacked rows 0..n-1 and keeps the
# combined frame free of duplicate index labels.
df = pd.concat([flow_card_yes, flow_card_no], ignore_index=True)
df

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,2024-07-22,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free
1,2024-04-20,PA002,New York,London,Economy,3490.0,Yes,1,Vegan
2,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian
3,2024-06-05,PA006,Tokyo,London,First Class,618.0,Yes,3,Vegan
4,2024-03-30,PA004,Perth,London,First Class,446.0,Yes,1,Nut Free
...,...,...,...,...,...,...,...,...,...
1890,2024-03-06,PA006,Tokyo,London,Premium Economy,940.0,No,2,Vegetarian
1891,2024-05-05,PA009,New York,Tokyo,Economy,1360.0,No,3,Nut Free
1892,2024-06-14,PA008,Perth,New York,First Class,245.0,No,1,Dairy Free
1893,2024-01-16,PA010,Tokyo,New York,Economy,2410.0,No,2,Egg Free


In [5]:
# Derive the calendar quarter number of each `Date` as a new `Quarter` column
df = df.assign(Quarter=df["Date"].dt.quarter)
df.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type,Quarter
0,2024-07-22,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free,3
1,2024-04-20,PA002,New York,London,Economy,3490.0,Yes,1,Vegan,2
2,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian,1
3,2024-06-05,PA006,Tokyo,London,First Class,618.0,Yes,3,Vegan,2
4,2024-03-30,PA004,Perth,London,First Class,446.0,Yes,1,Nut Free,1


In [6]:
# Group the rows by card status, quarter and ticket class.
# as_index=False keeps the grouping keys as ordinary columns in every aggregate.
group_keys = ["Flow Card?", "Quarter", "Class"]
grouped = df.groupby(by=group_keys, as_index=False)

In [7]:
# Median `Price` within each (Flow Card?, Quarter, Class) group
median_prices_by_group = grouped["Price"].agg("median")
median_prices_by_group

Unnamed: 0,Flow Card?,Quarter,Class,Price
0,No,1,Business Class,574.8
1,No,1,Economy,2340.0
2,No,1,First Class,438.0
3,No,1,Premium Economy,1075.0
4,No,2,Business Class,553.8
5,No,2,Economy,2325.0
6,No,2,First Class,445.0
7,No,2,Premium Economy,1205.0
8,No,3,Business Class,490.8
9,No,3,Economy,2285.0


In [8]:
# Lowest `Price` within each (Flow Card?, Quarter, Class) group
minimum_prices_by_group = grouped["Price"].agg("min")
minimum_prices_by_group

Unnamed: 0,Flow Card?,Quarter,Class,Price
0,No,1,Business Class,241.2
1,No,1,Economy,1030.0
2,No,1,First Class,204.0
3,No,1,Premium Economy,515.0
4,No,2,Business Class,240.0
5,No,2,Economy,1000.0
6,No,2,First Class,202.0
7,No,2,Premium Economy,507.5
8,No,3,Business Class,240.0
9,No,3,Economy,1000.0


In [9]:
# Highest `Price` within each (Flow Card?, Quarter, Class) group
maximum_prices_by_group = grouped["Price"].agg("max")
maximum_prices_by_group

Unnamed: 0,Flow Card?,Quarter,Class,Price
0,No,1,Business Class,834.0
1,No,1,Economy,3455.0
2,No,1,First Class,699.0
3,No,1,Premium Economy,1702.5
4,No,2,Business Class,828.0
5,No,2,Economy,3480.0
6,No,2,First Class,694.0
7,No,2,Premium Economy,1745.0
8,No,3,Business Class,838.8
9,No,3,Economy,3475.0


In [10]:
# Spread the median prices so each `Class` value becomes its own column,
# leaving one row per (Flow Card?, Quarter) pair. reset_index() turns the
# pair back into regular columns; rename_axis() drops the leftover "Class"
# label from the column axis.
median_prices_by_group_pivoted = (
    median_prices_by_group
    .pivot(index=["Flow Card?", "Quarter"], columns="Class", values="Price")
    .reset_index()
    .rename_axis(columns=None)
)
median_prices_by_group_pivoted

Unnamed: 0,Flow Card?,Quarter,Business Class,Economy,First Class,Premium Economy
0,No,1,574.8,2340.0,438.0,1075.0
1,No,2,553.8,2325.0,445.0,1205.0
2,No,3,490.8,2285.0,487.0,1125.0
3,No,4,555.6,2202.5,428.0,1062.5
4,Yes,1,523.2,2325.0,447.5,1160.0
5,Yes,2,517.8,2290.0,459.0,1071.25
6,Yes,3,553.8,2347.5,457.0,1090.0
7,Yes,4,522.6,2212.5,424.0,1108.75


In [11]:
# Spread the minimum prices so each `Class` value becomes its own column,
# leaving one row per (Flow Card?, Quarter) pair. reset_index() turns the
# pair back into regular columns; rename_axis() drops the leftover "Class"
# label from the column axis.
minimum_prices_by_group_pivoted = (
    minimum_prices_by_group
    .pivot(index=["Flow Card?", "Quarter"], columns="Class", values="Price")
    .reset_index()
    .rename_axis(columns=None)
)
minimum_prices_by_group_pivoted

Unnamed: 0,Flow Card?,Quarter,Business Class,Economy,First Class,Premium Economy
0,No,1,241.2,1030.0,204.0,515.0
1,No,2,240.0,1000.0,202.0,507.5
2,No,3,240.0,1000.0,201.0,517.5
3,No,4,240.0,1015.0,200.0,510.0
4,Yes,1,249.6,1020.0,201.0,502.5
5,Yes,2,240.0,1020.0,200.0,500.0
6,Yes,3,241.2,1005.0,206.0,502.5
7,Yes,4,249.6,1030.0,205.0,505.0


In [12]:
# Spread the maximum prices so each `Class` value becomes its own column,
# leaving one row per (Flow Card?, Quarter) pair. reset_index() turns the
# pair back into regular columns; rename_axis() drops the leftover "Class"
# label from the column axis.
maximum_prices_by_group_pivoted = (
    maximum_prices_by_group
    .pivot(index=["Flow Card?", "Quarter"], columns="Class", values="Price")
    .reset_index()
    .rename_axis(columns=None)
)
maximum_prices_by_group_pivoted

Unnamed: 0,Flow Card?,Quarter,Business Class,Economy,First Class,Premium Economy
0,No,1,834.0,3455.0,699.0,1702.5
1,No,2,828.0,3480.0,694.0,1745.0
2,No,3,838.8,3475.0,691.0,1747.5
3,No,4,835.2,3465.0,698.0,1730.0
4,Yes,1,840.0,3500.0,698.0,1737.5
5,Yes,2,840.0,3490.0,696.0,1737.5
6,Yes,3,840.0,3495.0,697.0,1750.0
7,Yes,4,834.0,3460.0,697.0,1722.5


In [13]:
# Stack the three pivoted summaries (median, then minimum, then maximum)
# into a single table with a fresh 0..n-1 row index.
# Note: this rebinds `df` to the stacked summary table.
df = pd.concat(
    [
        median_prices_by_group_pivoted,
        minimum_prices_by_group_pivoted,
        maximum_prices_by_group_pivoted,
    ],
    ignore_index=True,
)
df

Unnamed: 0,Flow Card?,Quarter,Business Class,Economy,First Class,Premium Economy
0,No,1,574.8,2340.0,438.0,1075.0
1,No,2,553.8,2325.0,445.0,1205.0
2,No,3,490.8,2285.0,487.0,1125.0
3,No,4,555.6,2202.5,428.0,1062.5
4,Yes,1,523.2,2325.0,447.5,1160.0
5,Yes,2,517.8,2290.0,459.0,1071.25
6,Yes,3,553.8,2347.5,457.0,1090.0
7,Yes,4,522.6,2212.5,424.0,1108.75
8,No,1,241.2,1030.0,204.0,515.0
9,No,2,240.0,1000.0,202.0,507.5


In [14]:
# Relabel the class columns, then fix the column order and sort the rows.
# NOTE(review): the mapping is kept exactly as found even though it looks
# cross-wired (e.g. "Economy" -> "First") — presumably this is the relabelling
# the linked challenge asks for; confirm against the brief.
class_relabels = {
    "Economy": "First",
    "First Class": "Economy",
    "Business Class": "Premium",
    "Premium Economy": "Business",
}
# Doubles as the output column order and the row sort key (reused by the tests)
sort_columns = ["Flow Card?", "Quarter", "Economy", "Premium", "Business", "First"]

df = (
    df.rename(columns=class_relabels)
    .loc[:, sort_columns]
    .sort_values(by=sort_columns, ignore_index=True)
)
df

Unnamed: 0,Flow Card?,Quarter,Economy,Premium,Business,First
0,No,1,204.0,241.2,515.0,1030.0
1,No,1,438.0,574.8,1075.0,2340.0
2,No,1,699.0,834.0,1702.5,3455.0
3,No,2,202.0,240.0,507.5,1000.0
4,No,2,445.0,553.8,1205.0,2325.0
5,No,2,694.0,828.0,1745.0,3480.0
6,No,3,201.0,240.0,517.5,1000.0
7,No,3,487.0,490.8,1125.0,2285.0
8,No,3,691.0,838.8,1747.5,3475.0
9,No,4,200.0,240.0,510.0,1015.0


## Tests

In [15]:
import pandas.testing as pdt

In [16]:
# Load the reference result and sort its rows by `sort_columns` with a fresh index
expected_output = (
    pd.read_csv("data/output.csv")
    .sort_values(by=sort_columns, ignore_index=True)
)

In [17]:
# Raise if the computed table differs from the reference; column dtypes are not compared
pdt.assert_frame_equal(left=expected_output, right=df, check_dtype=False)