<b>Important:</b> Save prepared/modified datasets locally to <code>../data/prepared/</code>

<b>IMPORTANT++</b> Prevent data leakage by only applying frequency encoding and target encoding on THE TRAINING DATASET!!! Do it later


In [1]:
import sys
sys.path.insert(1, '../../../../utils')
from pandas import read_csv, DataFrame
from dslabs_functions import *


In [2]:
file_tag = "flight"
filename = "../../data/raw/Combined_Flights_2022_cleaned.csv"
data: DataFrame = read_csv(filename, na_values="")

In [3]:
# Create copy of data before encoding
data_enc = data.copy()

# First drop DestStateName and OriginStateName as they are obviously redundant
data_enc = data_enc.drop(columns=["DestStateName", "OriginStateName"])

In [4]:
vars_types: dict[str, list] = get_variable_types(data_enc)
symbolic: list[str] = vars_types["symbolic"]

for var in symbolic:
    unique_values = data_enc[var].nunique()
    print(f"Variable '{var}': {unique_values} unique values.")


Variable 'Airline': 21 unique values.
Variable 'Origin': 375 unique values.
Variable 'Dest': 375 unique values.
Variable 'Marketing_Airline_Network': 10 unique values.
Variable 'Operated_or_Branded_Code_Share_Partners': 14 unique values.
Variable 'IATA_Code_Marketing_Airline': 10 unique values.
Variable 'Operating_Airline': 21 unique values.
Variable 'IATA_Code_Operating_Airline': 21 unique values.
Variable 'Tail_Number': 5896 unique values.
Variable 'OriginCityName': 369 unique values.
Variable 'OriginState': 53 unique values.
Variable 'DestCityName': 369 unique values.
Variable 'DestState': 53 unique values.
Variable 'DepTimeBlk': 19 unique values.
Variable 'ArrTimeBlk': 19 unique values.


### Cyclic encoding of time blocks

In [5]:
# For time blocks, do cyclic encoding
timeblk_order: list[str] = [
    "0001-0559", "0600-0659", "0700-0759", "0800-0859", "0900-0959",
    "1000-1059", "1100-1159", "1200-1259", "1300-1359", "1400-1459",
    "1500-1559", "1600-1659", "1700-1759", "1800-1859", "1900-1959",
    "2000-2059", "2100-2159", "2200-2259", "2300-2359"
]

# Create mapping dictionaries
DepTimeBlk_values: dict[str, int] = {v: i for i, v in enumerate(timeblk_order)}
ArrTimeBlk_values: dict[str, int] = {v: i for i, v in enumerate(timeblk_order)}

# Apply the mappings
encoding: dict[str, dict] = {
    "DepTimeBlk": DepTimeBlk_values,
    "ArrTimeBlk": ArrTimeBlk_values,
}
data_enc = data_enc.replace(encoding)

encode_cyclic_variables(data_enc, ["DepTimeBlk", "ArrTimeBlk"])
data_enc.head()


Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,CRSDepTime,CRSElapsedTime,Distance,Year,Quarter,...,DestStateFips,DestWac,DepTimeBlk,CRSArrTime,ArrTimeBlk,DistanceGroup,DepTimeBlk_sin,DepTimeBlk_cos,ArrTimeBlk_sin,ArrTimeBlk_cos
0,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",GJT,DEN,False,1133,72.0,212.0,2022,2,...,8,82,6,1245,7,1,0.866,-0.5,0.643,-0.766
1,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",HRL,IAH,False,732,77.0,295.0,2022,2,...,48,74,2,849,3,2,0.643,0.766,0.866,0.5
2,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,1529,70.0,251.0,2022,2,...,8,82,10,1639,11,2,-0.342,-0.94,-0.643,-0.766
3,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",IAH,GPT,False,1435,90.0,376.0,2022,2,...,28,53,9,1605,11,2,0.0,-1.0,-0.643,-0.766
4,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,1135,70.0,251.0,2022,2,...,8,82,6,1245,7,2,0.866,-0.5,0.643,-0.766


### One-Hot-encoding
Applied on variables with less than <i>30</i> number of unique values (low cardinality)

In [6]:
# Get symbolic variables with less than 30 unique values
symbolic_low_card: list[str] = [
    var for var in symbolic if data_enc[var].nunique() < 30
]

# Dummify low cardinality symbolic variables
data_enc = dummify(data_enc, symbolic_low_card)
data_enc.head()

# Find remaining symbolic variables
symbolic_remaining: list[str] = [
    var for var in symbolic if var not in symbolic_low_card
]

print("Remaining symbolic variables after dummification:", symbolic_remaining)



Remaining symbolic variables after dummification: ['Origin', 'Dest', 'Tail_Number', 'OriginCityName', 'OriginState', 'DestCityName', 'DestState']


### Frequency encoding

In [7]:
# Encode Dest and Origin using frequency encoding
for var in ["Dest", "Origin", "OriginCityName", "DestCityName"]:
    freq_encoding: dict[str, float] = (
        data_enc[var].value_counts(normalize=True).to_dict()
    )
    data_enc[var] = data_enc[var].map(freq_encoding)

data_enc.head()



Unnamed: 0,FlightDate,Origin,Dest,Cancelled,CRSDepTime,CRSElapsedTime,Distance,Year,Quarter,Month,...,ArrTimeBlk_9,ArrTimeBlk_10,ArrTimeBlk_11,ArrTimeBlk_12,ArrTimeBlk_13,ArrTimeBlk_14,ArrTimeBlk_15,ArrTimeBlk_16,ArrTimeBlk_17,ArrTimeBlk_18
0,2022-04-04,0.000524,0.038765,False,1133,72.0,212.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False
1,2022-04-04,0.000651,0.021331,False,732,77.0,295.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False
2,2022-04-04,0.000523,0.038765,False,1529,70.0,251.0,2022,2,4,...,False,False,True,False,False,False,False,False,False,False
3,2022-04-04,0.021337,0.000558,False,1435,90.0,376.0,2022,2,4,...,False,False,True,False,False,False,False,False,False,False
4,2022-04-04,0.000523,0.038765,False,1135,70.0,251.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False


### Target encoding

In [8]:
# Encode Tail_Number by target mean encoding
target_mean_encoding: dict[str, float] = (
    data_enc.groupby("Tail_Number")["Cancelled"].mean().to_dict()
)

data_enc["Tail_Number"] = data_enc["Tail_Number"].map(target_mean_encoding)
data_enc.head()

Unnamed: 0,FlightDate,Origin,Dest,Cancelled,CRSDepTime,CRSElapsedTime,Distance,Year,Quarter,Month,...,ArrTimeBlk_9,ArrTimeBlk_10,ArrTimeBlk_11,ArrTimeBlk_12,ArrTimeBlk_13,ArrTimeBlk_14,ArrTimeBlk_15,ArrTimeBlk_16,ArrTimeBlk_17,ArrTimeBlk_18
0,2022-04-04,0.000524,0.038765,False,1133,72.0,212.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False
1,2022-04-04,0.000651,0.021331,False,732,77.0,295.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False
2,2022-04-04,0.000523,0.038765,False,1529,70.0,251.0,2022,2,4,...,False,False,True,False,False,False,False,False,False,False
3,2022-04-04,0.021337,0.000558,False,1435,90.0,376.0,2022,2,4,...,False,False,True,False,False,False,False,False,False,False
4,2022-04-04,0.000523,0.038765,False,1135,70.0,251.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False


### Hierarchical encoding of state name

In [9]:
# Print all unique values in OriginState and DestState
print("Unique values in 'OriginState':", data_enc["OriginState"].unique())
print("Unique values in 'DestState':", data_enc["DestState"].unique())

# Encode OriginState and DestState using hierarchical encoding based on state regions
state_region_mapping: dict[str, str] = {
    # Northeast
    "ME": "Northeast", "NH": "Northeast", "VT": "Northeast", "MA": "Northeast",
    "RI": "Northeast", "CT": "Northeast", "NY": "Northeast", "NJ": "Northeast",
    "PA": "Northeast",
    # Midwest
    "OH": "Midwest", "IN": "Midwest", "IL": "Midwest", "MI": "Midwest",
    "WI": "Midwest", "MN": "Midwest", "IA": "Midwest", "MO": "Midwest",
    "ND": "Midwest", "SD": "Midwest", "NE": "Midwest", "KS": "Midwest",
    # South
    "DE": "South", "MD": "South", "VA": "South", "WV": "South",
    "NC": "South",  "SC": "South",  "GA": "South",  "FL": "South",
    "KY": "South",  "TN": "South",  "MS": "South",  "AL": "South",
    "OK": "South",  "TX": "South",  "AR": "South",  "LA": "South",
    # West
    "ID": "West",  "MT": "West",  "WY": "West",  "NV":  "West",
    "UT":  "West",  "CO": "West", "AZ": "West", "NM": "West",
    "CA": "West", "OR": "West", "WA": "West",
    # Non-continental US
   "AK": "Non-continental US", "HI": "Non-continental US",
   "PR": "Non-continental US", "VI": "Non-continental US",
    # Unknown/Other
   "TT": "Unknown/Other"
}

data_enc["OriginState"] = data_enc["OriginState"].map(state_region_mapping)
data_enc["DestState"] = data_enc["DestState"].map(state_region_mapping)

data_enc.head()

Unique values in 'OriginState': ['CO' 'TX' 'TN' 'AL' 'NE' 'VA' 'FL' 'LA' 'NM' 'NY' 'OK' 'WY' 'NJ' 'MO'
 'KY' 'RI' 'NC' 'PA' 'IL' 'OH' 'ME' 'AR' 'WI' 'VT' 'MI' 'SC' 'MN' 'IA'
 'IN' 'KS' 'ND' 'MS' 'AZ' 'ID' 'GA' 'CT' 'WA' 'MA' 'CA' 'MD' 'MT' 'OR'
 'HI' 'NV' 'NH' 'PR' 'UT' 'SD' 'WV' 'VI' 'AK' 'DE' 'TT']
Unique values in 'DestState': ['CO' 'TX' 'MS' 'OK' 'LA' 'AL' 'WY' 'NM' 'ND' 'KS' 'MO' 'NE' 'NY' 'RI'
 'VA' 'IN' 'NC' 'NJ' 'PA' 'IL' 'SC' 'WI' 'VT' 'OH' 'MN' 'TN' 'MI' 'ME'
 'IA' 'KY' 'AR' 'FL' 'AZ' 'ID' 'GA' 'CA' 'NV' 'MD' 'UT' 'CT' 'MA' 'NH'
 'PR' 'MT' 'WA' 'OR' 'HI' 'SD' 'WV' 'VI' 'AK' 'DE' 'TT']


Unnamed: 0,FlightDate,Origin,Dest,Cancelled,CRSDepTime,CRSElapsedTime,Distance,Year,Quarter,Month,...,ArrTimeBlk_9,ArrTimeBlk_10,ArrTimeBlk_11,ArrTimeBlk_12,ArrTimeBlk_13,ArrTimeBlk_14,ArrTimeBlk_15,ArrTimeBlk_16,ArrTimeBlk_17,ArrTimeBlk_18
0,2022-04-04,0.000524,0.038765,False,1133,72.0,212.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False
1,2022-04-04,0.000651,0.021331,False,732,77.0,295.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False
2,2022-04-04,0.000523,0.038765,False,1529,70.0,251.0,2022,2,4,...,False,False,True,False,False,False,False,False,False,False
3,2022-04-04,0.021337,0.000558,False,1435,90.0,376.0,2022,2,4,...,False,False,True,False,False,False,False,False,False,False
4,2022-04-04,0.000523,0.038765,False,1135,70.0,251.0,2022,2,4,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# Dummify OriginState and DestState
data_enc = dummify(data_enc, ["OriginState", "DestState"])
data_enc.head()

Unnamed: 0,FlightDate,Origin,Dest,Cancelled,CRSDepTime,CRSElapsedTime,Distance,Year,Quarter,Month,...,OriginState_Northeast,OriginState_South,OriginState_Unknown/Other,OriginState_West,DestState_Midwest,DestState_Non-continental US,DestState_Northeast,DestState_South,DestState_Unknown/Other,DestState_West
0,2022-04-04,0.000524,0.038765,False,1133,72.0,212.0,2022,2,4,...,False,False,False,True,False,False,False,False,False,True
1,2022-04-04,0.000651,0.021331,False,732,77.0,295.0,2022,2,4,...,False,True,False,False,False,False,False,True,False,False
2,2022-04-04,0.000523,0.038765,False,1529,70.0,251.0,2022,2,4,...,False,False,False,True,False,False,False,False,False,True
3,2022-04-04,0.021337,0.000558,False,1435,90.0,376.0,2022,2,4,...,False,True,False,False,False,False,False,True,False,False
4,2022-04-04,0.000523,0.038765,False,1135,70.0,251.0,2022,2,4,...,False,False,False,True,False,False,False,False,False,True


In [11]:
# Check remaining symbolic variables
vars_types: dict[str, list] = get_variable_types(data_enc)
symbolic: list[str] = vars_types["symbolic"]

for var in symbolic:
    unique_values = data_enc[var].nunique()
    print(f"Variable '{var}': {unique_values} unique values.")

In [12]:
output_path = "../../data/prepared/encoding/flight_enc.csv"
data_enc.to_csv(output_path, index=False)
print(f"Saved Encoding dataset to: {output_path}")

Saved Encoding dataset to: ../../data/prepared/encoding/flight_enc.csv
