# Flights Raw Data Pre-processing

**This note book pre-process the flights data from the landing folder and save it to the raw folder**

**Read in the flight data**

* Convert the flight data to scv format

In [1]:
import pandas as pd

# Load the .asc file as a DataFrame
flight_01 = pd.read_csv("../data/landing/flight_data_2020-01", delimiter='|')
flight_02 = pd.read_csv("../data/landing/flight_data_2020-02", delimiter='|')
flight_03 = pd.read_csv("../data/landing/flight_data_2020-03", delimiter='|')

# Save as csv to easier pre-process
flight_01.to_csv("../data/landing/flight_data_2020-01.csv")
flight_02.to_csv("../data/landing/flight_data_2020-02.csv")
flight_03.to_csv("../data/landing/flight_data_2020-03.csv")

  flight_01 = pd.read_csv("../data/landing/flight_data_2020-01", delimiter='|')
  flight_02 = pd.read_csv("../data/landing/flight_data_2020-02", delimiter='|')
  flight_03 = pd.read_csv("../data/landing/flight_data_2020-03", delimiter='|')


**Read in the flight data with csv format and merge them**

In [2]:
df_flight_01 = pd.read_csv("../data/landing/flight_data_2020-01.csv", header=None)
df_flight_02 = pd.read_csv("../data/landing/flight_data_2020-02.csv", header=None)
df_flight_03 = pd.read_csv("../data/landing/flight_data_2020-03.csv", header=None)

merge_df = pd.concat([df_flight_01, df_flight_02, df_flight_03], ignore_index=True)
merge_df = merge_df.iloc[1:]
merge_df.head()

  df_flight_01 = pd.read_csv("../data/landing/flight_data_2020-01.csv", header=None)
  df_flight_02 = pd.read_csv("../data/landing/flight_data_2020-02.csv", header=None)
  df_flight_03 = pd.read_csv("../data/landing/flight_data_2020-03.csv", header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
1,0.0,DL,5115,,,9E,5115.0,IAD,JFK,20200121,...,0.0,,,0.0,0.0,0.0,0.0,,FORM-1,N
2,1.0,DL,5115,,,9E,5115.0,IAD,JFK,20200122,...,0.0,,,0.0,0.0,0.0,0.0,,FORM-1,N
3,2.0,DL,5115,,,9E,5115.0,IAD,JFK,20200123,...,0.0,,,0.0,0.0,0.0,0.0,,FORM-1,N
4,3.0,DL,5115,,,9E,5115.0,IAD,JFK,20200124,...,0.0,,,0.0,0.0,0.0,0.0,,FORM-1,N
5,4.0,DL,5115,,,9E,5115.0,IAD,JFK,20200125,...,0.0,,,0.0,0.0,0.0,0.0,,FORM-1,N


**Drop Uneccessary Columns**

* Drop columns with object data type
* Drop columns with NaN


In [3]:
columns_to_keep = [0, 7, 8, 11, 13, 14, 16, 19]

merge_df = merge_df.iloc[:, columns_to_keep]
merge_df.head()

Unnamed: 0,0,7,8,11,13,14,16,19
1,0.0,IAD,JFK,600,554,729,710,89
2,1.0,IAD,JFK,600,552,729,707,89
3,2.0,IAD,JFK,600,600,729,736,89
4,3.0,IAD,JFK,600,554,729,711,89
5,4.0,IAD,JFK,600,557,724,716,84


**Rename the header**

* column 1 -> flight id
* column 2 -> departure_airport
* column 3 -> arrival_airport
* column 4 -> scheduled_departure_time
* column 5 -> actual_departure_time
* column 6 -> scheduled_arrival_time
* column 7 -> actual_arrival_time
* column 8 -> elapsed_time_flight_minutes

In [4]:
# Rename columns
merge_df.columns = ['flight_id','departure_airport','arrival_airport','scheduled_departure_time','actual_departure_time',
                    'scheduled_arrival_time','actual_arrival_time','elapsed_time_flight_minutes']

merge_df.head()


Unnamed: 0,flight_id,departure_airport,arrival_airport,scheduled_departure_time,actual_departure_time,scheduled_arrival_time,actual_arrival_time,elapsed_time_flight_minutes
1,0.0,IAD,JFK,600,554,729,710,89
2,1.0,IAD,JFK,600,552,729,707,89
3,2.0,IAD,JFK,600,600,729,736,89
4,3.0,IAD,JFK,600,554,729,711,89
5,4.0,IAD,JFK,600,557,724,716,84


**Drop NaN row**

In [5]:
merge_df = merge_df.dropna()

nan_counts = merge_df.isna().sum()
nan_counts

flight_id                      0
departure_airport              0
arrival_airport                0
scheduled_departure_time       0
actual_departure_time          0
scheduled_arrival_time         0
actual_arrival_time            0
elapsed_time_flight_minutes    0
dtype: int64

**Cast consistent schema**

In [6]:
merge_df.dtypes

flight_id                      float64
departure_airport               object
arrival_airport                 object
scheduled_departure_time         int64
actual_departure_time            int64
scheduled_arrival_time           int64
actual_arrival_time              int64
elapsed_time_flight_minutes      int64
dtype: object

In [7]:
# Convert flight_id to int
merge_df['flight_id'] = merge_df['flight_id'].astype(int)

# Convert departure_airport and arrival_airport to string
merge_df['departure_airport'] = merge_df['departure_airport'].astype(str)
merge_df['arrival_airport'] = merge_df['arrival_airport'].astype(str)

# Convert to time format
# Drop rows where the time has a value of length 1
merge_df = merge_df[merge_df['scheduled_departure_time'].astype(str).str.len() != 1]
merge_df = merge_df[merge_df['actual_departure_time'].astype(str).str.len() != 1]
merge_df = merge_df[merge_df['scheduled_arrival_time'].astype(str).str.len() != 1]
merge_df = merge_df[merge_df['actual_arrival_time'].astype(str).str.len() != 1]

# Convert to string and ensure they have a length of 4
time_columns = ['scheduled_departure_time', 'actual_departure_time', 'scheduled_arrival_time', 'actual_arrival_time']

for col in time_columns:
    merge_df[col] = merge_df[col].astype(str).str.zfill(4)

# Handle the "2400" case by replacing it with "0000"
merge_df[time_columns] = merge_df[time_columns].replace('2400', '0000')

# Convert to 'HH:MM' format and then to datetime.time
for col in time_columns:
    formatted_time = merge_df[col].str[:2] + ':' + merge_df[col].str[2:]
    merge_df[col] = pd.to_datetime(formatted_time, format='%H:%M').dt.time


# Convert elapsed_time to integer
merge_df['elapsed_time_flight_minutes'] = merge_df['elapsed_time_flight_minutes'].astype(int)

merge_df.dtypes

flight_id                       int64
departure_airport              object
arrival_airport                object
scheduled_departure_time       object
actual_departure_time          object
scheduled_arrival_time         object
actual_arrival_time            object
elapsed_time_flight_minutes     int64
dtype: object

**Save the data**

In [8]:
merge_df.to_csv('../data/raw/flights_raw_data.csv', index = False)