## Creating my train.csv

In this jupyter notebook i will be creating my train.csv by using the techniques used in the previous 2 jupyter notebooks

    -> reading_data_from_docx.ipynb
    
    -> joining_weather_and_flight.ipynb

## Importing Libraries

In [1]:
from docx import Document
import json
import os
import numpy as np
import pandas as pd

## Helper functions from my reading_data_from_docx.ipynb

In [14]:
def read_docx_json(file_path):
    doc = Document(file_path)
    full_text = "\n".join([para.text for para in doc.paragraphs])
    try:
        data = json.loads(full_text)
        return data
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        return None
def process_folder(folder_path):
    all_data = []
    for i,file_name in enumerate(os.listdir(folder_path)):
        if file_name.endswith(".docx"):
            file_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {file_name}")
            data = read_docx_json(file_path)
            if data:
                if isinstance(data, list):
                    all_data.extend(data)
                else:
                    print(f"File {file_name} does not contain a list; skipping.")
    return all_data
def process_parsed_data(parsed_data):
    records = []
    for i in range(len(parsed_data)):
        type_ = parsed_data[i].get('type',np.nan)
        status_ = parsed_data[i].get('status',np.nan)    
        try:
            departure_dict = parsed_data[i]['departure']
            departure_iata_code = departure_dict.get('iataCode', np.nan)
            departure_icao_code = departure_dict.get('icaoCode', np.nan)
            departure_terminal = departure_dict.get('terminal', np.nan)
            departure_gate = departure_dict.get('gate', np.nan)
            departure_scheduled_time = departure_dict.get('scheduledTime', np.nan)
            departure_estimated_time = departure_dict.get('estimatedTime', np.nan)
            departure_actual_time = departure_dict.get('actualTime', np.nan)
            departure_estimated_runway = departure_dict.get('estimatedRunway', np.nan)
            departure_actual_runway = departure_dict.get('actualRunway', np.nan)
        except:
            departure_iata_code = np.nan
            departure_icao_code = np.nan
            departure_terminal = np.nan
            departure_gate = np.nan
            departure_scheduled_time = np.nan
            departure_estimated_time = np.nan
            departure_actual_time = np.nan
            departure_estimated_runway = np.nan
            departure_actual_runway = np.nan
        try:
            arrival_dict = parsed_data[i]['arrival']
            arrival_iata_code = arrival_dict.get('iataCode', np.nan)
            arrival_icao_code = arrival_dict.get('icaoCode', np.nan)
            arrival_terminal = arrival_dict.get('terminal', np.nan)
            arrival_scheduled_time = arrival_dict.get('scheduledTime', np.nan)
            arrival_estimated_time = arrival_dict.get('estimatedTime', np.nan)
            arrival_actual_time = arrival_dict.get('actualTime', np.nan)
            arrival_estimated_runway = arrival_dict.get('estimatedRunway', np.nan)
            arrival_actual_runway = arrival_dict.get('actualRunway', np.nan)
        except:
            arrival_iata_code = np.nan
            arrival_icao_code = np.nan
            arrival_terminal = np.nan
            arrival_scheduled_time = np.nan
            arrival_estimated_time = np.nan
            arrival_actual_time = np.nan
            arrival_estimated_runway = np.nan
            arrival_actual_runway = np.nan
        try:
            airline_dict = parsed_data[i]['airline']
            airline_name = airline_dict.get('name',np.nan)
            airline_iata_code = airline_dict.get('iataCode',np.nan)
            airline_icao_code = airline_dict.get('icaoCode',np.nan)
        except:
            airline_name = np.nan
            airline_iata_code = np.nan
            airline_icao_code = np.nan
        try:
            flight_dict = parsed_data[i]['flight']
            flight_number = flight_dict.get('number',np.nan)
            flight_iataNumber = flight_dict.get('iataNumber',np.nan)
            flight_icaoNumber = flight_dict.get('icaoNumber',np.nan)
        except:
            flight_number = np.nan
            flight_iataNumber = np.nan
            flight_icaoNumber = np.nan
        try:
            code_shared_dict = parsed_data[i]['codeshared']
            code_shared_flag = 1
            code_shared_airline = code_shared_dict.get('airline', {}).get('name', np.nan)
        except:
            code_shared_flag = 0
            code_shared_airline = np.nan
        record = {
            "type": type_,
            "status": status_,
            "departure_iata_code": departure_iata_code,
            "departure_icao_code": departure_icao_code,
            "departure_terminal": departure_terminal,
            "departure_gate": departure_gate,
            "departure_scheduled_time": departure_scheduled_time,
            "departure_estimated_time": departure_estimated_time,
            "departure_actual_time": departure_actual_time,
            "departure_estimated_runway": departure_estimated_runway,
            "departure_actual_runway": departure_actual_runway,
            "arrival_iata_code": arrival_iata_code,
            "arrival_icao_code": arrival_icao_code,
            "arrival_terminal": arrival_terminal,
            "arrival_scheduled_time": arrival_scheduled_time,
            "arrival_estimated_time": arrival_estimated_time,
            "arrival_actual_time": arrival_actual_time,
            "arrival_estimated_runway": arrival_estimated_runway,
            "arrival_actual_runway": arrival_actual_runway,
            "airline_name": airline_name,
            "airline_iata_code": airline_iata_code,
            "airline_icao_code": airline_icao_code,
            "flight_number": flight_number,
            "flight_iataNumber": flight_iataNumber,
            "flight_icaoNumber": flight_icaoNumber,
            "code_shared_flag": code_shared_flag,
            "code_shared_airline": code_shared_airline,
        }
        records.append(record)
    df = pd.DataFrame(records)
    return df

In [15]:
def get_weather_df(folder_name = 'Weather_formatted'):
    weather_df_combined = pd.read_csv('combined_weather.csv')
    return weather_df_combined
def merge_dfs(main_df,weather_df):
    result_df = pd.merge(
        main_df,
        weather_df,
        on=['Month', 'Day'],
        how='left'
    )
    return result_df
def combine_weather_and_flight_data(df_new,weather_folder='Weather_formatted'):
    flight_data = df_new
    needed_features = ['flight_number','type','code_shared_flag','airline_name','status','departure_iata_code','departure_icao_code','departure_scheduled_time',
                    'departure_actual_time','arrival_iata_code','arrival_icao_code','arrival_estimated_time']
    filtered_flight_data = flight_data[needed_features]
    columns_to_convert_to_datetime = ['departure_scheduled_time','departure_actual_time','arrival_estimated_time']
    for col in columns_to_convert_to_datetime:
        filtered_flight_data[col] = pd.to_datetime(filtered_flight_data[col], errors='coerce')
    df = filtered_flight_data.copy()
    df['Month'] = df['departure_scheduled_time'].dt.strftime('%b')
    df['Day'] = df['departure_scheduled_time'].dt.day
    weather_df = get_weather_df(weather_folder)
    result_df = merge_dfs(df,weather_df=weather_df)
    return result_df

## Creation of the dataframe

In [16]:
folder_path = "Test"
parsed_data = process_folder(folder_path)

Processing file: 13.docx
Processing file: 18.docx
Processing file: 23.docx
Processing file: 33.docx
Processing file: 44.docx
Processing file: 42.docx
Processing file: 36.docx
Processing file: 63.docx
Processing file: 40.docx
Processing file: 20.docx
Processing file: 53.docx
Processing file: 10.docx
Processing file: 65.docx
Processing file: 51.docx
Processing file: 4.docx
Processing file: 9.docx
Processing file: 25.docx
Processing file: 17.docx
Processing file: 7.docx
Processing file: 62.docx
Processing file: 12.docx
Processing file: 8.docx
Processing file: 54.docx
Processing file: 55.docx
Processing file: 14.docx
Processing file: 2.docx
Processing file: 32.docx
Processing file: 49.docx
Processing file: 39.docx
Processing file: 67.docx
Processing file: 1.docx
Processing file: 45.docx
Processing file: 30.docx
Processing file: 46.docx
Processing file: 56.docx
Processing file: 26.docx
Processing file: 48.docx
Processing file: 47.docx
Processing file: 52.docx
Processing file: 19.docx
Proces

In [18]:
len(parsed_data)

14910

In [19]:
df = process_parsed_data(parsed_data=parsed_data)

In [20]:
df.shape

(14910, 27)

In [7]:
df = combine_weather_and_flight_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_flight_data[col] = pd.to_datetime(filtered_flight_data[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_flight_data[col] = pd.to_datetime(filtered_flight_data[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_flight_data[col] = pd.

In [9]:
df.drop(columns='departure_actual_time',inplace=True)

In [10]:
df['day_of_week'] = df['departure_scheduled_time'].dt.day_name()
df['hour_of_day'] = df['departure_scheduled_time'].dt.hour

In [11]:
df.drop(columns='arrival_estimated_time',inplace=True)

In [12]:
df.isnull().sum()

flight_number               0
type                        0
code_shared_flag            0
airline_name                0
status                      0
departure_iata_code         0
departure_icao_code         0
departure_scheduled_time    0
arrival_iata_code           0
arrival_icao_code           0
Month                       0
Day                         0
Unnamed: 0                  0
Temperature (°F)_max        0
Temperature (°F)_avg        0
Temperature (°F)_min        0
Dew Point (°F)_max          0
Dew Point (°F)_avg          0
Dew Point (°F)_min          0
Humidity (%)_max            0
Humidity (%)_avg            0
Humidity (%)_min            0
Wind Speed (mph)_max        0
Wind Speed (mph)_avg        0
Wind Speed (mph)_min        0
Pressure (in)_max           0
Pressure (in)_avg           0
Pressure (in)_min           0
day_of_week                 0
hour_of_day                 0
dtype: int64

## Saving dataframe to test.csv

In [13]:
df.to_csv('test.csv')