In [1]:
import requests
import csv 

In [2]:
import sys 
sys.path.append('../../')

### Generate Markdown

In [3]:
from helper.generate_markdown import generate_markdown_text
generate_markdown_text('7.Add safety ranking column based on total incidents per miles flown.')

'7. [Add safety ranking column based on total incidents per miles flown.](#-7.Add-safety-ranking-column-based-on-total-incidents-per-miles-flown.)'

# Airline Safety Data

1. [Read in the data using requests and save the raw data.](#-1.Read-in-the-data-using-requests-and-save-the-raw-data.)
2. [Seperate header from the rest of the data.](#-2.Seperate-header-from-the-rest-of-the-data.)
3. [Using regex, clean up the airline_names.](#-4.Using-regex,-clean-up-the-airline_names.)
4. [Add avail_seat_mile_per_year column.](#-4.Add-avail_seat_mile_per_year-column.)
5. [Save the cleaned data as csv file.](#-5.Save-the-cleaned-data-as-csv-file.)
6. [Append total_incidents_per_billion_miles column using a helper function.](#-6.Append-total_incidents_per_billion_miles-column-using-a-helper-function.)
7. [Add safety ranking column based on total incidents per miles flown.](#-7.Add-safety-ranking-column-based-on-total-incidents-per-miles-flown.)
8. [Add safety ranking column based on total fatalities per miles flown.](#-8.Add-safety-ranking-column-based-on-total-fatalities-per-miles-flown.)

### 1.Read in the data using requests and save the raw data.

In [4]:
url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv'

# Send a GET request to the URL using the requests library, and store the response in a variable 'r'.
r = requests.get(url)

# Use csv.reader() to iterate over each line in the response using iter_lines() method 
# with decode_unicode=True to ensure the returned lines are in Unicode format.
data = [row for row in csv.reader(r.iter_lines(decode_unicode=True))]

# Print the first four rows 
data[0:4]

[['airline',
  'avail_seat_km_per_week',
  'incidents_85_99',
  'fatal_accidents_85_99',
  'fatalities_85_99',
  'incidents_00_14',
  'fatal_accidents_00_14',
  'fatalities_00_14'],
 ['Aer Lingus', '320906734', '2', '0', '0', '0', '0', '0'],
 ['Aeroflot*', '1197672318', '76', '14', '128', '6', '1', '88'],
 ['Aerolineas Argentinas', '385803648', '6', '0', '0', '1', '0', '0']]

In [5]:
# Write the data using context manager
with open('../data/raw/airline_safety_raw.csv', 'w') as csvfile:
    csv_writer = csv.writer(csvfile)
    
    for row in data:
        csv_writer.writerow(row)

[BACK TO TOP][def]

[def]: #-Airline-Safety-Data

### 2.Seperate header from the rest of the data.

In [6]:
# Seperate the header from the rest of data
header, data = data[0], data[1:]

header

['airline',
 'avail_seat_km_per_week',
 'incidents_85_99',
 'fatal_accidents_85_99',
 'fatalities_85_99',
 'incidents_00_14',
 'fatal_accidents_00_14',
 'fatalities_00_14']

### 3.Using regex, clean up the airline_names.

In [7]:
# Create a list of airline names
airlines = []
for row in data:
    airline = row[0]
    airlines.append(airline)  

airlines 

['Aer Lingus',
 'Aeroflot*',
 'Aerolineas Argentinas',
 'Aeromexico*',
 'Air Canada',
 'Air France',
 'Air India*',
 'Air New Zealand*',
 'Alaska Airlines*',
 'Alitalia',
 'All Nippon Airways',
 'American*',
 'Austrian Airlines',
 'Avianca',
 'British Airways*',
 'Cathay Pacific*',
 'China Airlines',
 'Condor',
 'COPA',
 'Delta / Northwest*',
 'Egyptair',
 'El Al',
 'Ethiopian Airlines',
 'Finnair',
 'Garuda Indonesia',
 'Gulf Air',
 'Hawaiian Airlines',
 'Iberia',
 'Japan Airlines',
 'Kenya Airways',
 'KLM*',
 'Korean Air',
 'LAN Airlines',
 'Lufthansa*',
 'Malaysia Airlines',
 'Pakistan International',
 'Philippine Airlines',
 'Qantas*',
 'Royal Air Maroc',
 'SAS*',
 'Saudi Arabian',
 'Singapore Airlines',
 'South African',
 'Southwest Airlines',
 'Sri Lankan / AirLanka',
 'SWISS*',
 'TACA',
 'TAM',
 'TAP - Air Portugal',
 'Thai Airways',
 'Turkish Airlines',
 'United / Continental*',
 'US Airways / America West*',
 'Vietnam Airlines',
 'Virgin Atlantic',
 'Xiamen Airlines']

In [8]:
import re 

# Remove * from airline names
pattern = '\*'
airlines = [re.sub(pattern, '', airline) for airline in airlines]
airlines

['Aer Lingus',
 'Aeroflot',
 'Aerolineas Argentinas',
 'Aeromexico',
 'Air Canada',
 'Air France',
 'Air India',
 'Air New Zealand',
 'Alaska Airlines',
 'Alitalia',
 'All Nippon Airways',
 'American',
 'Austrian Airlines',
 'Avianca',
 'British Airways',
 'Cathay Pacific',
 'China Airlines',
 'Condor',
 'COPA',
 'Delta / Northwest',
 'Egyptair',
 'El Al',
 'Ethiopian Airlines',
 'Finnair',
 'Garuda Indonesia',
 'Gulf Air',
 'Hawaiian Airlines',
 'Iberia',
 'Japan Airlines',
 'Kenya Airways',
 'KLM',
 'Korean Air',
 'LAN Airlines',
 'Lufthansa',
 'Malaysia Airlines',
 'Pakistan International',
 'Philippine Airlines',
 'Qantas',
 'Royal Air Maroc',
 'SAS',
 'Saudi Arabian',
 'Singapore Airlines',
 'South African',
 'Southwest Airlines',
 'Sri Lankan / AirLanka',
 'SWISS',
 'TACA',
 'TAM',
 'TAP - Air Portugal',
 'Thai Airways',
 'Turkish Airlines',
 'United / Continental',
 'US Airways / America West',
 'Vietnam Airlines',
 'Virgin Atlantic',
 'Xiamen Airlines']

In [9]:
for row in data:
    # Define a regular expression pattern that matches either an asterisk (*) 
    # or a forward slash (/) or a whitespace (\s)
    pattern = r'[\*\/\s]'

    # Use the re.sub() function to replace all occurrences of the pattern 
    # in the string. Using a lambda function to determine 
    # the replacement string based on the matched character
    row[0] = re.sub(pattern, lambda match: '_' if match.group() == '/' else '', row[0])

# Inspect the first 5 columns
data[:5]


[['AerLingus', '320906734', '2', '0', '0', '0', '0', '0'],
 ['Aeroflot', '1197672318', '76', '14', '128', '6', '1', '88'],
 ['AerolineasArgentinas', '385803648', '6', '0', '0', '1', '0', '0'],
 ['Aeromexico', '596871813', '3', '1', '64', '5', '0', '0'],
 ['AirCanada', '1865253802', '2', '0', '0', '2', '0', '0']]

[BACK TO TOP][def]

[def]: #-Airline-Safety-Data

### 4.Add avail_seat_mile_per_year column.

In [10]:
# Add 'avail_seat_mile_per_year' column
header.append('avail_seat_mile_per_year')

In [11]:
header

['airline',
 'avail_seat_km_per_week',
 'incidents_85_99',
 'fatal_accidents_85_99',
 'fatalities_85_99',
 'incidents_00_14',
 'fatal_accidents_00_14',
 'fatalities_00_14',
 'avail_seat_mile_per_year']

In [12]:
# Convert km_to_mil: 1 kilometer is equal to 0.621371 miles.
# Convert week_to_year: There are 52 weeks in a year.
for row in data:
    col = row[1] # curent avail_seat_km_per_week value
    new_col= int(col) * 0.621371 * 52
    print(new_col)
    break

10368911187.040329


In [13]:
# Define a lambda function that returns the convertion result as an integer.
convert_to_mile = lambda x: int(x) * 0.621371 * 52
# Apply the function to the avail_seat_km_per_week col
miles_per_year = [convert_to_mile(col[1]) for col in data]
# Inspect the first five rows
miles_per_year[0:5]

[10368911187.040329,
 38698339987.21486,
 12465814325.193216,
 19285699436.412395,
 60268760250.53219]

In [14]:
# List comprehension for better performance for larger datasets
[row + [miles_per_year[index]] for index, row in enumerate(data)][0]

['AerLingus', '320906734', '2', '0', '0', '0', '0', '0', 10368911187.040329]

In [15]:
# Update the data list using for loop
for i,row in enumerate(miles_per_year):
    data[i].append(row)
data[0]

['AerLingus', '320906734', '2', '0', '0', '0', '0', '0', 10368911187.040329]

[BACK TO TOP][def]

[def]: #-Airline-Safety-Data

### 5.Save the cleaned data as csv file.

In [16]:
with open('../data/cleaned/airline_safety_clean.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header) # write header row first
    
    for row in data:
        writer.writerow(row) # iterate thru each row

[BACK TO TOP][def]

[def]: #-Airline-Safety-Data

### 6.Append total_incidents_per_billion_miles column using a helper function.

In [17]:
calculate_incidents_rate = lambda x: (int(x[2]) + int(x[5])) / int(x[-1]) * 1_000_000_000 

incidents_list =[ calculate_incidents_rate(row) for row in data]
# Inspect the first five rows
incidents_list[0:5]


[0.19288428301975385,
 2.1189539403381747,
 0.5615357182050761,
 0.41481513421632277,
 0.06636937583264789]

In [18]:
from airline_safety import add_column
add_column(col_name='total_incidents_per_billion_miles', col_values=incidents_list)


Added new column 'total_incidents_per_billion_miles' at position 9 to airline_safety_clean.csv file.


[BACK TO TOP][def]

[def]: #-Airline-Safety-Data

### 7.Add safety ranking column based on total incidents per miles flown.

In [19]:
# Read the data
with open('../../generate_data/data/cleaned/airline_safety_clean.csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    reader_data = list(reader)

In [20]:
for row in reader_data:
    print(row)
    break

['airline', 'avail_seat_km_per_week', 'incidents_85_99', 'fatal_accidents_85_99', 'fatalities_85_99', 'incidents_00_14', 'fatal_accidents_00_14', 'fatalities_00_14', 'avail_seat_mile_per_year', 'total_incidents_per_billion_miles']


In [21]:
# Extract and convert the string data from the last column to a list of floats
total_incidents_data = [float(row[-1]) for row in reader_data[1:]]

In [22]:
total_incidents_data[0:5]

[0.19288428301975385,
 2.1189539403381747,
 0.5615357182050761,
 0.41481513421632277,
 0.06636937583264789]

In [23]:
# Get the index value of airline with 0.06636937583264789 total incidents
total_incidents_data.index(0.06636937583264789)

4

In [24]:
# Add 1 to the index value since ranking index starts at 1 not 0
rank_list = [total_incidents_data.index(row)+1 
             for row in sorted(total_incidents_data)]

# Insect the first 20 values
rank_list[0:20]

[49, 16, 55, 42, 29, 24, 27, 5, 34, 44, 33, 38, 15, 31, 43, 52, 18, 11, 19, 13]

In [25]:
add_column(col_name='safety_ranking_per_incident', col_values=rank_list, position=1)

Added new column 'safety_ranking_per_incident' at position 1 to airline_safety_clean.csv file.


[BACK TO TOP][def]

[def]: #-Airline-Safety-Data

### 8.Add safety ranking column based on total fatalities per miles flown.

In [26]:
# The fatality rate is calculated by adding the total number of fatalities in columns 4 and 7, 
# dividing by the total_incidents_per_billion_miles column and multiplying by 1 billion.
calculate_fatality_rate = lambda x: (int(x[4]) + int(x[7])) / float(x[-2]) * 1_000_000_000

In [27]:
total_fatalities =[ calculate_fatality_rate(row) for row in reader_data[1:]]
total_fatalities[0:10]

[0.0,
 5.581634769640299,
 0.0,
 3.3185210736596207,
 0.0,
 4.285867036767085,
 17.33916435168837,
 0.30505521088121107,
 2.8212722614618877,
 2.2169325578307957]

In [28]:
fatality_dict = {}

# Loop through each row of the reader_data starting from the second row
for i,row in enumerate(reader_data[1:]):
    name = row[0]   # Get the name from the first column of the row
    total_fatality = total_fatalities[i] # Get the corresponding total fatality from total_fatalities list
    index_rank = i+1  # Get the index rank by adding 1 to the current index
    fatality_dict[name] = (total_fatality, index_rank)
   
# Sort the fatality_dict by the total fatality value (first item in the tuple)
sorted_fatalities = sorted(fatality_dict.items(), key= lambda x: float(x[1][0]))
# Check the first 20 values
sorted_fatalities[0:20]


[('AerLingus', (0.0, 1)),
 ('AerolineasArgentinas', (0.0, 3)),
 ('AirCanada', (0.0, 5)),
 ('AustrianAirlines', (0.0, 13)),
 ('BritishAirways', (0.0, 15)),
 ('CathayPacific', (0.0, 16)),
 ('Finnair', (0.0, 24)),
 ('HawaiianAirlines', (0.0, 27)),
 ('Qantas', (0.0, 38)),
 ('SouthwestAirlines', (0.0, 44)),
 ('TAP-AirPortugal', (0.0, 49)),
 ('VirginAtlantic', (0.0, 55)),
 ('AllNipponAirways', (0.016808797619760923, 11)),
 ('Lufthansa', (0.01806430244692273, 34)),
 ('KLM', (0.04952986841652627, 31)),
 ('AirNewZealand', (0.30505521088121107, 8)),
 ('ElAl', (0.3690459389206782, 22)),
 ('LANAirlines', (0.6486524057607679, 33)),
 ('TACA', (0.7159316821655235, 47)),
 ('SingaporeAirlines', (1.1588640263688075, 42))]

In [29]:
# loop through each tuple in sorted_fatalities and 
# get the second item in the tuple (the index rank)
fatality_ranking_list = [index_rank for _,(_,index_rank) in sorted_fatalities]

# check the first 10 values
fatality_ranking_list[0:10]


[1, 3, 5, 13, 15, 16, 24, 27, 38, 44]

In [30]:
add_column(col_name='safety_ranking_per_fatality', 
           col_values=fatality_ranking_list, 
           position=2)

Added new column 'safety_ranking_per_fatality' at position 2 to airline_safety_clean.csv file.


[BACK TO TOP][def]

[def]: #-Airline-Safety-Data

# THE END