In [1]:
import requests
import csv 

In [2]:
import sys 
# Put the directory two levels up on the module search path.
# NOTE(review): presumably this is where the `helper` package imported below lives - confirm.
sys.path.append('../../')

### Generate Markdown

In [3]:
from helper.generate_markdown import generate_markdown_text

# Build the table-of-contents entry for section 5; the returned string is
# displayed as the cell output so it can be pasted into the markdown cell.
section_title = '5.Calculate and add average number of incidents per miles.'
generate_markdown_text(section_title)

'5. [Calculate and add average number of incidents per miles.](#-5.Calculate-and-add-average-number-of-incidents-per-miles.)'

# Airline Safety Data

1. [Read in the data using requests and save the raw data.](#-1.Read-in-the-data-using-requests-and-save-the-raw-data.)
2. [Separate header from the rest of the data.](#-2.Seperate-header-from-the-rest-of-the-data.)
3. [Explore airline_names column values.](#-3.Explore-airline_names-column-values.)
4. [Using regex clean up the airline_names.](#-4.Using-regex-clean-up-the-airline_names.)
5. [Calculate and add average number of incidents per miles.](#-5.Calculate-and-add-average-number-of-incidents-per-miles.)

## 1.Read in the data using requests and save the raw data.

In [4]:
url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv'

# Send a GET request to the URL and keep the response in 'r'.
# The timeout stops the cell from hanging forever if the server never answers.
r = requests.get(url, timeout=30)

# Fail loudly on an HTTP error status (404, 500, ...) instead of silently
# parsing an error page as if it were CSV data.
r.raise_for_status()

# Feed the response to csv.reader() line by line; decode_unicode=True makes
# iter_lines() yield text (str) lines, which is what csv.reader() expects.
data = list(csv.reader(r.iter_lines(decode_unicode=True)))

# Show the first four rows (the header row plus three data rows).
data[0:4]

[['airline',
  'avail_seat_km_per_week',
  'incidents_85_99',
  'fatal_accidents_85_99',
  'fatalities_85_99',
  'incidents_00_14',
  'fatal_accidents_00_14',
  'fatalities_00_14'],
 ['Aer Lingus', '320906734', '2', '0', '0', '0', '0', '0'],
 ['Aeroflot*', '1197672318', '76', '14', '128', '6', '1', '88'],
 ['Aerolineas Argentinas', '385803648', '6', '0', '0', '1', '0', '0']]

In [5]:
# Save the rows exactly as downloaded (header row included) using a context
# manager so the file is closed even if writing fails.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it every row is followed by a blank line on Windows.
# The explicit encoding keeps the file identical regardless of the OS locale.
with open('../data/raw/airline_safety_raw.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerows(data)

## 2.Seperate header from the rest of the data.

In [6]:
# Separate the header row from the data rows.
# The first row holds the column names; every row after it is a record.
header = data[0]
data = data[1:]

header

['airline',
 'avail_seat_km_per_week',
 'incidents_85_99',
 'fatal_accidents_85_99',
 'fatalities_85_99',
 'incidents_00_14',
 'fatal_accidents_00_14',
 'fatalities_00_14']

## 3.Explore airline_names column values.

In [7]:
# Collect the first field (the airline name) of every data row.
airlines = [record[0] for record in data]

airlines

['Aer Lingus',
 'Aeroflot*',
 'Aerolineas Argentinas',
 'Aeromexico*',
 'Air Canada',
 'Air France',
 'Air India*',
 'Air New Zealand*',
 'Alaska Airlines*',
 'Alitalia',
 'All Nippon Airways',
 'American*',
 'Austrian Airlines',
 'Avianca',
 'British Airways*',
 'Cathay Pacific*',
 'China Airlines',
 'Condor',
 'COPA',
 'Delta / Northwest*',
 'Egyptair',
 'El Al',
 'Ethiopian Airlines',
 'Finnair',
 'Garuda Indonesia',
 'Gulf Air',
 'Hawaiian Airlines',
 'Iberia',
 'Japan Airlines',
 'Kenya Airways',
 'KLM*',
 'Korean Air',
 'LAN Airlines',
 'Lufthansa*',
 'Malaysia Airlines',
 'Pakistan International',
 'Philippine Airlines',
 'Qantas*',
 'Royal Air Maroc',
 'SAS*',
 'Saudi Arabian',
 'Singapore Airlines',
 'South African',
 'Southwest Airlines',
 'Sri Lankan / AirLanka',
 'SWISS*',
 'TACA',
 'TAM',
 'TAP - Air Portugal',
 'Thai Airways',
 'Turkish Airlines',
 'United / Continental*',
 'US Airways / America West*',
 'Vietnam Airlines',
 'Virgin Atlantic',
 'Xiamen Airlines']

In [8]:
import re

# Regex matching a literal asterisk. It must be a raw string: in a normal
# string literal '\*' is an invalid escape sequence, which Python reports with
# a DeprecationWarning (a SyntaxWarning from 3.12 on).
pattern = r'\*'

# Remove every asterisk from each airline name.
airlines = [re.sub(pattern, '', airline) for airline in airlines]
airlines

['Aer Lingus',
 'Aeroflot',
 'Aerolineas Argentinas',
 'Aeromexico',
 'Air Canada',
 'Air France',
 'Air India',
 'Air New Zealand',
 'Alaska Airlines',
 'Alitalia',
 'All Nippon Airways',
 'American',
 'Austrian Airlines',
 'Avianca',
 'British Airways',
 'Cathay Pacific',
 'China Airlines',
 'Condor',
 'COPA',
 'Delta / Northwest',
 'Egyptair',
 'El Al',
 'Ethiopian Airlines',
 'Finnair',
 'Garuda Indonesia',
 'Gulf Air',
 'Hawaiian Airlines',
 'Iberia',
 'Japan Airlines',
 'Kenya Airways',
 'KLM',
 'Korean Air',
 'LAN Airlines',
 'Lufthansa',
 'Malaysia Airlines',
 'Pakistan International',
 'Philippine Airlines',
 'Qantas',
 'Royal Air Maroc',
 'SAS',
 'Saudi Arabian',
 'Singapore Airlines',
 'South African',
 'Southwest Airlines',
 'Sri Lankan / AirLanka',
 'SWISS',
 'TACA',
 'TAM',
 'TAP - Air Portugal',
 'Thai Airways',
 'Turkish Airlines',
 'United / Continental',
 'US Airways / America West',
 'Vietnam Airlines',
 'Virgin Atlantic',
 'Xiamen Airlines']

## 4.Using regex clean up the airline_names.

In [9]:
# Character class matching an asterisk (*), a forward slash (/) or any
# whitespace character (\s). Defined once, ahead of the loop.
pattern = r'[\*\/\s]'


def _replacement(match):
    """Return '_' for a matched forward slash and '' for any other match."""
    if match.group() == '/':
        return '_'
    return ''


# Rewrite the airline name (first field) of every row in place:
# slashes become underscores, asterisks and whitespace are dropped.
for record in data:
    record[0] = re.sub(pattern, _replacement, record[0])

data


[['AerLingus', '320906734', '2', '0', '0', '0', '0', '0'],
 ['Aeroflot', '1197672318', '76', '14', '128', '6', '1', '88'],
 ['AerolineasArgentinas', '385803648', '6', '0', '0', '1', '0', '0'],
 ['Aeromexico', '596871813', '3', '1', '64', '5', '0', '0'],
 ['AirCanada', '1865253802', '2', '0', '0', '2', '0', '0'],
 ['AirFrance', '3004002661', '14', '4', '79', '6', '2', '337'],
 ['AirIndia', '869253552', '2', '1', '329', '4', '1', '158'],
 ['AirNewZealand', '710174817', '3', '0', '0', '5', '1', '7'],
 ['AlaskaAirlines', '965346773', '5', '0', '0', '5', '1', '88'],
 ['Alitalia', '698012498', '7', '2', '50', '4', '0', '0'],
 ['AllNipponAirways', '1841234177', '3', '1', '1', '7', '0', '0'],
 ['American', '5228357340', '21', '5', '101', '17', '3', '416'],
 ['AustrianAirlines', '358239823', '1', '0', '0', '1', '0', '0'],
 ['Avianca', '396922563', '5', '3', '323', '0', '0', '0'],
 ['BritishAirways', '3179760952', '4', '0', '0', '6', '0', '0'],
 ['CathayPacific', '2582459303', '0', '0', '0', '2'

## 5.Calculate and add average number of incidents per miles.

In [10]:
# Add the 'avail_seat_mile_per_year' column name to the header.
# The membership check keeps the cell idempotent: running it a second time
# would otherwise append the same column name again.
if 'avail_seat_mile_per_year' not in header:
    header.append('avail_seat_mile_per_year')

In [11]:
# Display the header list to check the column names after the append above.
header

['airline',
 'avail_seat_km_per_week',
 'incidents_85_99',
 'fatal_accidents_85_99',
 'fatalities_85_99',
 'incidents_00_14',
 'fatal_accidents_00_14',
 'fatalities_00_14',
 'avail_seat_mile_per_year']

In [12]:
# Trial run of the unit conversion on the first row only; the result is
# printed for inspection and nothing is stored in the data.
# km -> miles: 1 kilometer is equal to 0.621371 miles.
# week -> year: there are 52 weeks in a year.
for record in data[:1]:
    km_per_week = record[1]  # current avail_seat_km_per_week value (a string)
    print(int(km_per_week) * 0.621371 * 52)

10368911187.040329


In [13]:
def convert_to_mile(x):
    """Convert a per-week kilometer figure into a per-year mile figure.

    `x` is passed through int() and then multiplied by 0.621371
    (kilometers to miles) and by 52 (weeks in a year). Because of the
    floating-point factor the result is a float.
    """
    return int(x) * 0.621371 * 52


# Apply the conversion to the second field (avail_seat_km_per_week) of every row.
miles_per_year = [convert_to_mile(record[1]) for record in data]
# Inspect the first five converted values.
miles_per_year[0:5]

[10368911187.040329,
 38698339987.21486,
 12465814325.193216,
 19285699436.412395,
 60268760250.53219]

In [14]:
# Pair each data row with its converted value and append that value as a new
# last field of the row (the rows are modified in place).
for record, miles in zip(data, miles_per_year):
    record.append(miles)
data[0]

['AerLingus', '320906734', '2', '0', '0', '0', '0', '0', 10368911187.040329]

In [15]:
# list comprehension for better performance for larger datasets

# data = [row + [miles_per_year[index]] for index, row in enumerate(data)]

In [16]:
# Write the cleaned table with csv.writer.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it every row is followed by a blank line on Windows.
# The explicit encoding keeps the file identical regardless of the OS locale.
with open('../data/cleaned/airline_safety_clean.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)  # header row first
    writer.writerows(data)   # then every data row