## Preparation

Import the modules and remove the previous database file (if any).

In [1]:
import os
import sqlite3
import pandas as pd

# Location of the SQLite database file that this notebook rebuilds
db_file = 'C:/Users/piorizzielloa/Documents/Data Science/airline2.db'

# Start from a clean slate: report when there is nothing to delete,
# otherwise remove the database file left over from a previous run
if not os.path.exists(db_file):
    print("The file does not exist")
else:
    os.remove(db_file)



## Create database 

In [2]:
# Open (and, if it is missing, create) the SQLite database at the path stored in
# `db_file` by the preparation cell. Reusing that variable guarantees that the
# file removed there and the file opened here are the same one.
# Note your path to the database may be different: change `db_file` to adjust it.
conn = sqlite3.connect(db_file)

## Create tables 

First, create the tables for the airports, carriers and plane-data datasets.

In [3]:
# Read the three lookup datasets into DataFrames.
# Note your path to the data may be different
airports = pd.read_csv("C:/Users/piorizzielloa/Documents/Data Science/airports.csv")
carriers = pd.read_csv("C:/Users/piorizzielloa/Documents/Data Science/carriers.csv")
planes = pd.read_csv("C:/Users/piorizzielloa/Documents/Data Science/plane-data.csv")

# Write each DataFrame to its own table (the DataFrame index is not stored).
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# running it a second time against the same database raises ValueError because
# the tables already exist.
airports.to_sql('airports', con = conn, index = False, if_exists = 'replace')
carriers.to_sql('carriers', con = conn, index = False, if_exists = 'replace')
planes.to_sql('planes', con = conn, index = False, if_exists = 'replace')

5029

Then create the `ontime` table, which is loaded from several CSV files (one per year, 2006 to 2008).

In [11]:
# Cursor used by this cell and by the query cells below
c = conn.cursor()

# Remove an ontime table left behind by an earlier run so the load starts empty
c.execute("DROP TABLE IF EXISTS ontime")

# Schema of the ontime table that the yearly CSV files are appended into
create_ontime_sql = '''
CREATE TABLE ontime (
  Year int,
  Month int,
  DayofMonth int,
  DayOfWeek int,
  DepTime  int,
  CRSDepTime int,
  ArrTime int,
  CRSArrTime int,
  UniqueCarrier varchar(5),
  FlightNum int,
  TailNum varchar(8),
  ActualElapsedTime int,
  CRSElapsedTime int,
  AirTime int,
  ArrDelay int,
  DepDelay int,
  Origin varchar(3),
  Dest varchar(3),
  Distance int,
  TaxiIn int,
  TaxiOut int,
  Cancelled int,
  CancellationCode varchar(1),
  Diverted varchar(1),
  CarrierDelay int,
  WeatherDelay int,
  NASDelay int,
  SecurityDelay int,
  LateAircraftDelay int
)
'''
c.execute(create_ontime_sql)
conn.commit()

# Load one CSV file per year (2006, 2007, 2008) and append its rows to ontime
for year in (2006, 2007, 2008):
    ontime = pd.read_csv(f"C:/Users/piorizzielloa/Documents/Data Science/{year}.csv")
    ontime.to_sql('ontime', con = conn, if_exists = 'append', index = False)

conn.commit()

## Run queries 

Query 1: Find the plane model with the lowest average departure delay (averaged over delayed departures only, `DepDelay > 0`, excluding cancelled and diverted flights)

In [12]:
# Average departure delay per plane model, smallest average first. Only flights
# with a positive departure delay that were neither cancelled nor diverted count.
c.execute('''
SELECT model AS model, AVG(ontime.DepDelay) AS avg_delay
FROM planes JOIN ontime USING(tailnum)
WHERE ontime.Cancelled = 0 AND ontime.Diverted = 0 AND ontime.DepDelay > 0
GROUP BY model
ORDER BY avg_delay
''')

# The rows are sorted ascending, so the first one holds the lowest average
result = c.fetchone()
if result is None:
    print("No results returned from the query.")
else:
    print(result)
    print(result[0], "has the lowest associated average departure delay.")


('737-230', 12.956403269754768)
737-230 has the lowest associated average departure delay.


Query 2: Find the city that has the highest number of inbound flights (excluding canceled flights)

In [13]:
# Count non-cancelled flights per destination city, busiest city first.
# Flights are matched to cities through the destination airport's IATA code.
c.execute('''
SELECT airports.city AS city, COUNT(*) AS total
FROM airports JOIN ontime ON ontime.dest = airports.iata
WHERE ontime.Cancelled = 0
GROUP BY airports.city
ORDER BY total DESC
''')

# fetchone() returns None when the query produces no rows (for example when the
# ontime table is empty); check for that instead of failing on None[0], in the
# same way as the Query 1 cell.
top_city_row = c.fetchone()
if top_city_row is not None:
    print(top_city_row[0], "has the highest number of inbound flights (excluding canceled flights)")
else:
    print("No results returned from the query.")

Chicago has the highest number of inbound flights (excluding canceled flights)


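Query 3: Find the carrier that has the highest number of canceled flights (among United Air Lines Inc., American Airlines Inc., Pinnacle Airlines Inc. and Delta Air Lines Inc.)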

In [14]:
# Count cancelled flights per carrier, highest count first. The IN (...) filter
# restricts the comparison to the four listed carriers.
c.execute('''
SELECT carriers.Description AS carrier, COUNT(*) AS total
FROM carriers JOIN ontime ON ontime.UniqueCarrier = carriers.Code
WHERE ontime.Cancelled = 1
AND carriers.Description IN ('United Air Lines Inc.', 'American Airlines Inc.', 'Pinnacle Airlines Inc.', 'Delta Air Lines Inc.')
GROUP BY carriers.Description
ORDER BY total DESC
''')

# fetchone() returns None when the query produces no rows (for example when none
# of the listed carriers has a cancelled flight); check for that instead of
# failing on None[0], in the same way as the Query 1 cell.
top_carrier_row = c.fetchone()
if top_carrier_row is not None:
    print(top_carrier_row[0],"has the highest number of canceled flights")
else:
    print("No results returned from the query.")

American Airlines Inc. has the highest number of canceled flights


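Query 4: Find the carrier that has the highest number of canceled flights, relative to their number of total flights (among United Air Lines Inc., American Airlines Inc., Pinnacle Airlines Inc. and Delta Air Lines Inc.)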

In [15]:
# Cancellation ratio per carrier, highest ratio first.
#   q1: number of cancelled flights per carrier (numerator)
#   q2: number of all flights per carrier (denominator)
# Both subqueries are restricted to the same four carriers and joined on the
# carrier description; the counts are cast to FLOAT before dividing.
c.execute('''
SELECT
q1.carrier AS carrier, (CAST(q1.numerator AS FLOAT)/ CAST(q2.denominator AS FLOAT)) AS ratio
FROM
(
  SELECT carriers.Description AS carrier, COUNT(*) AS numerator
  FROM carriers JOIN ontime ON ontime.UniqueCarrier = carriers.Code
  WHERE ontime.Cancelled = 1 AND carriers.Description IN ('United Air Lines Inc.', 'American Airlines Inc.', 'Pinnacle Airlines Inc.', 'Delta Air Lines Inc.')
  GROUP BY carriers.Description
) AS q1 JOIN
(
  SELECT carriers.Description AS carrier, COUNT(*) AS denominator
  FROM carriers JOIN ontime ON ontime.UniqueCarrier = carriers.Code
  WHERE carriers.Description IN ('United Air Lines Inc.', 'American Airlines Inc.', 'Pinnacle Airlines Inc.', 'Delta Air Lines Inc.')
  GROUP BY carriers.Description
) AS q2 USING(carrier)
ORDER BY ratio DESC
''')

# fetchone() returns None when the query produces no rows; check for that
# instead of failing on None[0], in the same way as the Query 1 cell.
top_ratio_row = c.fetchone()
if top_ratio_row is not None:
    print(top_ratio_row[0], "has the highest number of canceled flights, relative to their number of total flights")
else:
    print("No results returned from the query.")

Pinnacle Airlines Inc. has the highest number of canceled flights, relative to their number of total flights


Remember to close the connection

In [19]:
# Close the SQLite connection opened in the "Create database" cell
conn.close()