In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import sqlite3

## Store CSV into DataFrame, Extract & Transform
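*We have four CSV files:*
* Airline - has information about airline_id (IATA code) and airline name;
* Airport - has information about airport_id, airport, city, state, country, latitude, longitude;
* Flights - has the flight records (date, airline, origin/destination airport, delay and cancellation fields);
* Tweets - has tweets about airlines (tweet_id, airline_sentiment, negativereason, airline, tweet_created).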

In [2]:
# Read the airline lookup CSV (path is relative to the notebook) and display it.
airline_file = "./Resources/airlines.csv"
airline_df = pd.read_csv(filepath_or_buffer=airline_file)
airline_df

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [5]:
# Rename the two columns positionally, then key the table by the airline code.
# The indexed frame is rebound to the same name rather than mutated in place.
airline_columns = ['AIRLINE_ID', 'AIRLINE']
airline_df.columns = airline_columns
airline_df = airline_df.set_index('AIRLINE_ID')
airline_df

Unnamed: 0_level_0,AIRLINE
AIRLINE_ID,Unnamed: 1_level_1
UA,United Air Lines Inc.
AA,American Airlines Inc.
US,US Airways Inc.
F9,Frontier Airlines Inc.
B6,JetBlue Airways
OO,Skywest Airlines Inc.
AS,Alaska Airlines Inc.
NK,Spirit Air Lines
WN,Southwest Airlines Co.
DL,Delta Air Lines Inc.


In [None]:
# Connect to sqlite

In [6]:
# In-memory SQLite database; it lives only as long as this `engine` object.
engine = create_engine('sqlite://', echo=False)

# Write the airline lookup table (the AIRLINE_ID index is stored as a column).
airline_df.to_sql('airline', con=engine)

# Read the rows back through an explicit, scoped connection.
# `Engine.execute()` is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# `Connection.exec_driver_sql()` accepts the raw SQL string on both.
with engine.connect() as conn:
    airline_rows = conn.exec_driver_sql("select * from airline").fetchall()
airline_rows

[('UA', 'United Air Lines Inc.'),
 ('AA', 'American Airlines Inc.'),
 ('US', 'US Airways Inc.'),
 ('F9', 'Frontier Airlines Inc.'),
 ('B6', 'JetBlue Airways'),
 ('OO', 'Skywest Airlines Inc.'),
 ('AS', 'Alaska Airlines Inc.'),
 ('NK', 'Spirit Air Lines'),
 ('WN', 'Southwest Airlines Co.'),
 ('DL', 'Delta Air Lines Inc.'),
 ('EV', 'Atlantic Southeast Airlines'),
 ('HA', 'Hawaiian Airlines Inc.'),
 ('MQ', 'American Eagle Airlines Inc.'),
 ('VX', 'Virgin America')]

# Airport

In [10]:
# Load the airport lookup CSV, rename its seven columns positionally and
# index the table by airport code.
airports_file = "./Resources/airports.csv"
airport_columns = ['AIRPORT_ID','AIRPORT','CITY','STATE','COUNTRY','LATITUDE','LONGITUDE']
airports_df = pd.read_csv(airports_file)
airports_df.columns = airport_columns
airports_df = airports_df.set_index('AIRPORT_ID')
airports_df.head()

Unnamed: 0_level_0,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
AIRPORT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [11]:
# Write the airport lookup table to the existing SQLite engine.
# if_exists='replace' lets this cell be re-run on the same engine; the default
# ('fail') raises ValueError once the table already exists.
airports_df.to_sql('airports', con=engine, if_exists='replace')

# Read the rows back through an explicit, scoped connection.
# `Engine.execute()` is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# `Connection.exec_driver_sql()` accepts the raw SQL string on both.
with engine.connect() as conn:
    airport_rows = conn.exec_driver_sql("select * from airports").fetchall()
airport_rows

[('ABE', 'Lehigh Valley International Airport', 'Allentown', 'PA', 'USA', 40.652359999999994, -75.4404),
 ('ABI', 'Abilene Regional Airport', 'Abilene', 'TX', 'USA', 32.41132, -99.6819),
 ('ABQ', 'Albuquerque International Sunport', 'Albuquerque', 'NM', 'USA', 35.04022, -106.60918999999998),
 ('ABR', 'Aberdeen Regional Airport', 'Aberdeen', 'SD', 'USA', 45.449059999999996, -98.42183),
 ('ABY', 'Southwest Georgia Regional Airport', 'Albany', 'GA', 'USA', 31.53552, -84.19447),
 ('ACK', 'Nantucket Memorial Airport', 'Nantucket', 'MA', 'USA', 41.25305, -70.06018),
 ('ACT', 'Waco Regional Airport', 'Waco', 'TX', 'USA', 31.611290000000004, -97.23052),
 ('ACV', 'Arcata Airport', 'Arcata/Eureka', 'CA', 'USA', 40.978120000000004, -124.10862),
 ('ACY', 'Atlantic City International Airport', 'Atlantic City', 'NJ', 'USA', 39.45758, -74.57717),
 ('ADK', 'Adak Airport', 'Adak', 'AK', 'USA', 51.877959999999995, -176.64603),
 ('ADQ', 'Kodiak Airport', 'Kodiak', 'AK', 'USA', 57.74997, -152.49386),
 ('A

# Flight

In [None]:
# Read the flights CSV (path is relative to the notebook) and preview the first rows.
flights_file = "./Resources/flights.csv"
flights_df = pd.read_csv(filepath_or_buffer=flights_file)
flights_df.head(n=5)

#### Check Datatypes of all the columns

In [None]:
# Show the dtype pandas assigned to each flights column.
flights_df.dtypes

In [None]:
# Discard the five delay columns listed below; every other column is kept.
delay_columns = [
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]
flights_df = flights_df.drop(columns=delay_columns)
flights_df

In [None]:
# Collapse YEAR / MONTH / DAY into a single datetime column named DATE
# (appended as the last column), then discard the three source columns.
date_parts = ['YEAR','MONTH','DAY']
flights_df = (
    flights_df
    .assign(DATE=pd.to_datetime(flights_df[date_parts]))
    .drop(columns=date_parts)
)
flights_df.head()

In [None]:
# Non-null count of every column for each DAY_OF_WEEK value.
# Author's reading of this table: Saturday has the least flights,
# Thursday has the most flights.
per_weekday_counts = flights_df.groupby(by='DAY_OF_WEEK').count()
per_weekday_counts

In [None]:
# Change Day of week from num to string.
#
# One mapping-based `replace` builds the new column in a single pass instead of
# writing strings into the numeric column through seven masked `.loc`
# assignments (setting values of an incompatible dtype is deprecated in
# pandas >= 2.1). Values that are not keys of the mapping pass through
# unchanged, so re-running the cell after the conversion is a no-op.
# NOTE(review): the 1 = Monday ... 7 = Sunday encoding is taken from the
# original assignments — confirm against the dataset documentation.
day_names = {
    1: "Monday",
    2: "Tuesday",
    3: "Wednesday",
    4: "Thursday",
    5: "Friday",
    6: "Saturday",
    7: "Sunday",
}
flights_df["DAY_OF_WEEK"] = flights_df["DAY_OF_WEEK"].replace(day_names)

In [None]:
# List the column labels currently present in the flights frame.
flights_df.columns

In [None]:
# Display the flights frame in its current state.
flights_df

### Delay Flights

In [None]:
# The total df of delay flights: rows whose ARRIVAL_DELAY is strictly positive.
#
# The previous filter (`!= 0`) also kept rows with a negative ARRIVAL_DELAY and
# rows where ARRIVAL_DELAY is missing, because `NaN != 0` evaluates to True.
# NOTE(review): assumes a negative ARRIVAL_DELAY means an early arrival and a
# missing value means the flight never arrived (e.g. cancelled) — confirm.
Delay_flights = flights_df.loc[flights_df['ARRIVAL_DELAY'] > 0].reset_index(drop=True)
Delay_flights

# Connect to sql

In [None]:
# Reuse the SQLite engine created in the earlier "Connect to sqlite" step.
# Calling create_engine('sqlite://') again would open a brand-new, empty
# in-memory database and rebind `engine` to it, so the `airline` and
# `airports` tables written above would no longer be reachable.
# (`create_engine` is already imported in the first cell of the notebook.)
engine

In [None]:
# Write the delayed-flight table to SQLite.
#  * if_exists='replace' lets this cell be re-run on the same engine; the
#    default ('fail') raises ValueError once the table already exists.
#  * chunksize inserts the rows in batches rather than in one statement
#    (the flights table may be large — TODO confirm the batch size suits it).
Delay_flights.to_sql('delayFlights', con=engine, if_exists='replace', chunksize=50_000)

# Preview a handful of rows instead of fetching the entire table into the
# notebook output. `Engine.execute()` is deprecated in SQLAlchemy 1.4 and
# removed in 2.0; `Connection.exec_driver_sql()` accepts the raw SQL string
# on both.
with engine.connect() as conn:
    delay_preview = conn.exec_driver_sql("select * from delayFlights limit 5").fetchall()
delay_preview

### CANCELLED FLIGHT

In [None]:
# Keep only the rows whose CANCELLED value equals 1 and renumber the index from zero.
is_cancelled = flights_df['CANCELLED'].eq(1)
cancelled_flights = flights_df[is_cancelled].reset_index(drop=True)

In [None]:
# Total number of cancelled flights (non-null entries in the CANCELLED column).
cancelled_flights['CANCELLED'].count()

In [None]:
# Preview the first five cancelled flights (five is the default for head()).
cancelled_flights.head()

In [None]:
# Show the dtype of each column in the cancelled-flights frame.
cancelled_flights.dtypes

In [None]:
# Keep only the columns listed below (in this order); all others are dropped.
kept_columns = [
    'AIRLINE',
    'DATE',
    'DAY_OF_WEEK',
    'TAIL_NUMBER',
    'FLIGHT_NUMBER',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
]
cancelled_flights = cancelled_flights.loc[:, kept_columns]
cancelled_flights.head(10)

In [None]:
# Per-airline non-null counts of the cancelled flights, ordered from the
# largest DATE count to the smallest.
cancellations_by_airline = cancelled_flights.groupby('AIRLINE').count()
cancellations_by_airline.sort_values(by='DATE', ascending=False)

## Tweets data

In [None]:
# Load the tweets CSV (path is relative to the notebook).
tweets_file = "./Resources/Tweets.csv"
tweets_df = pd.read_csv(tweets_file)

# Only the last expression of a cell is rendered automatically, so the preview
# has to go through display() (an IPython built-in) or its result is silently
# discarded.
display(tweets_df.head(10))
tweets_df.dtypes

In [None]:
# Keep only the five columns used in the rest of the analysis.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells never operate on a slice of the loaded data
# (which can raise SettingWithCopyWarning while the parent frame is still
# referenced).
tweet_columns = ['tweet_id','airline_sentiment','negativereason','airline','tweet_created']
tweets_df = tweets_df[tweet_columns].copy()
tweets_df

In [None]:
# Derive a DATE column holding only the calendar date of `tweet_created`,
# and replace `tweet_created` with it (DATE ends up as the last column).
tweet_dates = pd.DatetimeIndex(pd.to_datetime(tweets_df['tweet_created'])).date
tweets_df = (
    tweets_df
    .drop(columns=['tweet_created'])
    .assign(DATE=tweet_dates)
)

In [None]:
# Display the tweets frame in its current state.
tweets_df

In [None]:
# Clean the tweet_created column to get the Date
# tweets_df['DATE'] = tweets_df['tweet_created'].str[0:10]
# tweets_df['DATE'] = pd.to_datetime(tweets_df['DATE'])
# tweets_df.head()
# tweets_df = tweets_df.drop(columns=['tweet_created'])
# tweets_df.dtypes

In [None]:
# Number of missing (NaN) entries in each column.
missing_mask = tweets_df.isna()
missing_mask.sum()

In [None]:
# If we want to change NaN value to 0
# tweets_df.loc[tweets_df['set_of_numbers'].isnull(),'value_is_NaN'] = 'Yes'
# tweets_df.loc[tweets_df['set_of_numbers'].notnull(),'value_is_NaN'] = 'No'

# count_nan = tweets_df.loc[tweets_df['value_is_NaN']=='Yes'].count()
# print (count_nan)

#### How many airlines are mentioned in tweets data?

In [None]:
# Non-null count of every column for each distinct `airline` value.
tweets_per_airline = tweets_df.groupby(by='airline').count()
tweets_per_airline

### Add AIRLINE_ID to tweets_df

In [None]:
# Translate the airline names used in the tweets into airline codes.
# Names that are not keys of the mapping get NaN in AIRLINE_ID.
airline_codes = {
    "American": "AA",
    "Delta": "DL",
    "Southwest": "WN",
    "US Airways": "US",
    "United": "UA",
    "Virgin America": "VX",
}
tweets_df = tweets_df.assign(AIRLINE_ID=tweets_df["airline"].map(airline_codes))

In [None]:
# Non-null count of every column for each AIRLINE_ID value.
tweets_per_airline_id = tweets_df.groupby(by='AIRLINE_ID').count()
tweets_per_airline_id