In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

#Input: Determine data.
data_no= 1 #0: Safair on El Paso, 1: Safair on Houston, 2: Delta on El Paso, 3:Delta on Houston
delay_type = 0 #0: Departure delay, 1 Airtime delay, 2: Arrival delay

# Pre-coded from the raw data.
name_of_files = ['FA_Elpaso','FA_IAH','ElPaso_Delta','IAH_Delta']
nrows_data = [458,2609,1837,11273]
Destination=[['DEN','ORD','LAS'],['DEN','RDU','AUS','MCO','LAS','PHL','HRL'],['ATL'],['ATL','SLC','DTW','LGA','MSP','JFK','LAX']]
Delays = ['Dep_Delay15','Air_Delay15','Arv_Delay15']


# Read the data
data=pd.read_csv('./'+name_of_files[data_no]+'.csv', header=7, nrows=nrows_data[data_no])
# Delete unused column
dropname = ['Unnamed: '+str(idx)  for idx in range(17,30)]
data.drop(dropname, axis=1, inplace=True)

# Add new variables from the raw data.
# Departure Delay: Actual departure time - Scheduled departure time.
data['Dep_Delay'] = data['Actual departure time']-data['Scheduled departure time']
# Airtime Delay: Actual elpased time - Scheduled elapsed time.
data['Airtime_Delay'] = data['Actual elapsed time (Minutes)']-data['Scheduled elapsed time (Minutes)']
# Arrival Delay: Taxi out time
data['Arv_Delay'] = data['Taxi-Out time (Minutes)']


# Hot coding for logistic regression
# If delays are greater than 15 minutes, give 1. Otherwise, give 0.
data.loc[(data['Dep_Delay'] >= 15), 'Dep_Delay15'] = 1
data['Dep_Delay15'] = data['Dep_Delay15'].fillna(0)
data.loc[(data['Airtime_Delay'] >= 15), 'Air_Delay15'] = 1
data['Air_Delay15'] = data['Air_Delay15'].fillna(0)
data.loc[(data['Arv_Delay'] >= 15), 'Arv_Delay15'] = 1
data['Arv_Delay15'] = data['Arv_Delay15'].fillna(0)

# Make a day of week for each day.
data['Weekday']=pd.to_datetime(data['Date (MM/DD/YYYY)']).dt.dayofweek

# Hot coding for 24 hours
#( 03am-09am,9am-3pm,3pm-9pm,9pm-03am as zone 1,2,3,4), 
data.loc[((data['Scheduled departure time'] >= 300) & (data['Scheduled departure time'] < 900)), 'Dep_Timezone'] = 1
data.loc[((data['Scheduled departure time'] >= 900) & (data['Scheduled departure time'] < 1500)), 'Dep_Timezone'] = 2
data.loc[((data['Scheduled departure time'] >= 1500)& (data['Scheduled departure time'] < 2100)), 'Dep_Timezone'] = 3
data['Dep_Timezone'] = data['Dep_Timezone'].fillna(4)

# Hot coding for destination
for item in Destination[data_no]:
    data.loc[(data['Destination Airport'] == item), 'Dest_'+item] = 1
    data['Dest_'+item] = data['Dest_'+item].fillna(0)

# Second thing: Prediction model (Logistic Regression)
# x : 
#(1) The day of week, 
#(2) Scheduled departure time( Classify it as 03am-09am,9am-3pm,3pm-9pm,9pm-03am as 1,2,3,4), 
#(3) Destinations (hot coded as 0,1 for each destination)
# y: delayed or not (15 minutes or more delay = 1, otherwise 0)


#Logistic regression
X = data[['Weekday','Dep_Timezone']+['Dest_'+item for item in Destination[data_no]]].copy()
y = data[[Delays[delay_type]]].copy()
clf = LogisticRegression(random_state=0).fit(X, y)


# finalized results: Score of the logistic regression
clf.score(X,y)

# Implication
# For airline: suggest better schedule or time for cushion they can use in a busy time.
# For customer: suggest a prediction they can use when they reserve a flight.



  return f(*args, **kwargs)


0.8041395170563435