In [1]:
import pandas as pd
import numpy as np
import scipy
import random
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import linear_model
import statsmodels.formula.api as smf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.utils import resample

I will make a model that can predict whether or not a plane will arrive on time. 

In [2]:
flights07 = pd.read_csv("flights07.csv", nrows=10000)
flights08 = pd.read_csv("flights08.csv", nrows=10000)

In [3]:
flights07.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2007,1,1,1,1232.0,1225,1341.0,1340,WN,2891,...,4,11,0,,0,0,0,0,0,0
1,2007,1,1,1,1918.0,1905,2043.0,2035,WN,462,...,5,6,0,,0,0,0,0,0,0
2,2007,1,1,1,2206.0,2130,2334.0,2300,WN,1229,...,6,9,0,,0,3,0,0,0,31
3,2007,1,1,1,1230.0,1200,1356.0,1330,WN,1355,...,3,8,0,,0,23,0,0,0,3
4,2007,1,1,1,831.0,830,957.0,1000,WN,2278,...,3,9,0,,0,0,0,0,0,0


In [4]:
# delete column of null values
del flights07['CancellationCode']

In [5]:
# drop nulls
flights07 = flights07.dropna()
flights07 = flights07.reset_index(drop=True)

In [6]:
#Make departure and arrival time columns strings so they can be converted to datetime variables
flights07['DepTime'] = flights07['DepTime'].apply(int).apply(str)
flights07['CRSDepTime'] = flights07['CRSDepTime'].apply(str)
flights07['ArrTime'] = flights07['ArrTime'].apply(int).apply(str)
flights07['CRSArrTime'] = flights07['CRSArrTime'].apply(str)

In [7]:
flights07.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2007,1,1,1,1232,1225,1341,1340,WN,2891,...,389,4,11,0,0,0,0,0,0,0
1,2007,1,1,1,1918,1905,2043,2035,WN,462,...,479,5,6,0,0,0,0,0,0,0
2,2007,1,1,1,2206,2130,2334,2300,WN,1229,...,479,6,9,0,0,3,0,0,0,31
3,2007,1,1,1,1230,1200,1356,1330,WN,1355,...,479,3,8,0,0,23,0,0,0,3
4,2007,1,1,1,831,830,957,1000,WN,2278,...,479,3,9,0,0,0,0,0,0,0


In [8]:
import datetime

In [9]:
features = pd.DataFrame()

In [10]:
deptimes = []
depproblem_index = []
for i in range(0,len(flights07['DepTime'])):
    if len(flights07['DepTime'][i])==4:
        hour = int(flights07['DepTime'][i][0:2])
        minute = int(flights07['DepTime'][i][2:4])
        a = datetime.timedelta(hours=hour,minutes=minute)
    elif len(flights07['DepTime'][i])==3:
        hour = int(flights07['DepTime'][i][0:1])
        minute = int(flights07['DepTime'][i][1:3])
        a = datetime.timedelta(hours=hour,minutes=minute)
    else:
        depproblem_index.append(i)
    deptimes.append(a)

In [11]:
crsdeptimes = []
crsdepproblem_index = []
for i in range(0,len(flights07['CRSDepTime'])):
    if len(flights07['CRSDepTime'][i])==4:
        hour = int(flights07['CRSDepTime'][i][0:2])
        minute = int(flights07['CRSDepTime'][i][2:4])
        a = datetime.timedelta(hours=hour,minutes=minute)
    elif len(flights07['CRSDepTime'][i])==3:
        hour = int(flights07['CRSDepTime'][i][0:1])
        minute = int(flights07['CRSDepTime'][i][1:3])
        a = datetime.timedelta(hours=hour,minutes=minute)
    else:
        crsdepproblem_index.append(i)
    crsdeptimes.append(a)

In [12]:
arrtimes = []
arrproblem_index = []
for i in range(0,len(flights07['ArrTime'])):
    if len(flights07['ArrTime'][i])==4:
        hour = int(flights07['ArrTime'][i][0:2])
        minute = int(flights07['ArrTime'][i][2:4])
        a = datetime.timedelta(hours=hour,minutes=minute)
    elif len(flights07['ArrTime'][i])==3:
        hour = int(flights07['ArrTime'][i][0:1])
        minute = int(flights07['ArrTime'][i][1:3])
        a = datetime.timedelta(hours=hour,minutes=minute)
    else:
        arrproblem_index.append(i)
    arrtimes.append(a)

In [13]:
crsarrtimes = []
crsarrproblem_index = []
for i in range(0,len(flights07['CRSArrTime'])):
    if len(flights07['CRSArrTime'][i])==4:
        hour = int(flights07['CRSArrTime'][i][0:2])
        minute = int(flights07['CRSArrTime'][i][2:4])
        a = datetime.datetime.min + datetime.timedelta(hours=hour,minutes=minute)
    elif len(flights07['CRSArrTime'][i])==3:
        hour = int(flights07['CRSArrTime'][i][0:1])
        minute = int(flights07['CRSArrTime'][i][1:3])
        a = datetime.datetime.min + datetime.timedelta(hours=hour,minutes=minute)
    else:
        crsarrproblem_index.append(i)
    crsarrtimes.append(a)

In [14]:
problem_index = np.concatenate((arrproblem_index,crsarrproblem_index))

In [15]:
#Remove Duplicates by Converting from set to list
problem_index = set(problem_index)
problem_index = list(problem_index)
problem_index.sort()

In [16]:
flights07 = flights07.drop(flights07.index[problem_index])
flights07 = flights07.reset_index(drop=True)

In [17]:
#list of actual departure times
deptimes = []

for i in range(0,len(flights07['DepTime'])):
    if len(flights07['DepTime'][i])==4:
        hour = int(flights07['DepTime'][i][0:2])
        minute = int(flights07['DepTime'][i][2:4])
        a = datetime.timedelta(hours=hour,minutes=minute)
    else:
        hour = int(flights07['DepTime'][i][0:1])
        minute = int(flights07['DepTime'][i][1:3])
        a = datetime.timedelta(hours=hour,minutes=minute)
    deptimes.append(a)

#list of scheduled departure times
crsdeptimes = []

for i in range(0,len(flights07['CRSDepTime'])):
    if len(flights07['CRSDepTime'][i])==4:
        hour = int(flights07['CRSDepTime'][i][0:2])
        minute = int(flights07['CRSDepTime'][i][2:4])
        a = datetime.timedelta(hours=hour,minutes=minute)
    else:
        hour = int(flights07['CRSDepTime'][i][0:1])
        minute = int(flights07['CRSDepTime'][i][1:3])
        a = datetime.timedelta(hours=hour,minutes=minute)
    crsdeptimes.append(a)

#list of actual arrival times
arrtimes = []

for i in range(0,len(flights07['ArrTime'])):
    if len(flights07['ArrTime'][i])==4:
        hour = int(flights07['ArrTime'][i][0:2])
        minute = int(flights07['ArrTime'][i][2:4])
        a =  datetime.timedelta(hours=hour,minutes=minute)
    else:
        hour = int(flights07['ArrTime'][i][0:1])
        minute = int(flights07['ArrTime'][i][1:3])
        a = datetime.timedelta(hours=hour,minutes=minute)
    arrtimes.append(a)

#list of scheduled arrival times
crsarrtimes = []

for i in range(0,len(flights07['CRSArrTime'])):
    if len(flights07['CRSArrTime'][i])==4:
        hour = int(flights07['CRSArrTime'][i][0:2])
        minute = int(flights07['CRSArrTime'][i][2:4])
        a =  datetime.timedelta(hours=hour,minutes=minute)
    else:
        hour = int(flights07['CRSArrTime'][i][0:1])
        minute = int(flights07['CRSArrTime'][i][1:3])
        a = datetime.timedelta(hours=hour,minutes=minute)
    crsarrtimes.append(a)

In [18]:
features['deptimes'] = deptimes
features['crsdeptimes'] = crsdeptimes
features['arrtimes'] = arrtimes
features['crsarrtimes'] = crsarrtimes

In [28]:
features['arrtimediff'] = features['arrtimes'] - features['crsarrtimes'] 
midnight_times = []
for i in range(len(features['arrtimes'])):
    if features['arrtimediff'][i] < datetime.timedelta(minutes=0):
        midnight_times.append(i)
for i in midnight_times:
    features['arrtimediff'][i] = features['crsarrtimes'][i] - features['arrtimes'][i] 
features['arrtimediff'].head()

0   00:01:00
1   00:08:00
2   00:34:00
3   00:26:00
4   00:03:00
Name: arrtimediff, dtype: timedelta64[ns]

In [29]:
features['late'] = np.where(features['arrtimediff']>=datetime.timedelta(minutes=30),1,0)

In [None]:
late = datetime.datetime.min + datetime.timedelta(hours=0,minutes=30)
np.where((features['arrtimes'] - features['crsarrtimes'])>=late, 1,0)