### Import a set of initial libraries 

In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.set_option('max_columns', None)

### Load the csv file and show the raw data

In [2]:
df = pd.read_csv("./data/flights_random_sample_400000.csv")

In [3]:
df

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-03-07,AA,AA,AA,311,AA,N906NN,311,13303,MIA,"Miami, FL",10529,BDL,"Hartford, CT",1940,,,,,,,2249,,,1.0,B,0.0,N,189.0,,,1.0,1194.0,,,,,,,,,
1,2018-03-07,AA,AA,AA,332,AA,N946NN,332,12953,LGA,"New York, NY",13930,ORD,"Chicago, IL",1330,,,,,,,1512,,,1.0,B,0.0,N,162.0,,,1.0,733.0,,,,,,,,,
2,2018-03-07,AA,AA,AA,465,AA,N200UU,465,14107,PHX,"Phoenix, AZ",14679,SAN,"San Diego, CA",835,833.0,-2.0,13.0,846.0,838.0,2.0,851,840.0,-11.0,0.0,,0.0,N,76.0,67.0,52.0,1.0,304.0,,,,,,,,,
3,2018-03-07,AA,AA,AA,550,AA,N947UW,550,14100,PHL,"Philadelphia, PA",10529,BDL,"Hartford, CT",2050,,,,,,,2156,,,1.0,B,0.0,N,66.0,,,1.0,196.0,,,,,,,,,
4,2018-03-07,AA,AA,AA,591,AA,N833AW,591,11057,CLT,"Charlotte, NC",11278,DCA,"Washington, DC",1431,1537.0,66.0,16.0,1553.0,1648.0,3.0,1559,1651.0,52.0,0.0,,0.0,N,88.0,74.0,55.0,1.0,331.0,52.0,0.0,0.0,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397919,2018-03-07,UA,UA_CODESHARE,UA,6215,YV,N88335,6215,14570,RNO,"Reno, NV",12266,IAH,"Houston, TX",1245,1256.0,11.0,13.0,1309.0,1817.0,16.0,1825,1833.0,8.0,0.0,,0.0,N,220.0,217.0,188.0,1.0,1530.0,,,,,,,,,
397920,2018-03-07,AA,AA_CODESHARE,AA,4441,YX,,4441,14100,PHL,"Philadelphia, PA",11278,DCA,"Washington, DC",2110,,,,,,,2218,,,1.0,B,0.0,N,68.0,,,1.0,119.0,,,,,,,,,
397921,2018-03-07,AA,AA_CODESHARE,AA,4640,YX,N417YX,4640,13303,MIA,"Miami, FL",10397,ATL,"Atlanta, GA",1607,1555.0,-12.0,16.0,1611.0,1744.0,11.0,1815,1755.0,-20.0,0.0,,0.0,N,128.0,120.0,93.0,1.0,594.0,,,,,,,,,
397922,2018-03-07,AA,AA_CODESHARE,AA,4734,YX,N433YX,4734,14122,PIT,"Pittsburgh, PA",13303,MIA,"Miami, FL",1900,1855.0,-5.0,12.0,1907.0,2128.0,9.0,2154,2137.0,-17.0,0.0,,0.0,N,174.0,162.0,141.0,1.0,1013.0,,,,,,,,,


### Create the columns 'year', 'month', 'day' separating the 'fl_date' column and after that conert it back to a datetime format

In [4]:
df[["year", "month" ,"day"]] = df["fl_date"].str.split("-", expand = True)

In [5]:
df1 = df.pop('year')
df2 = df.pop('month')
df3 = df.pop('day')
df.insert(1, "year", df1)
df.insert(2, "month", df2)
df.insert(3, "day", df3)

In [6]:
df['date'] = pd.to_datetime(df[['year','month', 'day']])
df.drop(['fl_date'], axis=1)

Unnamed: 0,year,month,day,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,date
0,2018,03,07,AA,AA,AA,311,AA,N906NN,311,13303,MIA,"Miami, FL",10529,BDL,"Hartford, CT",1940,,,,,,,2249,,,1.0,B,0.0,N,189.0,,,1.0,1194.0,,,,,,,,,,2018-03-07
1,2018,03,07,AA,AA,AA,332,AA,N946NN,332,12953,LGA,"New York, NY",13930,ORD,"Chicago, IL",1330,,,,,,,1512,,,1.0,B,0.0,N,162.0,,,1.0,733.0,,,,,,,,,,2018-03-07
2,2018,03,07,AA,AA,AA,465,AA,N200UU,465,14107,PHX,"Phoenix, AZ",14679,SAN,"San Diego, CA",835,833.0,-2.0,13.0,846.0,838.0,2.0,851,840.0,-11.0,0.0,,0.0,N,76.0,67.0,52.0,1.0,304.0,,,,,,,,,,2018-03-07
3,2018,03,07,AA,AA,AA,550,AA,N947UW,550,14100,PHL,"Philadelphia, PA",10529,BDL,"Hartford, CT",2050,,,,,,,2156,,,1.0,B,0.0,N,66.0,,,1.0,196.0,,,,,,,,,,2018-03-07
4,2018,03,07,AA,AA,AA,591,AA,N833AW,591,11057,CLT,"Charlotte, NC",11278,DCA,"Washington, DC",1431,1537.0,66.0,16.0,1553.0,1648.0,3.0,1559,1651.0,52.0,0.0,,0.0,N,88.0,74.0,55.0,1.0,331.0,52.0,0.0,0.0,0.0,0.0,,,,,2018-03-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397919,2018,03,07,UA,UA_CODESHARE,UA,6215,YV,N88335,6215,14570,RNO,"Reno, NV",12266,IAH,"Houston, TX",1245,1256.0,11.0,13.0,1309.0,1817.0,16.0,1825,1833.0,8.0,0.0,,0.0,N,220.0,217.0,188.0,1.0,1530.0,,,,,,,,,,2018-03-07
397920,2018,03,07,AA,AA_CODESHARE,AA,4441,YX,,4441,14100,PHL,"Philadelphia, PA",11278,DCA,"Washington, DC",2110,,,,,,,2218,,,1.0,B,0.0,N,68.0,,,1.0,119.0,,,,,,,,,,2018-03-07
397921,2018,03,07,AA,AA_CODESHARE,AA,4640,YX,N417YX,4640,13303,MIA,"Miami, FL",10397,ATL,"Atlanta, GA",1607,1555.0,-12.0,16.0,1611.0,1744.0,11.0,1815,1755.0,-20.0,0.0,,0.0,N,128.0,120.0,93.0,1.0,594.0,,,,,,,,,,2018-03-07
397922,2018,03,07,AA,AA_CODESHARE,AA,4734,YX,N433YX,4734,14122,PIT,"Pittsburgh, PA",13303,MIA,"Miami, FL",1900,1855.0,-5.0,12.0,1907.0,2128.0,9.0,2154,2137.0,-17.0,0.0,,0.0,N,174.0,162.0,141.0,1.0,1013.0,,,,,,,,,,2018-03-07


### Define the function "time_format" and convert military time to datetime.time

In [7]:
import datetime

# Function that convert the 'HHMM' string to datetime.time "1940 -> 19:40:00"
def time_format(chain):
    if pd.isnull(chain):
        return np.nan
    else:
        if chain == 2400: chain = 0
        chain = "{0:04d}".format(int(chain))
        time = datetime.time(int(chain[0:2]), int(chain[2:4]))
        return time

### Convert "crs_dep_time", "dep_time", "crs_arr_time" and "arr_time" from military time to datetime.time

In [8]:
df['crs_dep_time'] = df['crs_dep_time'].apply(time_format)

In [9]:
df['dep_time'] = df['dep_time'].apply(time_format)

In [10]:
df['crs_arr_time'] = df['crs_arr_time'].apply(time_format)

In [11]:
df['arr_time'] = df['arr_time'].apply(time_format)

### Remove some variables that wont be useful for further analysis

In [12]:
variables_to_remove = ['taxi_out',
                       'taxi_in',
                       'wheels_on',
                       'wheels_off',
                       'date',
                       'nas_delay',
                       'security_delay',
                       'carrier_delay',
                       'late_aircraft_delay',
                       'weather_delay',
                       'diverted',
                       'cancelled',
                       'cancellation_code',
                       'mkt_carrier_fl_num',
                       'tail_num',
                       'mkt_unique_carrier',
                       'branded_code_share',
                       'mkt_carrier',
                       'op_carrier_fl_num',
                       'dup',
                       'flights',
                       'first_dep_time',
                       'total_add_gtime',
                       'longest_add_gtime',
                       'no_name'
                      ]

In [13]:
df.drop(variables_to_remove, axis = 1, inplace = True)

### Rearrange the columns on the dataframe and show it

In [14]:
df = df[['year',
         'month',
         'day',
         'op_unique_carrier',
         'origin_city_name',
         'origin',
         'origin_airport_id',
         'dest_city_name',
         'dest',
         'dest_airport_id',
         'distance',
         'air_time',
         'crs_dep_time',
         'dep_time',
         'dep_delay',
         'crs_arr_time',
         'arr_time',
         'arr_delay',
         'crs_elapsed_time',
         'actual_elapsed_time'
        ]]

In [15]:
df

Unnamed: 0,year,month,day,op_unique_carrier,origin_city_name,origin,origin_airport_id,dest_city_name,dest,dest_airport_id,distance,air_time,crs_dep_time,dep_time,dep_delay,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time
0,2018,03,07,AA,"Miami, FL",MIA,13303,"Hartford, CT",BDL,10529,1194.0,,19:40:00,,,22:49:00,,,189.0,
1,2018,03,07,AA,"New York, NY",LGA,12953,"Chicago, IL",ORD,13930,733.0,,13:30:00,,,15:12:00,,,162.0,
2,2018,03,07,AA,"Phoenix, AZ",PHX,14107,"San Diego, CA",SAN,14679,304.0,52.0,08:35:00,08:33:00,-2.0,08:51:00,08:40:00,-11.0,76.0,67.0
3,2018,03,07,AA,"Philadelphia, PA",PHL,14100,"Hartford, CT",BDL,10529,196.0,,20:50:00,,,21:56:00,,,66.0,
4,2018,03,07,AA,"Charlotte, NC",CLT,11057,"Washington, DC",DCA,11278,331.0,55.0,14:31:00,15:37:00,66.0,15:59:00,16:51:00,52.0,88.0,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397919,2018,03,07,YV,"Reno, NV",RNO,14570,"Houston, TX",IAH,12266,1530.0,188.0,12:45:00,12:56:00,11.0,18:25:00,18:33:00,8.0,220.0,217.0
397920,2018,03,07,YX,"Philadelphia, PA",PHL,14100,"Washington, DC",DCA,11278,119.0,,21:10:00,,,22:18:00,,,68.0,
397921,2018,03,07,YX,"Miami, FL",MIA,13303,"Atlanta, GA",ATL,10397,594.0,93.0,16:07:00,15:55:00,-12.0,18:15:00,17:55:00,-20.0,128.0,120.0
397922,2018,03,07,YX,"Pittsburgh, PA",PIT,14122,"Miami, FL",MIA,13303,1013.0,141.0,19:00:00,18:55:00,-5.0,21:54:00,21:37:00,-17.0,174.0,162.0


### Check for missing values, drop all the missing values and check again to confirm

In [16]:
missing_df = df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(df.shape[0]-missing_df['missing values'])/df.shape[0]*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

Unnamed: 0,variable,missing values,filling factor (%)
0,air_time,7864,98.023743
1,arr_delay,7763,98.049125
2,actual_elapsed_time,7707,98.063198
3,arr_time,6825,98.284848
4,dep_delay,6538,98.356973
5,dep_time,6420,98.386627
6,crs_arr_time,0,100.0
7,crs_dep_time,0,100.0
8,distance,0,100.0
9,year,0,100.0


In [17]:
df.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


In [18]:
missing_df = df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(df.shape[0]-missing_df['missing values'])/df.shape[0]*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

Unnamed: 0,variable,missing values,filling factor (%)
0,year,0,100.0
1,arr_delay,0,100.0
2,arr_time,0,100.0
3,crs_arr_time,0,100.0
4,dep_delay,0,100.0
5,dep_time,0,100.0
6,crs_dep_time,0,100.0
7,air_time,0,100.0
8,distance,0,100.0
9,dest_airport_id,0,100.0


### Use lambda to convert "crs_dep_time" and "dep_time" to minutes

In [19]:
# convert from hour to minutes
fct = lambda x:x.hour*60+x.minute

In [20]:
df['dep_time_min'] = df['dep_time'].apply(fct)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dep_time_min'] = df['dep_time'].apply(fct)


In [21]:
df['crs_dep_time_min'] = df['crs_dep_time'].apply(fct)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['crs_dep_time_min'] = df['crs_dep_time'].apply(fct)


In [22]:
df

Unnamed: 0,year,month,day,op_unique_carrier,origin_city_name,origin,origin_airport_id,dest_city_name,dest,dest_airport_id,distance,air_time,crs_dep_time,dep_time,dep_delay,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,dep_time_min,crs_dep_time_min
2,2018,03,07,AA,"Phoenix, AZ",PHX,14107,"San Diego, CA",SAN,14679,304.0,52.0,08:35:00,08:33:00,-2.0,08:51:00,08:40:00,-11.0,76.0,67.0,513,515
4,2018,03,07,AA,"Charlotte, NC",CLT,11057,"Washington, DC",DCA,11278,331.0,55.0,14:31:00,15:37:00,66.0,15:59:00,16:51:00,52.0,88.0,74.0,937,871
5,2018,03,07,AA,"Fort Lauderdale, FL",FLL,11697,"Charlotte, NC",CLT,11057,632.0,91.0,06:03:00,05:57:00,-6.0,08:09:00,08:05:00,-4.0,126.0,128.0,357,363
6,2018,03,07,AA,"Charlotte, NC",CLT,11057,"Boston, MA",BOS,10721,728.0,92.0,11:35:00,11:29:00,-6.0,13:52:00,13:24:00,-28.0,137.0,115.0,689,695
7,2018,03,07,AA,"Charlotte, NC",CLT,11057,"Tulsa, OK",TUL,15370,842.0,133.0,18:20:00,18:12:00,-8.0,20:02:00,19:42:00,-20.0,162.0,150.0,1092,1100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397918,2018,03,07,YV,"Cincinnati, OH",CVG,11193,"Washington, DC",IAD,12264,388.0,61.0,14:50:00,14:45:00,-5.0,16:17:00,16:08:00,-9.0,87.0,83.0,885,890
397919,2018,03,07,YV,"Reno, NV",RNO,14570,"Houston, TX",IAH,12266,1530.0,188.0,12:45:00,12:56:00,11.0,18:25:00,18:33:00,8.0,220.0,217.0,776,765
397921,2018,03,07,YX,"Miami, FL",MIA,13303,"Atlanta, GA",ATL,10397,594.0,93.0,16:07:00,15:55:00,-12.0,18:15:00,17:55:00,-20.0,128.0,120.0,955,967
397922,2018,03,07,YX,"Pittsburgh, PA",PIT,14122,"Miami, FL",MIA,13303,1013.0,141.0,19:00:00,18:55:00,-5.0,21:54:00,21:37:00,-17.0,174.0,162.0,1135,1140


### Define a function to get statistical parameters in order to obtain the mean delay for each carrier, origin airport, destination airport.

In [23]:
# function that extract statistical parameters from a grouby objet:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}


# Creation of a dataframe with statitical info on each airline:
global_stats_carrier = df['arr_delay'].groupby(df['op_unique_carrier']).apply(get_stats).unstack()
global_stats_carrier = global_stats_carrier.sort_values('count')
global_stats_carrier.reset_index(inplace=True)
global_stats_carrier.head(5)

Unnamed: 0,op_unique_carrier,min,max,count,mean
0,9K,-24.0,29.0,45.0,-4.555556
1,KS,-38.0,254.0,71.0,26.422535
2,EM,-28.0,170.0,245.0,4.946939
3,VX,-54.0,202.0,430.0,2.774419
4,C5,-49.0,1317.0,2605.0,24.450288


In [24]:
# Creation of a dataframe with statitical info on each origin airpot:
global_stats_origin_airport = df['dep_delay'].groupby(df['origin']).apply(get_stats).unstack()
global_stats_origin_airport = global_stats_origin_airport.sort_values('count')
global_stats_origin_airport.reset_index(inplace=True)
global_stats_origin_airport.head(5)

Unnamed: 0,origin,min,max,count,mean
0,IFP,12.0,12.0,1.0,12.0
1,AKN,-6.0,-6.0,1.0,-6.0
2,HYA,-8.0,1.0,3.0,-2.333333
3,GST,4.0,27.0,3.0,18.333333
4,DLG,-13.0,0.0,3.0,-4.333333


In [25]:
# Creation of a dataframe with statitical info on each dest airport:
global_stats_dest_airport = df['arr_delay'].groupby(df['dest']).apply(get_stats).unstack()
global_stats_dest_airport = global_stats_dest_airport.sort_values('count')
global_stats_dest_airport.reset_index(inplace=True)
global_stats_dest_airport.head(5)

Unnamed: 0,dest,min,max,count,mean
0,HYA,-9.0,49.0,2.0,20.0
1,IFP,-17.0,-15.0,2.0,-16.0
2,ROP,-15.0,0.0,3.0,-8.333333
3,DLG,-6.0,86.0,3.0,35.333333
4,AKN,-6.0,13.0,3.0,3.666667


In [26]:
# Creation of a dataframe with statitical info for the mean elapsed time per destination airport:
global_stats_elapsed_time = df['actual_elapsed_time'].groupby(df['dest']).apply(get_stats).unstack()
global_stats_elapsed_time = global_stats_elapsed_time.sort_values('count')
global_stats_elapsed_time.reset_index(inplace=True)
global_stats_elapsed_time.head(5)

Unnamed: 0,dest,min,max,count,mean
0,HYA,69.0,70.0,2.0,69.5
1,IFP,65.0,67.0,2.0,66.0
2,ROP,40.0,40.0,3.0,40.0
3,DLG,67.0,75.0,3.0,70.0
4,AKN,60.0,79.0,3.0,71.666667


In [27]:
# Creation of a dataframe with statitical info for the mean departure time per original airport:
global_stats_mean_depTime_origin_airport = df['dep_time_min'].groupby(df['origin']).apply(get_stats).unstack()
global_stats_mean_depTime_origin_airport = global_stats_mean_depTime_origin_airport.sort_values('count')
global_stats_mean_depTime_origin_airport.reset_index(inplace=True)
global_stats_mean_depTime_origin_airport.head(5)

Unnamed: 0,origin,min,max,count,mean
0,IFP,777.0,777.0,1.0,777.0
1,AKN,1058.0,1058.0,1.0,1058.0
2,HYA,781.0,805.0,3.0,796.666667
3,GST,1095.0,1127.0,3.0,1108.666667
4,DLG,952.0,965.0,3.0,960.666667


### Merge the statistical information (just the mean) into the new main dataframe "dfx"

In [28]:
# Merge the mean delay obtained in global_stats_carrier into the new dfx

In [29]:
df2 = df.copy()

In [30]:
dfx = df2.merge(global_stats_carrier, how='left', left_on='op_unique_carrier', right_on='op_unique_carrier')

In [31]:
dfx = dfx.drop(['min', 'max', 'count'], axis=1)

In [32]:
dfx1 = dfx.pop('mean')

dfx.insert(0, "carrier_mean_delay", dfx1)

In [33]:
# Merge the mean delay obtained in global_stats_origin_airport into the new dfx

In [34]:
dfx2 = dfx.merge(global_stats_origin_airport, how='left', left_on='origin', right_on='origin')

In [35]:
dfx2 = dfx2.drop(['min', 'max', 'count'], axis=1)

In [36]:
dfx3 = dfx2.pop('mean')

dfx.insert(1, "origin_airport_mean_delay", dfx3)

In [37]:
# Merge the mean delay obtained in global_stats_dest_airport into the new dfx

In [38]:
dfx3 = dfx.merge(global_stats_dest_airport, how='left', left_on='dest', right_on='dest')

In [39]:
dfx3 = dfx3.drop(['min', 'max', 'count'], axis=1)

In [40]:
dfx4 = dfx3.pop('mean')

dfx.insert(2, "dest_airport_mean_delay", dfx4)

In [41]:
# Merge the mean delay obtained in global_stats_elapsed_time into the new dfx

In [42]:
dfx4 = dfx.merge(global_stats_elapsed_time, how='left', left_on='dest', right_on='dest')

In [43]:
dfx4 = dfx4.drop(['min', 'max', 'count'], axis=1)

In [44]:
dfx5 = dfx4.pop('mean')

dfx.insert(3, "dest_airport_mean_elapsed_time", dfx5)

In [45]:
# Merge the mean delay obtained in global_stats_mean_depTime_origin_airport into the new dfx

In [46]:
dfx5 = dfx.merge(global_stats_mean_depTime_origin_airport, how='left', left_on='origin', right_on='origin')

In [47]:
dfx5 = dfx5.drop(['min', 'max', 'count'], axis=1)

In [48]:
dfx6 = dfx5.pop('mean')

dfx.insert(4, "mean_depTime_origin_airport", dfx6)

In [49]:
#Show the dataframe

In [50]:
dfx

Unnamed: 0,carrier_mean_delay,origin_airport_mean_delay,dest_airport_mean_delay,dest_airport_mean_elapsed_time,mean_depTime_origin_airport,year,month,day,op_unique_carrier,origin_city_name,origin,origin_airport_id,dest_city_name,dest,dest_airport_id,distance,air_time,crs_dep_time,dep_time,dep_delay,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,dep_time_min,crs_dep_time_min
0,6.070661,9.056345,2.391295,157.418605,822.221222,2018,03,07,AA,"Phoenix, AZ",PHX,14107,"San Diego, CA",SAN,14679,304.0,52.0,08:35:00,08:33:00,-2.0,08:51:00,08:40:00,-11.0,76.0,67.0,513,515
1,6.070661,10.247151,4.504354,114.573305,890.616854,2018,03,07,AA,"Charlotte, NC",CLT,11057,"Washington, DC",DCA,11278,331.0,55.0,14:31:00,15:37:00,66.0,15:59:00,16:51:00,52.0,88.0,74.0,937,871
2,6.070661,11.598245,2.541735,111.305427,837.855336,2018,03,07,AA,"Fort Lauderdale, FL",FLL,11697,"Charlotte, NC",CLT,11057,632.0,91.0,06:03:00,05:57:00,-6.0,08:09:00,08:05:00,-4.0,126.0,128.0,357,363
3,6.070661,10.247151,9.687457,152.667443,890.616854,2018,03,07,AA,"Charlotte, NC",CLT,11057,"Boston, MA",BOS,10721,728.0,92.0,11:35:00,11:29:00,-6.0,13:52:00,13:24:00,-28.0,137.0,115.0,689,695
4,6.070661,10.247151,5.358862,99.721007,890.616854,2018,03,07,AA,"Charlotte, NC",CLT,11057,"Tulsa, OK",TUL,15370,842.0,133.0,18:20:00,18:12:00,-8.0,20:02:00,19:42:00,-20.0,162.0,150.0,1092,1100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389884,10.282094,11.886031,5.559465,124.886682,760.965968,2018,03,07,YV,"Cincinnati, OH",CVG,11193,"Washington, DC",IAD,12264,388.0,61.0,14:50:00,14:45:00,-5.0,16:17:00,16:08:00,-9.0,87.0,83.0,885,890
389885,10.282094,6.452031,6.592491,145.146171,769.350043,2018,03,07,YV,"Reno, NV",RNO,14570,"Houston, TX",IAH,12266,1530.0,188.0,12:45:00,12:56:00,11.0,18:25:00,18:33:00,8.0,220.0,217.0,776,765
389886,3.219832,11.162743,1.455644,116.709388,892.426743,2018,03,07,YX,"Miami, FL",MIA,13303,"Atlanta, GA",ATL,10397,594.0,93.0,16:07:00,15:55:00,-12.0,18:15:00,17:55:00,-20.0,128.0,120.0,955,967
389887,3.219832,7.501836,2.733816,159.693979,755.556100,2018,03,07,YX,"Pittsburgh, PA",PIT,14122,"Miami, FL",MIA,13303,1013.0,141.0,19:00:00,18:55:00,-5.0,21:54:00,21:37:00,-17.0,174.0,162.0,1135,1140


### Drop some variables that wont be used for further analysis and rearrange the columns order

In [51]:
from sklearn.preprocessing import StandardScaler

In [52]:
dfx = dfx.drop(['origin_city_name',
                'dest_city_name',
                'origin_airport_id',
                'dest_airport_id',
                'crs_dep_time',
                'dep_time',
                'crs_arr_time',
                'arr_time',
                'origin',
                'dest'
               ],axis=1)

In [53]:
dfx = dfx[['year',
           'month',
           'day','op_unique_carrier',
           'carrier_mean_delay',
           'origin_airport_mean_delay',
           'dest_airport_mean_delay',
           'crs_dep_time_min',
           'mean_depTime_origin_airport',
           'distance',
           'air_time',
           'crs_elapsed_time',
           'actual_elapsed_time',
           'dest_airport_mean_elapsed_time',
           'dep_delay',
           'arr_delay',
           'dep_time_min'
          ]]

In [54]:
dfx

Unnamed: 0,year,month,day,op_unique_carrier,carrier_mean_delay,origin_airport_mean_delay,dest_airport_mean_delay,crs_dep_time_min,mean_depTime_origin_airport,distance,air_time,crs_elapsed_time,actual_elapsed_time,dest_airport_mean_elapsed_time,dep_delay,arr_delay,dep_time_min
0,2018,03,07,AA,6.070661,9.056345,2.391295,515,822.221222,304.0,52.0,76.0,67.0,157.418605,-2.0,-11.0,513
1,2018,03,07,AA,6.070661,10.247151,4.504354,871,890.616854,331.0,55.0,88.0,74.0,114.573305,66.0,52.0,937
2,2018,03,07,AA,6.070661,11.598245,2.541735,363,837.855336,632.0,91.0,126.0,128.0,111.305427,-6.0,-4.0,357
3,2018,03,07,AA,6.070661,10.247151,9.687457,695,890.616854,728.0,92.0,137.0,115.0,152.667443,-6.0,-28.0,689
4,2018,03,07,AA,6.070661,10.247151,5.358862,1100,890.616854,842.0,133.0,162.0,150.0,99.721007,-8.0,-20.0,1092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389884,2018,03,07,YV,10.282094,11.886031,5.559465,890,760.965968,388.0,61.0,87.0,83.0,124.886682,-5.0,-9.0,885
389885,2018,03,07,YV,10.282094,6.452031,6.592491,765,769.350043,1530.0,188.0,220.0,217.0,145.146171,11.0,8.0,776
389886,2018,03,07,YX,3.219832,11.162743,1.455644,967,892.426743,594.0,93.0,128.0,120.0,116.709388,-12.0,-20.0,955
389887,2018,03,07,YX,3.219832,7.501836,2.733816,1140,755.556100,1013.0,141.0,174.0,162.0,159.693979,-5.0,-17.0,1135


### In orther to be able to use the categorical variable "op_unique_carrier" we transform it into dummy variables and include the new variables into the main data frame

In [55]:
# Create a new dfy so we have a "checkpoint" for running different iterations of the model.
dfy = dfx.copy()

In [56]:
dfz = pd.get_dummies(dfy, columns=['op_unique_carrier'], drop_first=True)

### Define the dfzx and dfzy data frames that will be used as "X" and "y" variables in the model

In [57]:
# We set the target variable to the data frame dfzy that is goint to represent our y
dfzy = dfz['arr_delay']

In [58]:
# We set the independent variables to the data frame dfzx that is goint to represent our X

#The variable 'dep_time_min' its commented because I'm not sure I can use it in this case
dfzx = dfz.drop(['actual_elapsed_time',
                 'dep_delay',
                 'arr_delay',
                 'origin_airport_mean_delay',
                 'dest_airport_mean_delay',
                 'dest_airport_mean_elapsed_time',
                 'mean_depTime_origin_airport',
                 'crs_dep_time_min',
#                  'dep_time_min',
                ],axis=1)

In [59]:
dfzx

Unnamed: 0,year,month,day,carrier_mean_delay,distance,air_time,crs_elapsed_time,dep_time_min,op_unique_carrier_9K,op_unique_carrier_AA,op_unique_carrier_AS,op_unique_carrier_AX,op_unique_carrier_B6,op_unique_carrier_C5,op_unique_carrier_CP,op_unique_carrier_DL,op_unique_carrier_EM,op_unique_carrier_EV,op_unique_carrier_F9,op_unique_carrier_G4,op_unique_carrier_G7,op_unique_carrier_HA,op_unique_carrier_KS,op_unique_carrier_MQ,op_unique_carrier_NK,op_unique_carrier_OH,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW
0,2018,03,07,6.070661,304.0,52.0,76.0,513,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2018,03,07,6.070661,331.0,55.0,88.0,937,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2018,03,07,6.070661,632.0,91.0,126.0,357,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2018,03,07,6.070661,728.0,92.0,137.0,689,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2018,03,07,6.070661,842.0,133.0,162.0,1092,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389884,2018,03,07,10.282094,388.0,61.0,87.0,885,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
389885,2018,03,07,10.282094,1530.0,188.0,220.0,776,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
389886,2018,03,07,3.219832,594.0,93.0,128.0,955,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
389887,2018,03,07,3.219832,1013.0,141.0,174.0,1135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


### Start the model and define the "X" and "y" used for the future polynomial linear regression

In [60]:
from sklearn.preprocessing import PolynomialFeatures

X = dfzx
y = dfzy

pol = 2
poly = PolynomialFeatures(pol)

### Scale the data using StandardScaler

In [61]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
# y_scaled = scaler.fit_transform(y.values.reshape(-1,1))

### Fit the scaled data into the polinomial model and after that create the train and test enviroment

In [62]:
Xt = poly.fit_transform(X_scaled)

X_test, X_train, y_test, y_train = train_test_split(Xt,y, train_size=0.7, random_state=123)

### Instatiate the linear regression model, fit the train X and y variables and create the "y_predict" prediction variable

In [63]:
model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

### Train, test and score the model

In [64]:
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)

### Print out the ressults

In [65]:
print("---Results---")
print(f"degree = {pol}")
print(f"Train score = {r2_train}")
print(f"Test score = {r2_test}")

---Results---
degree = 2
Train score = 0.05822640007024771
Test score = 0.04997280416208816


In [66]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 23.291247631336915
Mean Squared Error: 2361.4402712885085
Root Mean Squared Error: 48.594652702622625


In [67]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

In [68]:
df.head(25)

Unnamed: 0,Actual,Predicted
286950,-5.0,3.611126
299672,-29.0,15.480667
272814,30.0,4.85009
215039,-1.0,14.087574
55332,-4.0,-4.786331
372110,-16.0,3.106033
10818,-14.0,-5.82991
186658,-24.0,-10.59906
190162,-26.0,-4.434319
307581,113.0,-10.449905
