In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the flight records from the CSV file and preview the first rows
# to see which columns are available.
fdata = pd.read_csv('flight_data.csv')
fdata.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,01-01-2013 05:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,01-01-2013 05:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,01-01-2013 05:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,01-01-2013 05:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,01-01-2013 06:00


In [3]:
fdata.dtypes

year                int64
month               int64
day                 int64
dep_time          float64
sched_dep_time      int64
dep_delay         float64
arr_time          float64
sched_arr_time      int64
arr_delay         float64
carrier            object
flight              int64
tailnum            object
origin             object
dest               object
air_time          float64
distance            int64
hour                int64
minute              int64
time_hour          object
dtype: object

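# Note :
1. If a column contains numbers and NaNs, pandas stores it as float64, because NaN is a floating-point value and cannot be held in an int64 column.
2. Columns such as dep_time and dep_delay hold whole numbers yet appear as float64, which suggests that our flight dataset contains null values and that we need to remove them.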

# How to check for null values in the dataset?

[Ref link](https://chartio.com/resources/tutorials/how-to-check-if-any-value-is-nan-in-a-pandas-dataframe/)

In [4]:
len(fdata)

336776

In [5]:
# NOTE: isnull() returns a boolean frame that itself has no missing entries,
# so count() reports the total number of rows for every column (the same
# value as len(fdata)), not the number of nulls. Use isnull().sum() to count
# missing values.
fdata.isnull().count()


year              336776
month             336776
day               336776
dep_time          336776
sched_dep_time    336776
dep_delay         336776
arr_time          336776
sched_arr_time    336776
arr_delay         336776
carrier           336776
flight            336776
tailnum           336776
origin            336776
dest              336776
air_time          336776
distance          336776
hour              336776
minute            336776
time_hour         336776
dtype: int64

In [6]:
# count() tallies the non-null entries per column; any column whose count is
# below len(fdata) has missing values.
fdata.count()

year              336776
month             336776
day               336776
dep_time          328521
sched_dep_time    336776
dep_delay         328521
arr_time          328063
sched_arr_time    336776
arr_delay         327346
carrier           336776
flight            336776
tailnum           334264
origin            336776
dest              336776
air_time          327346
distance          336776
hour              336776
minute            336776
time_hour         336776
dtype: int64

In [7]:
# Summing the boolean null mask gives the number of missing values per column.
fdata.isnull().sum()

year                 0
month                0
day                  0
dep_time          8255
sched_dep_time       0
dep_delay         8255
arr_time          8713
sched_arr_time       0
arr_delay         9430
carrier              0
flight               0
tailnum           2512
origin               0
dest                 0
air_time          9430
distance             0
hour                 0
minute               0
time_hour            0
dtype: int64

# Checking for NaN with faster methods:

In [8]:
# Total number of nulls: per-column sums, then the sum of those.
%timeit fdata.isnull().sum().sum()


165 ms ± 7.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# Only answers whether at least one null exists anywhere in the frame.
%timeit fdata.isnull().any().any()

160 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
# Same total as sum().sum(), but reduced over the mask's underlying array via .values.
%timeit fdata.isnull().values.sum()

99.2 ms ± 5.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Same yes/no answer as any().any(), but reduced over the mask's underlying array via .values.
%timeit fdata.isnull().values.any()

91.8 ms ± 6.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
fdata.isnull().any().any()

True

In [13]:
fdata.isnull().values.sum()

46595

In [14]:
fdata.isnull().sum().sum()

46595

In [15]:
fdata.isnull().values.any()

True

# Which columns contain null/NaN values?

In [16]:
# any() marks each column that has at least one null; indexing the column
# labels with that mask keeps only the names of the affected columns.
fdata.columns[fdata.isnull().any()].tolist()

['dep_time', 'dep_delay', 'arr_time', 'arr_delay', 'tailnum', 'air_time']

# How to convert float values to int and remove NaN:

In [17]:

fdata.dtypes

year                int64
month               int64
day                 int64
dep_time          float64
sched_dep_time      int64
dep_delay         float64
arr_time          float64
sched_arr_time      int64
arr_delay         float64
carrier            object
flight              int64
tailnum            object
origin             object
dest               object
air_time          float64
distance            int64
hour                int64
minute              int64
time_hour          object
dtype: object

In [18]:
# This conversion is expected to fail while dep_time still contains NaN.
# Catch the error and show its message so the notebook can still be run
# from top to bottom without stopping here.
try:
    dep_time = fdata['dep_time'].astype(np.int64)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Cannot convert non-finite values (NA or inf) to integer

# Drop all rows that contain a NaN value in any cell

In [19]:
fdata = fdata.dropna(axis=0, how='any')

In [20]:
fdata.shape

(327346, 19)

In [21]:
new_fdata.shape

NameError: name 'new_fdata' is not defined

In [None]:
new_fdata.head()

In [None]:
missing_Rows = len(fdata) - len(new_fdata)
missing_Rows

<b>Q: Can we afford to drop this many rows (those with NaN values in some columns) from the dataset? If yes, proceed; otherwise, think about what else we can do. fillna is worth a try.</b>

# Exploration ideas:
# 1. <u>Departure Delays</u>

In [22]:
fdata.dtypes

year                int64
month               int64
day                 int64
dep_time          float64
sched_dep_time      int64
dep_delay         float64
arr_time          float64
sched_arr_time      int64
arr_delay         float64
carrier            object
flight              int64
tailnum            object
origin             object
dest               object
air_time          float64
distance            int64
hour                int64
minute              int64
time_hour          object
dtype: object

In [23]:
# Cast the two departure columns to 64-bit integers in one astype call,
# leaving every other column's dtype untouched.
fdata = fdata.astype({'dep_time': np.int64, 'dep_delay': np.int64})



In [24]:
fdata.dtypes

year                int64
month               int64
day                 int64
dep_time            int64
sched_dep_time      int64
dep_delay           int64
arr_time          float64
sched_arr_time      int64
arr_delay         float64
carrier            object
flight              int64
tailnum            object
origin             object
dest               object
air_time          float64
distance            int64
hour                int64
minute              int64
time_hour          object
dtype: object

In [25]:
fdata.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517,515,2,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,01-01-2013 05:00
1,2013,1,1,533,529,4,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,01-01-2013 05:00
2,2013,1,1,542,540,2,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,01-01-2013 05:00
3,2013,1,1,544,545,-1,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,01-01-2013 05:00
4,2013,1,1,554,600,-6,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,01-01-2013 06:00


In [41]:
fdata['dep_delay'].max()

1301

In [40]:
fdata.describe()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,flight,air_time,distance,hour,minute
count,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0,327346.0
mean,2013.0,6.564803,15.740825,1348.789883,1340.335098,12.555156,1501.908238,1532.788426,6.895377,1943.104501,150.68646,1048.371314,13.14101,26.234116
std,0.0,3.413444,8.777376,488.319979,467.413156,40.065688,532.888731,497.979124,44.633292,1621.523684,93.688305,735.908523,4.662063,19.295918
min,2013.0,1.0,1.0,1.0,500.0,-43.0,1.0,1.0,-86.0,1.0,20.0,80.0,5.0,0.0
25%,2013.0,4.0,8.0,907.0,905.0,-5.0,1104.0,1122.0,-17.0,544.0,82.0,509.0,9.0,8.0
50%,2013.0,7.0,16.0,1400.0,1355.0,-2.0,1535.0,1554.0,-5.0,1467.0,129.0,888.0,13.0,29.0
75%,2013.0,10.0,23.0,1744.0,1729.0,11.0,1940.0,1944.0,14.0,3412.0,192.0,1389.0,17.0,44.0
max,2013.0,12.0,31.0,2400.0,2359.0,1301.0,2400.0,2359.0,1272.0,8500.0,695.0,4983.0,23.0,59.0


In [42]:
# Show the flight(s) whose departure delay equals the largest delay in the data.
fdata.loc[fdata['dep_delay'].eq(fdata['dep_delay'].max())]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
7072,2013,1,9,641,900,1301,1242.0,1530,1272.0,HA,51,N384HA,JFK,HNL,640.0,4983,9,0,09-01-2013 09:00


In [56]:
# max() returns the delay value itself (1301), not a row position, so it must
# not be passed to iloc. idxmax() gives the index label of the most-delayed
# flight, which .loc then uses to read that row's time_hour.
fdata.loc[fdata['dep_delay'].idxmax(), 'time_hour']

'09-01-2013 09:00'