In [56]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
import csv

airlines=pd.read_csv('airlines.csv')
airports = pd.read_csv('airports.csv')
flights = pd.read_csv('flights.csv')

In [57]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

#selecting and downsizing columns of type int:
flights_int = flights.select_dtypes(include=['int'])
flights_int = flights_int.apply(pd.to_numeric,downcast='unsigned')

#selecting and downsizing columns of type float:
flights_float = flights.select_dtypes(include=['float'])
flights_float = flights_float.apply(pd.to_numeric,downcast='float')

#selecting and factorizing columns of type object, where column has <50% of unique values:
flights_obj = flights.select_dtypes(include=['object']).copy()

for col in flights_obj.columns:
    num_unique_values = len(flights_obj[col].unique())
    num_total_values = len(flights_obj[col])
    if num_unique_values / num_total_values < 0.5:
        flights_obj.loc[:,col] = flights_obj[col].astype('category')
    else:
        flights_obj.loc[:,col] = flights_obj[col]


flights[flights_int.columns] = flights_int
flights[flights_float.columns] = flights_float
flights[flights_obj.columns] = flights_obj

#airlines:
#selecting and factorizing columns of type object, where column has <50% of unique values:
airlines_obj = airlines.select_dtypes(include=['object']).copy()

for col in airlines_obj.columns:
    num_unique_values_airl = len(airlines_obj[col].unique())
    num_total_values_airl = len(airlines_obj[col])
    if num_unique_values_airl / num_total_values_airl < 0.5:
        airlines_obj.loc[:,col] = airlines_obj[col].astype('category')
    else:
        airlines_obj.loc[:,col] = airlines_obj[col]


airlines[airlines_obj.columns] = airlines_obj

#airports:
#selecting and downsizing columns of type float:
airports_float = airports.select_dtypes(include=['float'])
airports_float = airports_float.apply(pd.to_numeric,downcast='float')

#selecting and factorizing columns of type object, where column has <50% of unique values:
airports_obj = airports.select_dtypes(include=['object']).copy()

for col in airports_obj.columns:
    num_unique_values = len(airports_obj[col].unique())
    num_total_values = len(airports_obj[col])
    if num_unique_values / num_total_values < 0.5:
        airports_obj.loc[:,col] = airports_obj[col].astype('category')
    else:
        airports_obj.loc[:,col] = airports_obj[col]

airports[airports_float.columns] = airports_float
airports[airports_obj.columns] = airports_obj

print(mem_usage(flights))
print(mem_usage(airlines))
print(mem_usage(airports))

483.00 MB
0.00 MB
0.00 MB


In [58]:
flights = pd.merge(flights, airlines, how = 'left', left_on = 'AIRLINE', right_on = 'IATA_CODE', sort = False)
flights = pd.merge(flights, airports, how = 'left', left_on = 'ORIGIN_AIRPORT', right_on = 'IATA_CODE', sort = False, suffixes = ('_x','_orig'))
flights = pd.merge(flights, airports, how = 'left', left_on = 'DESTINATION_AIRPORT', right_on = 'IATA_CODE', sort = False, suffixes = ('_x','_dest'))

In [61]:
mem_usage(flights)

'716.00 MB'

In [60]:
#selecting and downsizing columns of type int:
flights_int = flights.select_dtypes(include=['int'])
flights_int = flights_int.apply(pd.to_numeric,downcast='unsigned')

#selecting and downsizing columns of type float:
flights_float = flights.select_dtypes(include=['float'])
flights_float = flights_float.apply(pd.to_numeric,downcast='float')

#selecting and factorizing columns of type object, where column has <50% of unique values:
flights_obj = flights.select_dtypes(include=['object']).copy()

for col in flights_obj.columns:
    num_unique_values = len(flights_obj[col].unique())
    num_total_values = len(flights_obj[col])
    if num_unique_values / num_total_values < 0.5:
        flights_obj.loc[:,col] = flights_obj[col].astype('category')
    else:
        flights_obj.loc[:,col] = flights_obj[col]


flights[flights_int.columns] = flights_int
flights[flights_float.columns] = flights_float
flights[flights_obj.columns] = flights_obj

In [69]:
print(flights.columns)
from sklearn.model_selection import train_test_split

test,train = train_test_split(flights, test_size = 0.25)

Index([u'YEAR', u'MONTH', u'DAY', u'DAY_OF_WEEK', u'AIRLINE_x',
       u'FLIGHT_NUMBER', u'TAIL_NUMBER', u'ORIGIN_AIRPORT',
       u'DESTINATION_AIRPORT', u'SCHEDULED_DEPARTURE', u'DEPARTURE_TIME',
       u'DEPARTURE_DELAY', u'TAXI_OUT', u'WHEELS_OFF', u'SCHEDULED_TIME',
       u'ELAPSED_TIME', u'AIR_TIME', u'DISTANCE', u'WHEELS_ON', u'TAXI_IN',
       u'SCHEDULED_ARRIVAL', u'ARRIVAL_TIME', u'ARRIVAL_DELAY', u'DIVERTED',
       u'CANCELLED', u'CANCELLATION_REASON', u'AIR_SYSTEM_DELAY',
       u'SECURITY_DELAY', u'AIRLINE_DELAY', u'LATE_AIRCRAFT_DELAY',
       u'WEATHER_DELAY', u'IATA_CODE_x', u'AIRLINE_y', u'IATA_CODE_orig',
       u'AIRPORT_x', u'CITY_x', u'STATE_x', u'COUNTRY_x', u'LATITUDE_x',
       u'LONGITUDE_x', u'IATA_CODE', u'AIRPORT_dest', u'CITY_dest',
       u'STATE_dest', u'COUNTRY_dest', u'LATITUDE_dest', u'LONGITUDE_dest'],
      dtype='object')


In [149]:
train['SCHEDULED_DEPARTURE'].head()

5157185    2215
3015103    1720
5251887     720
833924     1759
2628762    1405
Name: SCHEDULED_DEPARTURE, dtype: uint16

In [155]:
train_x = train[['YEAR','MONTH','DAY','DAY_OF_WEEK', 'AIRLINE_x', 'STATE_x', 'STATE_dest','SCHEDULED_DEPARTURE']]
train_x['SCHEDULED_DEPARTURE'] = train_x['SCHEDULED_DEPARTURE'].apply(lambda x: 1 if x > 360 & x < 660 else 2 if x < 960 else 3 if x < 1380 else 4)
train_y = train[['ARRIVAL_DELAY']]
#print(train_x.head())
#print(train_y.head())

test_x = test[['YEAR','MONTH','DAY','DAY_OF_WEEK', 'AIRLINE_x', 'STATE_x', 'STATE_dest','SCHEDULED_DEPARTURE']]
test_x['SCHEDULED_DEPARTURE'] = test_x['SCHEDULED_DEPARTURE'].apply(lambda x: 1 if x > 360 & x < 660 else 2 if x < 960 else 3 if x < 1380 else 4)
test_y = test[['ARRIVAL_DELAY']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [156]:
test_x.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4364309 entries, 5095081 to 1208655
Data columns (total 8 columns):
YEAR                   uint16
MONTH                  uint8
DAY                    uint8
DAY_OF_WEEK            uint8
AIRLINE_x              category
STATE_x                category
STATE_dest             category
SCHEDULED_DEPARTURE    int64
dtypes: category(3), int64(1), uint16(1), uint8(3)
memory usage: 99.9 MB


In [157]:
import numpy as np
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

for col in train_x.columns:
    if train_x[col].dtype.name == 'category':
        train_x.loc[:,col] = le.fit_transform(train_x[col])
        
#selecting and downsizing columns of type int:
train_x_int = train_x.select_dtypes(include=['int'])
train_x_int = train_x_int.apply(pd.to_numeric,downcast='unsigned')
train_x[train_x_int.columns] = train_x_int

train_y.fillna(0, inplace = True)

In [158]:
for col in test_x.columns:
    if test_x[col].dtype.name == 'category':
        test_x.loc[:,col] = le.fit_transform(test_x[col])
        
#selecting and downsizing columns of type int:
test_x_int = test_x.select_dtypes(include=['int'])
test_x_int = test_x_int.apply(pd.to_numeric,downcast='unsigned')
test_x[test_x_int.columns] = test_x_int

test_y.fillna(0, inplace = True)

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(train_x,train_y)
prediction = gnb.predict(test_x)

test_df =  pd.DataFrame(data = prediction,columns= ['prediction'])

final = pd.DataFrame(data = test_x)

final['Arrival_delay'] = test_df['prediction']
final.head()

In [None]:
print("Number of mislabeled points out of a total %d points : %d" % (test_x.shape[0],(test_y != prediction).sum()))