# Project: Taxi Data

# Data

In [None]:
The data is structured as follows:

Column number | Column name | Type | Description
 ------------ | :---------:       | :---------:           | ------------:
0             | `'pickup_weekday'`    | categorical (ordinal) | Day of the week when the journey started (Monday = 0, Sunday = 6).
1             | `'pickup_hour'`    | categorical (ordinal) | Hour when the journey started.
2             | `'pickup_longitude'`  | numerical             | Longitude where the journey started.
3             | `'pickup_latitude'`   | numerical             | Latitude where the journey started.
4             | `'dropoff_longitude'` | numerical             | Longitude where the journey ended.
5             | `'dropoff_latitude'`  | numerical             | Latitude where the journey ended.
6             | `'passenger_count'`   | categorical (ordinal) | Number of passengers in the car. This is manually recorded.
7             | `'trip_distance'`     | numerical             | Journey distance in miles.
8             | `'fare_amount'`       | numerical             | Amount on the meter based on duration and distance.
9             | `'tip_amount'`        | numerical             | Tip given on card payments (0.00 if payment made in cash).
10            | `'tolls_amount'`      | numerical             | Tolls incurred.
11            | `'payment_type'`      | categorical (nominal) | Payment type (1 = credit card, 2 = cash, 3 = no fee, 4 = dispute).


You are also given the following coordinates to determine which journeys start at JFK airport:

Variable      | Value             | Description  
------------- | :---------:       | ---------:           
`jfk_max_lat`   | `40.66018`          | Maximum pickup latitude for airport journeys 
`jfk_min_lat`   | `40.62666`          | Minimum pickup latitude for airport journeys 
`jfk_max_long`  | `-73.76599`         | Maximum pickup longitude for airport journeys 
`jfk_min_long`  | `-73.80822`         | Minimum pickup longitude for airport journeys


New York City is determined approximately by the following coordinates:

Variable      | Value             | Description  
------------- | :---------:       | ---------:           
`nyc_max_lat`   | `40.9176`          | Maximum latitude for New York City
`nyc_min_lat`   | `40.5774`          | Minimum latitude for New York City 
`nyc_max_long`  | `-73.7004`         | Maximum longitude for New York City
`nyc_min_long`  | `-74.15`         | Minimum longitude for New York City


# **1)** Importing and cleaning the data

In [None]:
# pandas has to be imported before its first use, otherwise this cell raises
# NameError on a fresh kernel (Restart & Run All).
import pandas as pd

# Explicit, compact dtypes for each column read from the CSV.
col_dtypes = {'pickup_weekday': 'int16',
              'pickup_hour': 'int16',
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'int16',
              'trip_distance': 'float32',
              'fare_amount': 'float32',
              'tip_amount': 'float32',
              'tolls_amount': 'float32',
              'payment_type': 'int16'}

# Load the prepared taxi data using the dtypes declared above.
df = pd.read_csv('2016_Yellow_Taxi_prepared.csv', dtype=col_dtypes)

In [None]:
import pandas as pd
# Columns grouped by the dtype they should be read with; the mapping built
# from them is passed to `pd.read_csv(dtype=...)`.
_INT16_COLUMNS = ('pickup_weekday', 'pickup_hour', 'passenger_count', 'payment_type')
_FLOAT32_COLUMNS = ('pickup_longitude', 'pickup_latitude',
                    'dropoff_longitude', 'dropoff_latitude',
                    'trip_distance', 'fare_amount', 'tip_amount', 'tolls_amount')

col_dtypes = {name: 'int16' for name in _INT16_COLUMNS}
col_dtypes.update({name: 'float32' for name in _FLOAT32_COLUMNS})


In [None]:
# Read the prepared taxi CSV with the dtypes declared in `col_dtypes`,
# then preview the first five rows.
df = pd.read_csv(
    '2016_Yellow_Taxi_prepared.csv',
    dtype=col_dtypes,
)
df.head(n=5)

In [None]:
# Show the dtype of every column. This must be the cell's last expression to
# be rendered; the previous version never displayed it, contained a no-op
# `df.index = df.index`, and only repeated the `df.head()` preview.
df.dtypes

In [None]:
# Summary statistics for each column, used to spot implausible values
# before cleaning.
df.describe()

In [None]:
import numpy as np

# Bounding box that approximates New York City (see the table in the Data section).
NYC_MIN_LAT, NYC_MAX_LAT = 40.5774, 40.9176
NYC_MIN_LONG, NYC_MAX_LONG = -74.15, -73.7004

# Replace implausible values with NaN so the next step can drop those rows.
# Every rule is applied to a whole column at once and independently of the
# other rules: this avoids a very slow Python loop over rows, does not rely
# on the frame having a default 0..n-1 index, and marks *every* bad value in
# a row rather than only the first one found.
for lat_col in ('pickup_latitude', 'dropoff_latitude'):
    outside_lat = (df[lat_col] < NYC_MIN_LAT) | (df[lat_col] > NYC_MAX_LAT)
    df[lat_col] = df[lat_col].mask(outside_lat)

for long_col in ('pickup_longitude', 'dropoff_longitude'):
    outside_long = (df[long_col] < NYC_MIN_LONG) | (df[long_col] > NYC_MAX_LONG)
    df[long_col] = df[long_col].mask(outside_long)

# A journey needs at least one passenger. Replacing values with NaN upcasts
# this integer column to a float dtype.
df['passenger_count'] = df['passenger_count'].mask(df['passenger_count'] == 0)

# Distances must be strictly positive; tips cannot be negative.
df['trip_distance'] = df['trip_distance'].mask(df['trip_distance'] <= 0)
df['tip_amount'] = df['tip_amount'].mask(df['tip_amount'] < 0)

# **2)** Selecting data

In [None]:
# Keep only rows that still have at least 12 non-missing values, then report
# the resulting (rows, columns) shape.
# NOTE(review): 12 matches the number of columns listed in the Data section,
# so this presumably drops every row that was flagged with a NaN - confirm.
df = df.dropna(axis='index', thresh=12)
df.shape

# **3)** Proportion of taxis from the airport

In [None]:
# Journeys whose pickup point lies strictly inside the JFK bounding box.
mask_latitude = (df.loc[:, 'pickup_latitude'] > 40.62666) & (df.loc[:, 'pickup_latitude'] < 40.66018)
mask_longtitude = (df.loc[:, 'pickup_longitude'] < -73.76599) & (df.loc[:, 'pickup_longitude'] > -73.80822)
mask_jfk = mask_latitude & mask_longtitude
df_jfk_start = df.loc[mask_jfk, :]

# Percentage of ALL journeys that start at the airport. The denominator is the
# total number of journeys; dividing by the non-airport count instead would
# give the airport-to-non-airport ratio, not a proportion.
proportion_jfk = (df_jfk_start.shape[0] / df.shape[0]) * 100
proportion_jfk

In [None]:
# Show the current dtype of every column (the cleaning step writes NaN into
# some columns, which can change their dtype).
df.dtypes

# **4)** Visualizing the starting points

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Scatter of every pickup location; tiny, almost transparent markers make
# dense areas appear darker.
fig, ax = plt.subplots()
df.plot(kind='scatter', x='pickup_longitude', y='pickup_latitude', s=0.03, alpha=0.03,
        title='Taxi pick-ups in New-York', ax=ax, xlim=[-74.05, -73.75], color='darkblue')

# Pass the label positionally: the keyword was renamed from `s` to `text` in
# Matplotlib, so `s=...` raises a TypeError on current versions, while the
# positional form works on old and new releases alike.
ann = ax.annotate('JFK airport', xy=[-73.80, 40.63], xytext=[-73.87, 40.58],
                  arrowprops=dict(facecolor='black'))

# **5)** Proportion of airport taxis on each day

In [None]:
# For each weekday code 0..6 (Monday = 0), compute the percentage of that
# day's journeys that were picked up inside the JFK box (`mask_jfk`).
prop_day_of_the_week = []
for weekday in range(7):
    is_weekday = df['pickup_weekday'].eq(weekday)
    n_day = int(is_weekday.sum())
    n_day_jfk = int((is_weekday & mask_jfk).sum())
    prop_day_of_the_week.append(n_day_jfk / n_day * 100)
prop_day_of_the_week

# **6)** Proportion of journeys on each day of the week from all locations and those starting from the airport

In [None]:
# Boolean masks selecting pickups strictly inside the JFK bounding box.
pickup_lat = df['pickup_latitude']
pickup_long = df['pickup_longitude']

mask_latitude = pickup_lat.gt(40.62666) & pickup_lat.lt(40.66018)
mask_longtitude = pickup_long.lt(-73.76599) & pickup_long.gt(-73.80822)
mask_jfk = mask_latitude & mask_longtitude

In [None]:
# Empty result table: one row per weekday, one column for all journeys and
# one for journeys starting at the airport. It is filled in by the next cell.
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
proportion_columns = ['prop_all_loc', 'prop_jfk']
prop_day_of_the_week = pd.DataFrame(index=weekday_labels, columns=proportion_columns)

In [None]:
# All airport journeys. This does not depend on the weekday, so it is computed
# once instead of on every loop iteration.
df_jfk = df.loc[mask_jfk, :]

for i in range(7):
    mask = (df.loc[:, 'pickup_weekday'] == i)  # journeys on weekday i
    mask_jfk1 = mask & mask_jfk                # ... that also start at JFK

    df_day_of_week_jfk = df.loc[mask_jfk1, :]
    df_day = df.loc[mask, :]

    # Fraction of all journeys on weekday i, and fraction of airport journeys
    # on weekday i (both as 0-1 fractions, not percentages).
    res1 = (df_day.shape[0] / df.shape[0])
    res2 = (df_day_of_week_jfk.shape[0] / df_jfk.shape[0])
    prop_day_of_the_week.iloc[i, 0] = res1
    prop_day_of_the_week.iloc[i, 1] = res2

# The table was created empty, so make sure its columns end up with a numeric
# dtype for plotting and arithmetic.
prop_day_of_the_week = prop_day_of_the_week.astype(float)

In [None]:
# Select the style BEFORE creating the figure: style settings are read when
# the figure and axes are created, so applying the style afterwards leaves the
# first run only partly styled and makes a re-run of this cell look different.
plt.style.use('fivethirtyeight')

# Two bar charts side by side on a shared y-axis: all journeys (left) and
# airport journeys (right), as fractions per weekday.
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = [15,4], sharey = True)

color = ['blue', 'red', 'orange', 'green', 'grey', 'purple', 'darkblue']
prop_day_of_the_week.loc[:, 'prop_all_loc'].plot( kind = 'bar', ax=ax[0], ylim = [0, 0.23],
                                                 color = color,
                                                 title = 'Proportion of journeys per week day')

prop_day_of_the_week.loc[:, 'prop_jfk'].plot(kind = 'bar', ax=ax[1], ylim =[0,0.23],
                                             color = color,
                                             title ='Proportion of airport journeys per week day')

# Bring the two panels close together so they read as one comparison.
fig.subplots_adjust(wspace=0.01, hspace=0)

ax[0].xaxis.set_tick_params(labelrotation=0)
ax[1].xaxis.set_tick_params(labelrotation=0)

ax[0].set( ylabel = 'Proportion of journeys', xlabel ='Day of the week')
ax[1].set(xlabel ='Day of the week');

# **7)** Proportion each hour for all journeys and journeys from the airport

In [None]:
#'pickup_hour'
mask_latitude = (df.loc[:, 'pickup_latitude'] >40.62666) &  (df.loc[:, 'pickup_latitude'] <40.66018)
mask_longtitude = (df.loc[:, 'pickup_longitude']<-73.76599) & (df.loc[:, 'pickup_longitude']>-73.80822)
mask_jfk = mask_latitude & mask_longtitude

In [None]:
# Empty result table indexed by hour of day (0-23); it is filled in by the
# next cell.
hour = list(range(24))
df_prop_hour = pd.DataFrame(index=hour, columns=['prop_hour_all_loc', 'prop_hour_jfk'])


In [None]:
# All airport journeys. This does not depend on the hour, so it is computed
# once instead of on every loop iteration.
df_jfk = df.loc[mask_jfk, :]

for i in range(24):
    # Percentage of all journeys that started in hour i.
    mask_hour = (df.loc[:, 'pickup_hour']) == i
    df_hour = df.loc[mask_hour, :]
    res3 = (df_hour.shape[0] / df.shape[0]) * 100
    df_prop_hour.iloc[i, 0] = res3

    # Percentage of airport journeys that started in hour i.
    mask_hour_jfk = mask_jfk & mask_hour
    df_hour_jfk = df.loc[mask_hour_jfk, :]
    res4 = (df_hour_jfk.shape[0] / df_jfk.shape[0]) * 100
    df_prop_hour.iloc[i, 1] = res4

# The table was created empty, so make sure its columns end up with a numeric
# dtype for plotting and arithmetic.
df_prop_hour = df_prop_hour.astype(float)

In [None]:
# Two line charts side by side on a shared y-axis: hourly percentage of all
# journeys (left) and of airport journeys (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=[15, 5], sharey=True)
left_ax, right_ax = ax

df_prop_hour['prop_hour_all_loc'].plot(
    ax=left_ax,
    title='Proportion each hour for all journeys',
    ylim=[0, 8],
)
df_prop_hour['prop_hour_jfk'].plot(
    ax=right_ax,
    title='Proportion each hour for all airport journeys',
)

# Bring the two panels close together so they read as one comparison.
fig.subplots_adjust(wspace=0.01, hspace=0)

left_ax.set(ylabel='Proportion of journeys', xlabel='Hour')
right_ax.set(xlabel='Hour');