# CS480 Ridesharing Project

Algorithm
- filter by passengers (single riders and double riders)
    - 1 rider + 1 rider
    - 1 rider + 2 rider
- filter by the time window (difference between each trip from one another)
- group by location (PULocationID, DULocationID)
- social preferences (do they want to talk, do they want to share a ride, etc. but that info is not availible)

In [1]:
import pandas as pd
import numpy as np

## 1 Setup

### 1.1 Load Yellow Trip from December 2019 dataset

In [20]:
# Load Yellow Trip from December 2019 dataset
yellow_taxis = pd.read_csv('yellow_tripdata_2014-01.csv', dtype={'store_and_fwd_flag': object})
yellow_taxis.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,1,N,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,1,N,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,1,N,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,1,N,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,1,N,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75


In [21]:
yellow_taxis.rename(columns=lambda x: x.strip(), inplace=True)

In [36]:
green_taxis = pd.read_csv('green_tripdata_2014-01.csv')
green_taxis.head()

Unnamed: 0,Unnamed: 1,VendorID,lpep_pickup_datetime,Lpep_dropoff_datetime,Store_and_fwd_flag,RateCodeID,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,Passenger_count,Trip_distance,Fare_amount,Extra,MTA_tax,Tip_amount,Tolls_amount,Ehail_fee,Total_amount,Payment_type,Trip_type
2,2014-01-01 00:00:00,2014-01-01 01:08:06,N,1,0.0,0.0,-73.865044,40.872307,1,6.47,20.0,0.5,0.5,0.0,0.0,,21.0,1,,,
2,2014-01-01 00:00:00,2014-01-01 06:03:57,N,2,0.0,0.0,-73.776367,40.645489,1,20.12,52.0,0.0,0.5,0.0,5.33,,57.83,1,,,
2,2014-01-01 00:00:00,2014-01-01 18:22:44,N,1,0.0,0.0,-73.932648,40.852573,2,0.81,5.0,0.5,0.5,0.0,0.0,,6.0,1,,,
2,2014-01-01 00:00:00,2014-01-01 00:52:03,N,1,0.0,0.0,-73.99408,40.749092,1,9.55,33.5,0.5,0.5,2.17,5.33,,42.0,1,,,
2,2014-01-01 00:00:00,2014-01-01 00:49:25,N,1,0.0,0.0,-73.936066,40.734726,1,1.22,7.0,0.5,0.5,2.0,0.0,,10.0,1,,,


In [22]:
yellow_taxis.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
       'tolls_amount', 'total_amount'],
      dtype='object')

#### 1.1.a Keep only needed columns 

In [23]:
#Keep only these specific columns that will help with calculations, other info unnecessary
yellow_taxis = yellow_taxis[[
    'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude','dropoff_longitude', 'dropoff_latitude'
]]


#### 1.1.a Cleaning Data of NaN Values


In [24]:
#remove any NaN cell values
yellow_taxis = yellow_taxis.dropna()

#### 1.1.b Removing trip distance that is less than or equal to zero


In [25]:
#keep only trips with a distance greater than zero
yellow_taxis = yellow_taxis[yellow_taxis['trip_distance'] > 0]

#### 1.1.c Keep only trips with passenger count of 1 or 2

In [26]:
#Keep only trips with passenger count of 1 or 2
yellow_taxis = yellow_taxis[(yellow_taxis['passenger_count'] == 2) | (yellow_taxis['passenger_count'] == 1)]


#### 1.1.e Convert datetime of pickup and dropoff to Timestamp object in Pandas


In [30]:
yellow_taxis.pickup_datetime = pd.to_datetime(yellow_taxis.pickup_datetime, dayfirst=True)
yellow_taxis.dropoff_datetime = pd.to_datetime(yellow_taxis.dropoff_datetime,dayfirst=True)

####  1.1.f Clean up year

In [31]:
for i in range(0,len(yellow_taxis)):
        try:
            yellow_taxis.tpep_pickup_datetime[i].replace(year = 2014)
        except:
            continue

        

In [None]:
yellow_taxis = yellow_taxis.reset_index(drop=True)

### 1.2 Total rides traveled in yellow taxis in December 2019

In [None]:
total_rides = len(yellow_taxis.index)
print("There were {0} total rides in yellow taxis in December 2019.".format(total_rides))

### 1.3 Total Distance traveled in all the trips 

In [None]:
total_distance = yellow_taxis['trip_distance'].sum()     
print("Total distance is {0} miles in yellow taxis in December 2019.".format(total_distance))

## 2 Filter Passengers

### 2.1.a Gather the single riders

In [None]:
# the rides in yellow taxis that can be combined based on the number of passengers
# and the drop off is within the 50 - 100 drop off region
single_riders = yellow_taxis.loc[yellow_taxis['passenger_count'] == 1.0]
single_riders = single_riders.reset_index(drop=True)
single_riders

### 2.1.b Total rides traveled in yellow taxis with a single rider in December 2019

In [None]:
num_single_riders = len(single_riders.index)
print("There were {0} total rides with single riders in yellow taxis in December 2019.".format(num_single_riders))

### 2.1.c Total distance traveled in yellow taxis with single riders in December 2019

In [None]:
total_distance_single_riders = single_riders['trip_distance'].sum()     
print("Total distance is {0} miles in yellow taxis with single riders in December 2019.".format(total_distance_single_riders))

### 2.2.a Gather the double riders

In [None]:
# the rides in yellow taxis that can be combined based on the number of passengers
double_riders = yellow_taxis.loc[yellow_taxis['passenger_count'] == 2.0]
double_riders = double_riders.reset_index(drop = True)
double_riders

### 2.2.b Total rides traveled in yellow taxis with double riders in December 2019

In [None]:
num_double_riders = len(double_riders.index)
print("There were {0} total rides with single riders in yellow taxis in December 2019.".format(num_double_riders))

### 2.2.c Total miles traveled in yellow taxis with double riders in December 2019

In [None]:
total_distance_double_riders = double_riders['trip_distance'].sum()     
print("Total distance is {0} miles in yellow taxis with single riders in December 2019.".format(total_distance_double_riders))

## 3 Filter By Pick Up and Drop Off Location

### 3.1.a Filtering the single riders by the pickup id from 0-50

In [None]:
single_0_50 = single_riders.loc[(single_riders['PULocationID'] > 0) & (single_riders['DOLocationID'] > 0)]
single_0_50 = single_0_50.loc[(single_0_50['PULocationID'] < 50) & (single_0_50['DOLocationID'] < 50)]
single_0_50

### 3.1.b Total single riders being picked up AND dropped off in locationIDS from 0-50

In [None]:
num_single_riders_0_50 = len(single_0_50.index)
print("There were {0} rides with single riders with pickups and drop offs in locationIDs within 0-50 in December 2019.".format(num_single_riders_0_50))

### 3.2.a Filtering the double riders by the pickup id from 0-50

In [None]:
double_riders_0_50 = double_riders.loc[(double_riders['PULocationID'] > 0) & (double_riders['DOLocationID'] > 0)]
double_riders_0_50 = double_riders_0_50.loc[(double_riders_0_50['PULocationID'] < 50) & (double_riders_0_50['DOLocationID'] < 50)]
double_riders_0_50

### 3.2.b Total double riders being picked up AND dropped off in locationIDS from 0-50

In [None]:
num_double_riders_0_50 = len(double_riders_0_50.index)
print("There were {0} total rides with double riders in being picked up AND dropped off at locationIDs from 0-50 in December 2019.".format(num_double_riders_0_50))