In [1]:
import pandas as pd
from glob import glob
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

## load the data

In [2]:
file_list = glob('data/*.csv')
file_list

['data/NY Weather Data - Apr 14 - Jul 14.csv',
 'data/uber-raw-data-apr14.csv',
 'data/uber-raw-data-jun14.csv',
 'data/uber-raw-data-may14.csv']

In [3]:
file_list = glob('data/*.csv')
df = pd.DataFrame()
for file in file_list[1:]:
    df = df.append(pd.read_csv(file))

In [4]:
df = df.reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


## data wrangling in weather dataset

In [7]:
weather = pd.read_csv('data/NY Weather Data - Apr 14 - Jul 14.csv')

In [8]:
weather['Unnamed: 20'] = weather['Unnamed: 20'].astype('str')

In [9]:
def rain(x):
    if 'Rain' in x:
        return 'Rain'
    else:
        return 'NoRain'

In [10]:
weather['weather'] = weather['Unnamed: 20'].apply(rain)

In [11]:
weather = weather[['2014', 'weather']]

In [12]:
apr = weather.iloc[1:31,:]
apr['2014'] = apr['2014'].apply(lambda x:'2014-04-'+x)
apr['2014'] = pd.to_datetime(apr['2014'])

In [13]:
may = weather.iloc[33:64, :]
may['2014'] = may['2014'].apply(lambda x:'2014-05-'+x)
may['2014'] = pd.to_datetime(may['2014'])

In [14]:
jun = weather.iloc[66:96, :]
jun['2014'] = jun['2014'].apply(lambda x:'2014-06-'+x)
jun['2014'] = pd.to_datetime(jun['2014'])

In [15]:
weather = apr.append([may, jun])

In [16]:
df['Date/Time'] = pd.to_datetime(df['Date/Time'])
df['date'] = df['Date/Time'].dt.strftime('%m-%d')

In [17]:
weather['2014'] = weather['2014'].dt.strftime('%m-%d')

In [18]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,date
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,04-01
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,04-01
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,04-01
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,04-01
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,04-01


In [19]:
weather.head()

Unnamed: 0,2014,weather
1,04-01,NoRain
2,04-02,Rain
3,04-03,Rain
4,04-04,Rain
5,04-05,Rain


## Merge the uber trip data with weather data

In [20]:
df = df.merge(weather, left_on='date', right_on='2014' )

In [21]:
df = df[['Date/Time', 'Lat', 'Lon', 'weather']]

In [22]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,weather
0,2014-04-01 00:11:00,40.769,-73.9549,NoRain
1,2014-04-01 00:17:00,40.7267,-74.0345,NoRain
2,2014-04-01 00:21:00,40.7316,-73.9873,NoRain
3,2014-04-01 00:28:00,40.7588,-73.9776,NoRain
4,2014-04-01 00:33:00,40.7594,-73.9722,NoRain


In [47]:
df.shape

(1880795, 5)

## Find Neighborhood Tabulation Areas (NTA name) for each trip record with Geopandas

In [63]:
import geopandas as gp
from shapely.geometry import Polygon, Pointx

ImportError: cannot import name 'Pointx' from 'shapely.geometry' (/Users/zheng/anaconda3/lib/python3.7/site-packages/shapely/geometry/__init__.py)

In [34]:
FRC = gp.read_file('nynta_19d/nynta.shp')
geometry = [Point(xy) for xy in zip(df['Lon'], df['Lat'])]
test = gp.GeoDataFrame(df, geometry=geometry)

In [38]:
test.crs={'init':'epsg:4326', 'no_defs':True}
test=test.to_crs(epsg=2263)
test

Unnamed: 0,Date/Time,Lat,Lon,weather,geometry
0,2014-04-01 00:11:00,40.7690,-73.9549,NoRain,POINT (996742.422 219447.424)
1,2014-04-01 00:17:00,40.7267,-74.0345,NoRain,POINT (974687.643 204034.843)
2,2014-04-01 00:21:00,40.7316,-73.9873,NoRain,POINT (987769.797 205818.438)
3,2014-04-01 00:28:00,40.7588,-73.9776,NoRain,POINT (990455.612 215728.806)
4,2014-04-01 00:33:00,40.7594,-73.9722,NoRain,POINT (991951.539 215947.834)
5,2014-04-01 00:33:00,40.7383,-74.0403,NoRain,POINT (973082.003 208261.772)
6,2014-04-01 00:39:00,40.7223,-73.9887,NoRain,POINT (987382.225 202430.106)
7,2014-04-01 00:45:00,40.7620,-73.9790,NoRain,POINT (990067.482 216894.574)
8,2014-04-01 00:55:00,40.7524,-73.9960,NoRain,POINT (985358.252 213396.309)
9,2014-04-01 01:01:00,40.7575,-73.9846,NoRain,POINT (988516.442 215254.755)


In [48]:
test.shape

(1880795, 5)

In [40]:
pointInPolys = gp.sjoin(test, FRC, how='left', op='within')

Unnamed: 0,Date/Time,Lat,Lon,weather,geometry,index_right,BoroCode,BoroName,CountyFIPS,NTACode,NTAName,Shape_Leng,Shape_Area
0,2014-04-01 00:11:00,40.7690,-73.9549,NoRain,POINT (996742.422 219447.424),169.0,1.0,Manhattan,061,MN31,Lenox Hill-Roosevelt Island,39424.225707,2.150862e+07
1,2014-04-01 00:17:00,40.7267,-74.0345,NoRain,POINT (974687.643 204034.843),,,,,,,,
2,2014-04-01 00:21:00,40.7316,-73.9873,NoRain,POINT (987769.797 205818.438),120.0,1.0,Manhattan,061,MN22,East Village,13555.239451,1.089432e+07
3,2014-04-01 00:28:00,40.7588,-73.9776,NoRain,POINT (990455.612 215728.806),82.0,1.0,Manhattan,061,MN17,Midtown-Midtown South,27032.700224,3.019153e+07
4,2014-04-01 00:33:00,40.7594,-73.9722,NoRain,POINT (991951.539 215947.834),167.0,1.0,Manhattan,061,MN19,Turtle Bay-East Midtown,21124.054187,1.738916e+07
5,2014-04-01 00:33:00,40.7383,-74.0403,NoRain,POINT (973082.003 208261.772),,,,,,,,
6,2014-04-01 00:39:00,40.7223,-73.9887,NoRain,POINT (987382.225 202430.106),77.0,1.0,Manhattan,061,MN27,Chinatown,20786.256110,1.450187e+07
7,2014-04-01 00:45:00,40.7620,-73.9790,NoRain,POINT (990067.482 216894.574),82.0,1.0,Manhattan,061,MN17,Midtown-Midtown South,27032.700224,3.019153e+07
8,2014-04-01 00:55:00,40.7524,-73.9960,NoRain,POINT (985358.252 213396.309),122.0,1.0,Manhattan,061,MN13,Hudson Yards-Chelsea-Flatiron-Union Square,45693.074599,3.706838e+07
9,2014-04-01 01:01:00,40.7575,-73.9846,NoRain,POINT (988516.442 215254.755),82.0,1.0,Manhattan,061,MN17,Midtown-Midtown South,27032.700224,3.019153e+07


In [42]:
pointInPolys.head(3)

Unnamed: 0,Date/Time,Lat,Lon,weather,geometry,index_right,BoroCode,BoroName,CountyFIPS,NTACode,NTAName,Shape_Leng,Shape_Area
0,2014-04-01 00:11:00,40.769,-73.9549,NoRain,POINT (996742.422 219447.424),169.0,1.0,Manhattan,61.0,MN31,Lenox Hill-Roosevelt Island,39424.225707,21508620.0
1,2014-04-01 00:17:00,40.7267,-74.0345,NoRain,POINT (974687.643 204034.843),,,,,,,,
2,2014-04-01 00:21:00,40.7316,-73.9873,NoRain,POINT (987769.797 205818.438),120.0,1.0,Manhattan,61.0,MN22,East Village,13555.239451,10894320.0


In [43]:
pointInPolys.BoroName.unique()

array(['Manhattan', nan, 'Queens', 'Brooklyn', 'Bronx', 'Staten Island'],
      dtype=object)

In [53]:
pointInPolys.NTAName.nunique()

192

In [32]:
df.head(3)

Unnamed: 0,Date/Time,Lat,Lon,weather
0,2014-04-01 00:11:00,40.769,-73.9549,NoRain
1,2014-04-01 00:17:00,40.7267,-74.0345,NoRain
2,2014-04-01 00:21:00,40.7316,-73.9873,NoRain


In [41]:
FRC.head(3)

Unnamed: 0,BoroCode,BoroName,CountyFIPS,NTACode,NTAName,Shape_Leng,Shape_Area,geometry
0,3,Brooklyn,47,BK88,Borough Park,39247.227831,54005020.0,"POLYGON ((990897.900 169268.121, 990588.252 16..."
1,4,Queens,81,QN51,Murray Hill,33266.904797,52488280.0,"POLYGON ((1038593.459 221913.355, 1039369.281 ..."
2,4,Queens,81,QN27,East Elmhurst,19816.712074,19726850.0,"POLYGON ((1022728.275 217530.808, 1023052.645 ..."


In [31]:
FRC.BoroName.unique()

array(['Brooklyn', 'Queens', 'Manhattan', 'Bronx', 'Staten Island'],
      dtype=object)

In [49]:
pointInPolys.shape

(1880795, 13)

In [50]:
pointInPolys.head(3)

Unnamed: 0,Date/Time,Lat,Lon,weather,geometry,index_right,BoroCode,BoroName,CountyFIPS,NTACode,NTAName,Shape_Leng,Shape_Area
0,2014-04-01 00:11:00,40.769,-73.9549,NoRain,POINT (996742.422 219447.424),169.0,1.0,Manhattan,61.0,MN31,Lenox Hill-Roosevelt Island,39424.225707,21508620.0
1,2014-04-01 00:17:00,40.7267,-74.0345,NoRain,POINT (974687.643 204034.843),,,,,,,,
2,2014-04-01 00:21:00,40.7316,-73.9873,NoRain,POINT (987769.797 205818.438),120.0,1.0,Manhattan,61.0,MN22,East Village,13555.239451,10894320.0


In [54]:
pointInPolys.isnull().sum()

Date/Time          0
Lat                0
Lon                0
weather            0
geometry           0
index_right    46666
BoroCode       46666
BoroName       46666
CountyFIPS     46666
NTACode        46666
NTAName        46666
Shape_Leng     46666
Shape_Area     46666
dtype: int64

In [61]:
final_data = pointInPolys.dropna()[['Date/Time', 'weather', 'NTAName']]
final_data

Unnamed: 0,Date/Time,weather,NTAName
0,2014-04-01 00:11:00,NoRain,Lenox Hill-Roosevelt Island
2,2014-04-01 00:21:00,NoRain,East Village
3,2014-04-01 00:28:00,NoRain,Midtown-Midtown South
4,2014-04-01 00:33:00,NoRain,Turtle Bay-East Midtown
6,2014-04-01 00:39:00,NoRain,Chinatown
7,2014-04-01 00:45:00,NoRain,Midtown-Midtown South
8,2014-04-01 00:55:00,NoRain,Hudson Yards-Chelsea-Flatiron-Union Square
9,2014-04-01 01:01:00,NoRain,Midtown-Midtown South
10,2014-04-01 01:19:00,NoRain,East Village
11,2014-04-01 01:48:00,NoRain,Turtle Bay-East Midtown


In [64]:
pointInPolys.to_csv('final-uber-trip-data.csv')