## csv cleaning code
this code will take in the csv files, then transform and merge them so that the end product is a single dataframe with:
- datetime
- pickup longitude
- pickup latitude 
- pickup zip code
- service (uber, yellow cab, or green cab)
- pickup day of week
- pickup hour

The code was built off of a small set of the data that will be fed into the final system but it should scale up to work with the full data.

In [18]:
# Load libraries

from datetime import datetime, date, timedelta
import csv
import pandas as pd 
import plotly.graph_objs as go
import plotly.plotly as py
import numpy as np
py.sign_in('ajbentley', 'zjfu2vasav')


In [19]:
# Load the data

uber = pd.read_csv("../../3.0-hackathon/uber-files/uber-raw-apr-sep14-1000.csv")
green = pd.read_csv("../../3.0-hackathon/uber-files/2014_Green_Taxi_Trip_Data-1000.csv")
yellow = pd.read_csv("../../3.0-hackathon/uber-files/nyc_yellow_apr-sep14_5percent-1000.csv")

In [20]:
# see general information about the uber file

print uber.info()
print ' '
print 'describe'
print uber.describe()
uber.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
Date/Time    999 non-null object
Lat          999 non-null float64
Lon          999 non-null float64
Base         999 non-null object
dtypes: float64(2), object(2)
memory usage: 31.3+ KB
None
 
describe
              Lat         Lon
count  999.000000  999.000000
mean    40.748495  -73.983836
std      0.034131    0.062267
min     40.608600  -74.420000
25%     40.728550  -73.999100
50%     40.753800  -73.983500
75%     40.767450  -73.969850
max     40.985900  -73.420200


Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/10/14 0:11,40.769,-73.9549,B02512
1,6/27/14 0:17,40.7267,-74.0345,B02512
2,5/31/14 0:21,40.7316,-73.9873,B02512
3,4/24/14 0:28,40.7588,-73.9776,B02512
4,4/17/14 0:33,40.7594,-73.9722,B02512


In [21]:
# see general information about the green file

print green.info()
print ' '
print 'describe'
print green.describe()
green.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 3 columns):
pickup_datetime     998 non-null object
Pickup_longitude    998 non-null float64
Pickup_latitude     998 non-null float64
dtypes: float64(2), object(1)
memory usage: 23.5+ KB
None
 
describe
       Pickup_longitude  Pickup_latitude
count        998.000000       998.000000
mean         -73.928581        40.746655
std            0.044047         0.054688
min          -74.149689        40.576584
25%          -73.957966        40.711702
50%          -73.941921        40.745770
75%          -73.903019        40.789987
max          -73.807678        40.903412


Unnamed: 0,pickup_datetime,Pickup_longitude,Pickup_latitude
0,7/7/14 0:42,-74.149689,40.903412
1,5/5/14 16:24,-74.028282,40.630142
2,6/1/14 3:52,-74.027367,40.622341
3,5/7/14 2:16,-74.026443,40.629539
4,6/25/14 20:01,-74.026405,40.629505


In [22]:
# see general information about the yellow file

print yellow.info()
print ' '
print 'describe'
print yellow.describe()
yellow.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
pickup_datetime     1000 non-null object
Pickup_longitude    1000 non-null float64
Pickup_latitude     1000 non-null float64
dtypes: float64(2), object(1)
memory usage: 23.5+ KB
None
 
describe
       Pickup_longitude  Pickup_latitude
count       1000.000000      1000.000000
mean         -72.716901        40.058339
std            9.567608         5.270644
min          -74.108024         0.000000
25%          -73.992579        40.734042
50%          -73.981919        40.754227
75%          -73.964593        40.767980
max            0.000000        40.986185


Unnamed: 0,pickup_datetime,Pickup_longitude,Pickup_latitude
0,4/17/14 23:52,-73.986722,40.719552
1,6/26/14 23:26,-73.976365,40.752602
2,4/15/14 21:04,-73.931985,40.744907
3,9/18/14 6:29,-73.9858,40.77797
4,7/26/14 1:55,-74.004337,40.721423


In [23]:
# clean uber
# remove extraneous column
uber.pop('Base');

# reorder and rename columns
uber = uber[['Date/Time','Lon','Lat']]
uber.rename (columns={'Date/Time':'pu_date_time', 'Lon':'pu_lon',\
                      'Lat':'pu_lat'},\
            inplace=True)

# convert first column to datetime
uber.pu_date_time = pd.to_datetime(uber.pu_date_time)

# add zip code column
uber['pu_zip'] = np.random.choice(range(10001, 11499), uber.shape[0],\
                                 np.random.seed(43))

# add service column
uber['service'] = 'uber'

uber.head()

Unnamed: 0,pu_date_time,pu_lon,pu_lat,pu_zip,service
0,2014-06-10 00:11:00,-73.9549,40.769,10549,uber
1,2014-06-27 00:17:00,-74.0345,40.7267,10467,uber
2,2014-05-31 00:21:00,-73.9873,40.7316,11325,uber
3,2014-04-24 00:28:00,-73.9776,40.7588,11099,uber
4,2014-04-17 00:33:00,-73.9722,40.7594,10436,uber


In [24]:
# break out day of week and hour from datetime

uberdow = []
uberhour = []
uberdowhour = []

for n in uber.pu_date_time:
    uberdow.append(n.dayofweek)
    uberhour.append(n.hour)
    uberdowhour.append((n.dayofweek, n.hour))
    
pu_dow = pd.Series(uberdow)
pu_hour = pd.Series(uberhour)
pu_dowhour = pd.Series(uberdowhour)

# add dow and hour columns to dataframe

uber = pd.concat([uber, pu_dow], axis = 1)
uber.rename(columns={0:'pu_dow'}, inplace=True)
uber = pd.concat([uber, pu_hour], axis = 1)
uber.rename(columns={0:'pu_hour'}, inplace=True)
uber = pd.concat([uber, pu_dowhour], axis = 1)
uber.rename(columns={0:'pu_dowhour'}, inplace=True)


uber.head()

Unnamed: 0,pu_date_time,pu_lon,pu_lat,pu_zip,service,pu_dow,pu_hour,pu_dowhour
0,2014-06-10 00:11:00,-73.9549,40.769,10549,uber,1,0,"(1, 0)"
1,2014-06-27 00:17:00,-74.0345,40.7267,10467,uber,4,0,"(4, 0)"
2,2014-05-31 00:21:00,-73.9873,40.7316,11325,uber,5,0,"(5, 0)"
3,2014-04-24 00:28:00,-73.9776,40.7588,11099,uber,3,0,"(3, 0)"
4,2014-04-17 00:33:00,-73.9722,40.7594,10436,uber,3,0,"(3, 0)"


In [25]:
green.head()

Unnamed: 0,pickup_datetime,Pickup_longitude,Pickup_latitude
0,7/7/14 0:42,-74.149689,40.903412
1,5/5/14 16:24,-74.028282,40.630142
2,6/1/14 3:52,-74.027367,40.622341
3,5/7/14 2:16,-74.026443,40.629539
4,6/25/14 20:01,-74.026405,40.629505


In [26]:
# clean green

# rename columns
green.rename (columns={'pickup_datetime':'pu_date_time',\
                       'Pickup_longitude':'pu_lon','Pickup_latitude':'pu_lat'},\
            inplace=True)

# convert first column to datetime
green.pu_date_time = pd.to_datetime(green.pu_date_time)

# add zip code column
green['pu_zip'] = np.random.choice(range(10001, 11499), green.shape[0],\
                                 np.random.seed(23))

# add service company column
green['service'] = 'green'

green.head()

Unnamed: 0,pu_date_time,pu_lon,pu_lat,pu_zip,service
0,2014-07-07 00:42:00,-74.149689,40.903412,10445,green
1,2014-05-05 16:24:00,-74.028282,40.630142,10455,green
2,2014-06-01 03:52:00,-74.027367,40.622341,10597,green
3,2014-05-07 02:16:00,-74.026443,40.629539,10444,green
4,2014-06-25 20:01:00,-74.026405,40.629505,11158,green


In [27]:
# break out day of week and hour from datetime

greendow = []
greenhour = []
greendowhour = []

for n in green.pu_date_time:
    greendow.append(n.dayofweek)
    greenhour.append(n.hour)
    greendowhour.append((n.dayofweek, n.hour))
    
pu_dow = pd.Series(greendow)
pu_hour = pd.Series(greenhour)
pu_dowhour= pd.Series(greendowhour)

# add dow and hour columns to dataframe

green = pd.concat([green, pu_dow], axis = 1)
green.rename(columns={0:'pu_dow'}, inplace=True)
green = pd.concat([green, pu_hour], axis = 1)
green.rename(columns={0:'pu_hour'}, inplace=True)
green = pd.concat([green, pu_dowhour], axis = 1)
green.rename(columns={0:'pu_dowhour'}, inplace=True)

green.head()

Unnamed: 0,pu_date_time,pu_lon,pu_lat,pu_zip,service,pu_dow,pu_hour,pu_dowhour
0,2014-07-07 00:42:00,-74.149689,40.903412,10445,green,0,0,"(0, 0)"
1,2014-05-05 16:24:00,-74.028282,40.630142,10455,green,0,16,"(0, 16)"
2,2014-06-01 03:52:00,-74.027367,40.622341,10597,green,6,3,"(6, 3)"
3,2014-05-07 02:16:00,-74.026443,40.629539,10444,green,2,2,"(2, 2)"
4,2014-06-25 20:01:00,-74.026405,40.629505,11158,green,2,20,"(2, 20)"


In [28]:
# clean yellow

### will need to change when get real data
yellow.rename(columns={'pickup_datetime':'pu_date_time',\
                       'Pickup_longitude':'pu_lon','Pickup_latitude':'pu_lat'},\
              inplace=True)

# rename columns
yellow.rename (columns={'pickup_datetime':'pu_date_time',\
                       'Pickup_longitude':'pu_lon','Pickup_latitude':'pu_lat'},\
            inplace=True)

# convert first column to datetime
yellow.pu_date_time = pd.to_datetime(yellow.pu_date_time)

# add zip code column
yellow['pu_zip'] = np.random.choice(range(10001, 11499), yellow.shape[0],\
                                 np.random.seed(73))

# add service company column
yellow['service'] = 'yellow'
              
yellow.head()

Unnamed: 0,pu_date_time,pu_lon,pu_lat,pu_zip,service
0,2014-04-17 23:52:00,-73.986722,40.719552,11007,yellow
1,2014-06-26 23:26:00,-73.976365,40.752602,11221,yellow
2,2014-04-15 21:04:00,-73.931985,40.744907,10198,yellow
3,2014-09-18 06:29:00,-73.9858,40.77797,10738,yellow
4,2014-07-26 01:55:00,-74.004337,40.721423,10121,yellow


In [29]:
# break out day of week and hour from datetime

ydow = []
yhour = []
ydowhour = []

for n in yellow.pu_date_time:
    ydow.append(n.dayofweek)
    yhour.append(n.hour)
    ydowhour.append((n.dayofweek, n.hour))
    
pu_dow = pd.Series(ydow)
pu_hour = pd.Series(yhour)
pu_dowhour = pd.Series(ydowhour)


# add dow and hour columns to dataframe

yellow = pd.concat([yellow, pu_dow], axis = 1)
yellow.rename(columns={0:'pu_dow'}, inplace=True)
yellow = pd.concat([yellow, pu_hour], axis = 1)
yellow.rename(columns={0:'pu_hour'}, inplace=True)
yellow = pd.concat([yellow, pu_dowhour], axis = 1)
yellow.rename(columns={0:'pu_dowhour'}, inplace=True)


yellow.head()

Unnamed: 0,pu_date_time,pu_lon,pu_lat,pu_zip,service,pu_dow,pu_hour,pu_dowhour
0,2014-04-17 23:52:00,-73.986722,40.719552,11007,yellow,3,23,"(3, 23)"
1,2014-06-26 23:26:00,-73.976365,40.752602,11221,yellow,3,23,"(3, 23)"
2,2014-04-15 21:04:00,-73.931985,40.744907,10198,yellow,1,21,"(1, 21)"
3,2014-09-18 06:29:00,-73.9858,40.77797,10738,yellow,3,6,"(3, 6)"
4,2014-07-26 01:55:00,-74.004337,40.721423,10121,yellow,5,1,"(5, 1)"


In [30]:
# merge all three files

df = pd.concat((uber, green, yellow), axis=0)


In [31]:
# see general information about the df file

print df.info()
print ' '
print 'describe'
print df.describe()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2997 entries, 0 to 999
Data columns (total 8 columns):
pu_date_time    2997 non-null datetime64[ns]
pu_lon          2997 non-null float64
pu_lat          2997 non-null float64
pu_zip          2997 non-null int64
service         2997 non-null object
pu_dow          2997 non-null int64
pu_hour         2997 non-null int64
pu_dowhour      2997 non-null object
dtypes: datetime64[ns](1), float64(2), int64(3), object(2)
memory usage: 210.7+ KB
None
 
describe
            pu_lon       pu_lat        pu_zip       pu_dow      pu_hour
count  2997.000000  2997.000000   2997.000000  2997.000000  2997.000000
mean    -73.542701    40.517600  10751.247915     3.038705    12.716717
std       5.555832     3.061049    432.323792     1.948906     6.393330
min     -74.420000     0.000000  10001.000000     0.000000     0.000000
25%     -73.988700    40.722878  10379.000000     1.000000     8.000000
50%     -73.972700    40.750637  10757.000000     3.000000   

Unnamed: 0,pu_date_time,pu_lon,pu_lat,pu_zip,service,pu_dow,pu_hour,pu_dowhour
0,2014-06-10 00:11:00,-73.9549,40.769,10549,uber,1,0,"(1, 0)"
1,2014-06-27 00:17:00,-74.0345,40.7267,10467,uber,4,0,"(4, 0)"
2,2014-05-31 00:21:00,-73.9873,40.7316,11325,uber,5,0,"(5, 0)"
3,2014-04-24 00:28:00,-73.9776,40.7588,11099,uber,3,0,"(3, 0)"
4,2014-04-17 00:33:00,-73.9722,40.7594,10436,uber,3,0,"(3, 0)"


In [34]:
# groupby day of week

dow_gb = df.groupby(['pu_dow'])['pu_zip'].count()

df2.head()

pu_dow
0    402
1    393
2    422
3    477
4    470
Name: pu_zip, dtype: int64

In [36]:
# groupby day of week and hour

dowhour_gb = df.groupby(['pu_dowhour'])['pu_zip'].count()

df2.head

<bound method Series.head of pu_dowhour
(0, 0)     14
(0, 1)      9
(0, 2)      7
(0, 3)      7
(0, 4)      7
(0, 5)     13
(0, 6)     11
(0, 7)     16
(0, 8)     15
(0, 9)     27
(0, 10)    14
(0, 11)    19
(0, 12)    17
(0, 13)    23
(0, 14)    27
(0, 15)    20
(0, 16)    16
(0, 17)    26
(0, 18)    25
(0, 19)    23
(0, 20)    17
(0, 21)    19
(0, 22)    20
(0, 23)    10
(1, 0)      8
(1, 1)      5
(1, 2)      6
(1, 3)      6
(1, 4)      6
(1, 5)     12
           ..
(5, 18)    24
(5, 19)    28
(5, 20)    20
(5, 21)    18
(5, 22)    17
(5, 23)    14
(6, 0)     13
(6, 1)     14
(6, 2)     17
(6, 3)     10
(6, 4)      7
(6, 5)     14
(6, 6)     17
(6, 7)     13
(6, 8)     12
(6, 9)     12
(6, 10)    13
(6, 11)    17
(6, 12)    17
(6, 13)    20
(6, 14)    22
(6, 15)    22
(6, 16)    24
(6, 17)    25
(6, 18)    21
(6, 19)    23
(6, 20)    15
(6, 21)    18
(6, 22)    14
(6, 23)    14
Name: pu_zip, dtype: int64>