In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [25]:
trip = pd.read_csv('trip.csv')
station = pd.read_csv('station.csv')
weather = pd.read_csv('weather.csv')

## Exploration

In [4]:
trip.shape

(669959, 11)

In [9]:
trip.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127
1,4607,70,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138
2,4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214
3,4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060
4,4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103


In [10]:
station.head()

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [11]:
weather.head()

Unnamed: 0,date,max_temperature_f,mean_temperature_f,min_temperature_f,max_dew_point_f,mean_dew_point_f,min_dew_point_f,max_humidity,mean_humidity,min_humidity,...,mean_visibility_miles,min_visibility_miles,max_wind_Speed_mph,mean_wind_speed_mph,max_gust_speed_mph,precipitation_inches,cloud_cover,events,wind_dir_degrees,zip_code
0,8/29/2013,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,57.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107
1,8/30/2013,78.0,69.0,60.0,61.0,58.0,56.0,90.0,70.0,50.0,...,10.0,7.0,29.0,13.0,35.0,0,2.0,,291.0,94107
2,8/31/2013,71.0,64.0,57.0,57.0,56.0,54.0,93.0,75.0,57.0,...,10.0,10.0,26.0,15.0,31.0,0,4.0,,284.0,94107
3,9/1/2013,74.0,66.0,58.0,60.0,56.0,53.0,87.0,68.0,49.0,...,10.0,10.0,25.0,13.0,29.0,0,4.0,,284.0,94107
4,9/2/2013,75.0,69.0,62.0,61.0,60.0,58.0,93.0,77.0,61.0,...,10.0,6.0,23.0,12.0,30.0,0,6.0,,277.0,94107


In [13]:
#check if id is unique
result = trip.groupby('id')['start_date'].count().sort_values(ascending = False)
#all equal to 1, is unique
result.head()

id
913460    1
305542    1
305569    1
305568    1
305567    1
Name: start_date, dtype: int64

Check for missing values

In [8]:
trip.isnull().sum()

id                       0
duration                 0
start_date               0
start_station_name       0
start_station_id         0
end_date                 0
end_station_name         0
end_station_id           0
bike_id                  0
subscription_type        0
zip_code              6619
dtype: int64

In [15]:
weather.isnull().sum()

date                                 0
max_temperature_f                    4
mean_temperature_f                   4
min_temperature_f                    4
max_dew_point_f                     54
mean_dew_point_f                    54
min_dew_point_f                     54
max_humidity                        54
mean_humidity                       54
min_humidity                        54
max_sea_level_pressure_inches        1
mean_sea_level_pressure_inches       1
min_sea_level_pressure_inches        1
max_visibility_miles                13
mean_visibility_miles               13
min_visibility_miles                13
max_wind_Speed_mph                   1
mean_wind_speed_mph                  1
max_gust_speed_mph                 899
precipitation_inches                 1
cloud_cover                          1
events                            3143
wind_dir_degrees                     1
zip_code                             0
dtype: int64

## Merge Data

In [36]:
trip.dtypes

id                     int64
duration               int64
start_date            object
start_station_name    object
start_station_id       int64
end_date              object
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object

In [66]:
station.dtypes

id                     int64
name                  object
lat                  float64
long                 float64
dock_count             int64
city                  object
installation_date     object
dtype: object

In [67]:
weather.dtypes

date                               object
max_temperature_f                 float64
mean_temperature_f                float64
min_temperature_f                 float64
max_dew_point_f                   float64
mean_dew_point_f                  float64
min_dew_point_f                   float64
max_humidity                      float64
mean_humidity                     float64
min_humidity                      float64
max_sea_level_pressure_inches     float64
mean_sea_level_pressure_inches    float64
min_sea_level_pressure_inches     float64
max_visibility_miles              float64
mean_visibility_miles             float64
min_visibility_miles              float64
max_wind_Speed_mph                float64
mean_wind_speed_mph               float64
max_gust_speed_mph                float64
precipitation_inches               object
cloud_cover                       float64
events                             object
wind_dir_degrees                  float64
zip_code                          

In [3]:
###Transform trip data
##Get rid of customer zipcode since it is not realiable and will not be used for analysis
df1 = trip.drop(columns = ['zip_code'])
##Transform start and end date to datetime objects
df1['start_date'] = df1['start_date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M'))
df1['end_date'] = df1['end_date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M'))
##Extracc only y/m/d to join with weather data later on
df1['date_for_join'] = df1['start_date'].apply(lambda x: x.strftime('%Y-%m-%d'))
df1['date_for_join'] = df1['date_for_join'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

In [5]:
###Join trip data with station data
##Transform the "city" column into corrsponding zipcode
##As author of the dataset suggests, 94107=San Francisco, 94063=Redwood City, 94301=Palo Alto, 94041=Mountain View, 95113= San Jose
city_zip = pd.DataFrame({'city': ['San Jose', 'Redwood City', 'Mountain View', 'Palo Alto','San Francisco'], \
                         'zip_for_join': [95113,94063,94041,94301,94107]})
merge1 = station.merge(city_zip, how = 'left', left_on = 'city', right_on = 'city')

In [6]:
merge2 = merge1.copy()
merge2.columns = ['start_station_id','start_name','start_lat','start_long','start_dock_count','start_city','start_installation_date','start_zip']

In [7]:
merge3 = merge1.copy()
merge3.columns =  ['end_station_id','end_name','end_lat','end_long','end_dock_count','end_city','end_installation_date','end_zip']

In [40]:
merge4 = df1.merge(merge2, how = 'left', left_on = 'start_station_id',right_on = 'start_station_id')

In [41]:
merge5 = merge4.merge(merge3,how = 'left', left_on = 'end_station_id',right_on = 'end_station_id' )

In [42]:
merge6 = merge5.drop(columns = ['start_name','end_name'])

In [45]:
###Transform Weather data
##Transform date to date object
weather['date'] = weather['date'].apply(lambda x: datetime.strptime(x,'%m/%d/%Y'))

TypeError: strptime() argument 1 must be str, not Timestamp

In [46]:
start_weather = weather.copy()
columns = list(start_weather.columns)
new_columns = []
for i in columns:
    i = 'start_' + i
    new_columns.append(i)
start_weather.columns = new_columns

In [50]:
end_weather = weather.copy()
columns = list(end_weather.columns)
new_columns = []
for i in columns:
    i = 'end_' + i
    new_columns.append(i)
end_weather.columns = new_columns

In [47]:
merge7 = merge6.merge(start_weather, how = 'left', left_on = ['date_for_join','start_zip'], \
                      right_on = ['start_date','start_zip_code'])

In [53]:
merge8 = merge7.merge(end_weather,how = 'left', left_on = ['date_for_join','end_zip'], \
                      right_on = ['end_date','end_zip_code'])

In [54]:
merge8.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,4576,4607,4130,4251,4299
duration,63,70,71,77,83
start_date_x,2013-08-29 14:13:00,2013-08-29 14:42:00,2013-08-29 10:16:00,2013-08-29 11:29:00,2013-08-29 12:02:00
start_station_name,South Van Ness at Market,San Jose City Hall,Mountain View City Hall,San Jose City Hall,South Van Ness at Market
start_station_id,66,10,27,10,66
end_date_x,2013-08-29 14:14:00,2013-08-29 14:43:00,2013-08-29 10:17:00,2013-08-29 11:30:00,2013-08-29 12:04:00
end_station_name,South Van Ness at Market,San Jose City Hall,Mountain View City Hall,San Jose City Hall,Market at 10th
end_station_id,66,10,27,10,67
bike_id,520,661,48,26,319
subscription_type,Subscriber,Subscriber,Subscriber,Subscriber,Subscriber


In [57]:
merge9 = merge8.drop(columns = ['end_zip_code','end_date_y','start_date_y',\
                                'start_zip_code','date_for_join'])
merge9.rename(columns={'start_date_x':'start_date','end_date_x':'end_date'}, inplace=True)

## Clean merged sets

In [73]:
merge9.shape

(669959, 66)

In [83]:
na_list = pd.DataFrame(merge9.isnull().sum())
na_list['column_name'] = na_list.index
na_list.columns = ['count_na','column_name']
na_column = na_list[na_list['count_na']>0]

In [84]:
na_column.sort_values(by = 'count_na')

Unnamed: 0,count_na,column_name
start_wind_dir_degrees,14,start_wind_dir_degrees
end_precipitation_inches,14,end_precipitation_inches
end_mean_wind_speed_mph,14,end_mean_wind_speed_mph
end_max_wind_Speed_mph,14,end_max_wind_Speed_mph
end_min_sea_level_pressure_inches,14,end_min_sea_level_pressure_inches
end_mean_sea_level_pressure_inches,14,end_mean_sea_level_pressure_inches
end_max_sea_level_pressure_inches,14,end_max_sea_level_pressure_inches
end_cloud_cover,14,end_cloud_cover
start_cloud_cover,14,start_cloud_cover
start_precipitation_inches,14,start_precipitation_inches


In [75]:
merge9['start_events'] = merge9['start_events'].fillna('no special events')

In [78]:
merge9['end_events'] = merge9['end_events'].fillna('no special events')

In [82]:
merge9 = merge9.drop(columns = ['start_max_gust_speed_mph','end_max_gust_speed_mph'])

In [93]:
## Removing 0.07% of records that has missing values
merge10 = merge9.dropna()

In [95]:
merge10.shape

(669431, 64)

In [98]:
merge10.to_csv('SF_Bay_Area_Bike_Share_Data_Cleaned.csv', index = False)