## Data extraction (continued)

## Step 2: Using the master list of ids, pull the attributes for each bike id
## Step 3: Use the Python wrapper pygeocoder to get geographical attributes for a given latitude and longitude


In [1]:
import numpy as np
import pandas as pd
#package for JSON interaction
import json
#package to interact with URL's
import urllib2

#package for using the Google API
import pygeocoder as geo

#package to get nearby places
import googleplaces as places

#time to sleep
import time
#notebook display settings: wide console output, long cell text, up to 100 columns, HTML tables
pd.options.display.width = 500
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 100
pd.options.display.notebook_repr_html = True

In [2]:
#read the master list of bike ids from bike_id_master.csv
id_master = pd.read_csv(filepath_or_buffer='bike_id_master.csv')

In [3]:
#removing unnecessary columns
#'Unnamed: 0' is presumably an index column saved by an earlier to_csv - TODO confirm.
#A column mask is used instead of drop() so that re-running this cell is a no-op
#rather than an error once the column is already gone.
id_master = id_master.loc[:, id_master.columns != 'Unnamed: 0']

In [17]:
#sanity check: number of distinct bike ids in the master list
len(pd.unique(id_master['id']))

59624

The section below was added as a fail-safe mechanism, because pulling the data is time consuming. When resuming an interrupted pull, uncomment it: it builds `id_df`, the list of ids that have not been pulled yet, so that the data pull resumes for just those ids.

In [2]:
#df_id_loaded = pd.read_csv("bike_details_csv.csv")
#id_loaded = df_id_loaded.id.unique()
#id = set(id_master.id) - set(id_loaded)
#id_df = pd.DataFrame(pd.Series(list(id),name='id'))
#sorting column to maintain integrity
#id_df.sort(columns='id',inplace=True)

The function below takes the API URL of a single bike id and parses all of that bike's attributes from Bike Index.

In [12]:
#function to parse one bike record
#a missing or null stolen record is handled by emitting empty stolen columns
def bike_id_parser(url):
    """Fetch one bike from its API url and flatten it into a DataFrame.

    Parameters
    ----------
    url : str
        Url returning a JSON document that has a top-level 'bike' object.

    Returns
    -------
    pandas.DataFrame
        The selected bike attributes, 'frame_colors' rendered as a string,
        the renamed stolen-record fields, the reverse-geocoded 'sder_*'
        fields and an '*_epoch' copy of each of the four date columns.
        The date columns themselves are rewritten as
        'YYYY-MM-DD HH:MM:SS' strings in UTC.

    Raises
    ------
    Whatever urllib2, json or pygeocoder raise; nothing is swallowed here.
    """
    bike_columns = ['id','title','serial','manufacturer_name','frame_model','year','thumb','large_img','is_stock_img','stolen','stolen_location','date_stolen','registration_created_at','registration_updated_at','url','api_url','manufacturer_id','paint_description','name','frame_size','description','rear_tire_narrow','front_tire_narrow','type_of_cycle','test_bike','rear_wheel_size_iso_bsd','front_wheel_size_iso_bsd','handlebar_type_slug','frame_material_slug','front_gear_type_slug','rear_gear_type_slug']
    #attributes read from the geocoder result; each is stored as 'sder_<name>' (der = derived)
    geo_attributes = ('formatted_address','street_number','route','postal_code','neighborhood','city','county','state','country')
    #column layout used when the bike has no stolen record (names after the rename below)
    stolen_columns = ['stolen_id','s_date_stolen','s_location','latitude','longitude','theft_description','locking_description','lock_defeat_description','police_report_number','police_report_department','s_rec_created_at','create_open311'] + ['sder_' + attribute for attribute in geo_attributes]
    date_columns = ['date_stolen','registration_created_at','registration_updated_at','s_rec_created_at']

    def epoch_to_utc_string(seconds):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(seconds))

    #NOTE: requests are not throttled; add a time.sleep(...) here if rate limits are hit
    response = urllib2.urlopen(url)
    try:
        #load json from URL, dict returned
        data = json.load(response)
    finally:
        #always release the connection, even when the payload is not valid JSON
        response.close()

    data_bikes = data['bike']
    bike_df = pd.DataFrame([data_bikes], columns=bike_columns)
    bike_df['frame_colors'] = str(data_bikes['frame_colors'])

    if bike_df.shape[0] > 1:
        print('%s creating multiple rows in df' % (url))

    #.get() so that an absent key is treated exactly like an explicit null
    stolen_dict = data_bikes.get('stolen_record')
    if stolen_dict is not None:
        bike_stolen_df = pd.DataFrame([stolen_dict])
        #prefix the names that would otherwise collide with the bike columns
        bike_stolen_df = bike_stolen_df.rename(columns={'id':'stolen_id','date_stolen':'s_date_stolen','location':'s_location','created_at':'s_rec_created_at'})

        #using pygeocoder to get stolen location attributes based on lat, long
        lat = bike_stolen_df['latitude'].values[0]
        lng = bike_stolen_df['longitude'].values[0]
        results = None
        #notnull rejects both None and NaN coordinates
        if pd.notnull(lat) and pd.notnull(lng):
            results = geo.Geocoder.reverse_geocode(float(lat), float(lng))
        for attribute in geo_attributes:
            bike_stolen_df['sder_' + attribute] = getattr(results, attribute) if results is not None else None
    else:
        bike_stolen_df = pd.DataFrame(columns=stolen_columns)

    #bike and stolen attributes side by side in one row
    final_df = pd.concat([bike_df, bike_stolen_df], axis=1)

    #bikes without a theft have no stolen dates; 0 keeps the conversion below well defined
    for column in ('date_stolen', 's_rec_created_at'):
        final_df[column] = final_df[column].fillna(0)

    for column in date_columns:
        #keep the raw epoch value, then replace the column with a readable UTC timestamp
        final_df[column + '_epoch'] = final_df[column]
        final_df[column] = final_df[column].map(epoch_to_utc_string)

    return final_df


The code below loops through the list of ids still to be pulled (`id_df`), parses each bike and appends it to `bike_details_csv.csv`.

In [13]:
#variable defining the order of fields written to the csv
col_order = ['id','title','serial','manufacturer_name','frame_model','year','thumb','large_img','is_stock_img','stolen','stolen_location','date_stolen','registration_created_at','registration_updated_at','url','api_url','manufacturer_id','paint_description','name','frame_size','description','rear_tire_narrow','front_tire_narrow','type_of_cycle','test_bike','rear_wheel_size_iso_bsd','front_wheel_size_iso_bsd','handlebar_type_slug','frame_material_slug','front_gear_type_slug','rear_gear_type_slug','stolen_id','s_date_stolen','s_location','latitude','longitude','theft_description','locking_description','lock_defeat_description','police_report_number','police_report_department','s_rec_created_at','create_open311','sder_formatted_address','sder_street_number','sder_route','sder_postal_code','sder_neighborhood','sder_city','sder_county','sder_state','sder_country','date_stolen_epoch','registration_created_at_epoch','registration_updated_at_epoch','s_rec_created_at_epoch']

#id_df only exists when the (commented-out) resume cell was run;
#on a fresh kernel fall back to pulling every id of the master list
if 'id_df' not in globals():
    id_df = id_master

#in-memory copy of the records parsed by this run (comment out when not running from scratch)
bike_df = pd.DataFrame(columns=col_order)
#the csv header is written in front of the first record only;
#derived from bike_df so it stays correct if the reset above is commented out
write_header = bike_df.shape[0] == 0
#frames are collected here and concatenated once, instead of growing bike_df row by row
parsed_frames = []
#bike id being processed when the run stopped -> exception that stopped it
error_dict = {}
#bound up front so the except handler works even if the failure precedes the first iteration
bike_id = None
try:
    for i, bike_id in enumerate(id_df.id):
        url = 'https://bikeindex.org:443/api/v2/bikes/' + str(bike_id)

        #single-bike dataframe parsed from the url
        temp_bike_df = bike_id_parser(url)

        if temp_bike_df.shape[0] == 0:
            print('record # = %d, bike id = %d' % (i, bike_id))
            print('url returned zero records')
            break

        #the file is reopened per record so every parsed bike is on disk before the next request
        with open('bike_details_csv.csv', 'a') as f:
            temp_bike_df.to_csv(f, header=write_header, encoding='utf8', columns=col_order)
        write_header = False
        parsed_frames.append(temp_bike_df)

        #inserting print to get an indication of number of records parsed so far
        rec_no = i + 1
        if rec_no % 100 == 0:
            print('Records parsed = %d' % (rec_no))
except Exception as e:
    #the first failure stops the pull; the resume cell is used to continue from the csv
    error_dict[bike_id] = e
    print(error_dict)
finally:
    #make the parsed records available in bike_df however the loop ended
    if parsed_frames:
        bike_df = pd.concat([bike_df] + parsed_frames, ignore_index=True)



Records parsed = 100
Records parsed = 200
Records parsed = 300
Records parsed = 400
Records parsed = 500
Records parsed = 600
Records parsed = 700
Records parsed = 800
Records parsed = 900
Records parsed = 1000
Records parsed = 1100
Records parsed = 1200
Records parsed = 1300
Records parsed = 1400
Records parsed = 1500
Records parsed = 1600
Records parsed = 1700


Once the data is pulled, it is cleaned and written to a CSV file.

In [27]:
#removing unnecessary columns
# NOTE(review): `df` is not defined in any cell shown in this notebook - presumably it is
# the pulled data re-loaded from bike_details_csv.csv; confirm before running on a fresh kernel.
df = df.drop('Unnamed: 0',axis=1)

# NOTE(review): mode 'a' appends to the same file the download loop writes to, and
# header=True adds another header row - confirm that appending (not overwriting) is intended.
with open('bike_details_csv.csv', 'a') as f:
    df.to_csv(f, header=True,encoding='utf8',columns=col_order)