## Step 1: Pull the IDs of all bikes available on BikeIndex.org

In [1]:
import numpy as np
import pandas as pd
#package for JSON interaction
import json
#package to interact with URL's
import urllib2

#time to sleep
import time
#pandas display settings used for the rest of the notebook
for option_name, option_value in [
    ('display.width', 500),
    ('display.max_colwidth', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
]:
    pd.set_option(option_name, option_value)

The function below takes a BikeIndex API url and a page number, and extracts all the required attributes from the response.

In [2]:
#function to pull bike index data
def bike_parser(url,page_no):
    """Fetch one page of BikeIndex search results and return it as a DataFrame.

    Parameters
    ----------
    url : str
        Fully-formed API url of the page to fetch.
    page_no : int
        Page number the url refers to; stored next to every record.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's 'bikes' list, limited to the
        columns named below, plus a 'url' and a 'page_no' column. The frame
        has zero rows when the 'bikes' list is empty.

    Raises
    ------
    KeyError
        If the JSON response has no 'bikes' key.
    """
    #pause before every request (presumably to go easy on the API)
    time.sleep(0.5)
    #open url
    f = urllib2.urlopen(url)
    try:
        #load json from URL, dict returned
        data = json.load(f)
    finally:
        #always release the connection, even when the body is not valid JSON
        f.close()

    data_bikes = data['bikes']
    bike_df = pd.DataFrame(data_bikes,columns=['id','serial','title','manufacturer_name','frame_model','year','frame_colors','is_stock_img','large_img','thumb','stolen','date_stolen','stolen_location'])
    #record where each row came from
    bike_df['url'] = url.encode('utf-8')
    bike_df['page_no'] = page_no

    return bike_df


Creating blank dataframe to house bike index data that is going to be pulled.

In [30]:
#start from the first page and keep going until told otherwise
page_no = 1
cont_flag = True

#empty frame with the columns bike_parser returns (API attributes, then 'url' and 'page_no')
bike_df = pd.DataFrame(columns=[
    'id', 'serial', 'title', 'manufacturer_name', 'frame_model', 'year',
    'frame_colors', 'is_stock_img', 'large_img', 'thumb', 'stolen',
    'date_stolen', 'stolen_location', 'url', 'page_no',
])

Next, loop through all the pages on BikeIndex and store the data in a csv file. This process takes about an hour to complete. Every now and then the API tends to go down, in which case the process has to be resumed.

In [31]:
#Looping through all pages in bikeindex, extract records one page at a time

#Pages are collected in a list and joined once at the end: DataFrame.append
#re-copies every row on each call and was removed in pandas 2.0.
page_frames = [bike_df]
records_read = bike_df.shape[0]

try:
    while(cont_flag):
        url = 'https://bikeindex.org:443/api/v2/bikes_search?page=' + str(page_no) + '&per_page=80'

        #dataframe with up to 80 bikes (per_page=80) that was read from url
        temp_bike_df = bike_parser(url,page_no)

        #checking # of records in page; an empty page is treated as the end of the results
        if (temp_bike_df.shape[0] == 0):
            print('Reached end of Pages, # of pages read = %d, # of records read = %d'%(page_no,records_read))
            cont_flag = False
            break

        #write each page to disk straight away; the header row goes out with page 1 only
        with open('bike_csv.csv', 'a') as f:
            temp_bike_df.to_csv(f, header=(page_no == 1),encoding='utf8')

        #keeping the page for the final dataframe
        page_frames.append(temp_bike_df)
        records_read += temp_bike_df.shape[0]

        #inserting print to get an indication of number of pages parsed so far
        if page_no%10 == 0:
            print('Pages parsed = %d, Records parsed = %d'%(page_no,records_read))

        page_no = page_no + 1
finally:
    #Runs on failure as well, so bike_df holds every page fetched before
    #page_no and the cell can simply be re-run to resume from page_no.
    bike_df = pd.concat(page_frames,ignore_index=True)


Pages parsed = 10, Records parsed = 800
Pages parsed = 20, Records parsed = 1600
Pages parsed = 30, Records parsed = 2400
Pages parsed = 40, Records parsed = 3200
Pages parsed = 50, Records parsed = 4000
Pages parsed = 60, Records parsed = 4800
Pages parsed = 70, Records parsed = 5600
Pages parsed = 80, Records parsed = 6400
Pages parsed = 90, Records parsed = 7200
Pages parsed = 100, Records parsed = 8000
Pages parsed = 110, Records parsed = 8800
Pages parsed = 120, Records parsed = 9600
Pages parsed = 130, Records parsed = 10400
Pages parsed = 140, Records parsed = 11200
Pages parsed = 150, Records parsed = 12000
Pages parsed = 160, Records parsed = 12800
Pages parsed = 170, Records parsed = 13600
Pages parsed = 180, Records parsed = 14400
Pages parsed = 190, Records parsed = 15200
Pages parsed = 200, Records parsed = 16000
Pages parsed = 210, Records parsed = 16800
Pages parsed = 220, Records parsed = 17600
Pages parsed = 230, Records parsed = 18400
Pages parsed = 240, Records parse

A check was put in place to ensure that the number of records extracted was approximately the same as the total reported in the `Total` header of the API response.

In [22]:
#checking if total number of records are read

#calling url to get response
f = urllib2.urlopen('https://bikeindex.org:443/api/v2/bikes_search?page=1')
try:
    #read total number of records from header
    Total_rec_response = f.info().getheader('Total')
finally:
    #only the headers are needed, so release the connection straight away
    f.close()

#Both should be (approximately) equal
print('Records in Response header %d,\n Records in final bike df %d'%(int(Total_rec_response),bike_df.shape[0]))


Records in Response header 59724,
 Records in final bike df 59723


The data had duplicates. This is because the API did not keep the same set of records on a given page: the set changed every time a call was made.

In [6]:
#number of distinct bike ids among the rows pulled
bike_df['id'].unique().size

51916

In [82]:
#save the full pull (duplicates included) to disk
bike_df.to_csv(path_or_buf="bike_df.csv", encoding='utf8')


To overcome the duplication problem, the above process was run with different numbers of records per page, and all the resulting datasets were finally combined, eliminating duplicates. This method made it possible to get most of the data available on BikeIndex.

In [32]:
#The api does not return records in the same order on every search. To address this the
#data was pulled several times with different per_page settings, with the intention that
#multiple pulls surface more id's. Each csv below holds one pull.
#NOTE(review): file names suggest per_page of 100, 75, 90 and 80; the page size behind
#old_bike_csv.csv is not recorded here - confirm.
bike1, bike2, bike3, bike4, bike5 = map(pd.read_csv, [
    'old_bike_csv.csv',
    'bike_csv_100.csv',
    'bike_csv_75.csv',
    'bike_csv_90.csv',
    'bike_csv_80.csv',
])

In [34]:
#distinct ids found in each pull, as plain python lists
id1, id2, id3, id4, id5 = [
    frame['id'].unique().tolist()
    for frame in (bike1, bike2, bike3, bike4, bike5)
]

In [35]:
#convert every id to a string (presumably so ids from different files compare alike)
id1 = list(map(str, id1))
id2 = list(map(str, id2))
id3 = list(map(str, id3))
id4 = list(map(str, id4))
id5 = list(map(str, id5))

In [36]:
#Combine the ids from every pull into one list. id5 was previously left out,
#so the ids in bike_csv_80.csv never reached the master list.
#A new list is built (instead of aliasing id1 and extending it) so id1 is not
#modified and re-running this cell gives the same result.
#NOTE(review): any id count recorded before this change may no longer match.
id_f = id1 + id2 + id3 + id4 + id5


Final Set of id's obtained from bikeindex.

In [40]:
#checking final list of records obtained.
#checking final list of records obtained: count of distinct ids across all pulls
unique_id_count = len(set(id_f))
unique_id_count

59624

This list of IDs is written to a file, which will be used later to get all the attributes pertaining to a specific ID.

In [42]:
#de-duplicate the combined ids and save them as a single 'id' column
id_s = pd.Series(list(set(id_f)), name='id')
id_s.to_csv("bike_id_master.csv",header=True)