# Scrape more from autotk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import pickle
from selenium import webdriver
from bs4 import BeautifulSoup as Soup
from urllib.request import urlopen as u_req
from urllib import error
import urllib
import requests
from IPython.core.debugger import set_trace
from permile import get_soup
%matplotlib inline

In [None]:
# Load the car data pickle. It unpacks into three items; only the first two
# are kept, the third is discarded.
# NOTE: unpickling executes code from the file - only load trusted pickles.
with open('/Users/ahakso/Documents/gitDir/permileFlask/mysite/static/car_data.pkl','rb') as car_data_file:
    car_dict, car_data, _ = pickle.load(car_data_file)

In [None]:
def auto_tk_makepage(url):
    """Scrape a make's overview page into a per-model table.

    Parameters
    ----------
    url : str
        Address of the make page; handed to ``get_soup``.

    Returns
    -------
    (pandas.DataFrame, list) or (None, None)
        The frame has one row per model with the columns ``name``, ``msrp``
        (digit string, thousands separators removed), ``mpg`` and ``seats``
        (floats, NaN where the page lists no value for that model).  The list
        holds the ``href`` of every ``.make__vehicle li a`` anchor.
        ``(None, None)`` is returned when fetching the page fails.
    """
    try:
        soup = get_soup(url)
    except Exception:
        # Deliberately best-effort: a make whose page cannot be fetched is
        # skipped by the caller instead of aborting the whole scrape.
        print("{} doesn't exist, returning None".format(url))
        return None, None

    # The last <h2> is dropped - presumably it is not a model heading
    # (TODO confirm against the page layout).
    names = [heading.text for heading in soup.find_all('h2')][:-1]
    links = [anchor['href'] for anchor in soup.select('.make__vehicle li a')]

    # Flat list of spec entries for all models on the page.
    spec_texts = [item.text for item in soup.select('ul.make__vehicle-specs li')]

    def positions(prefix):
        # Indices of the spec entries whose text starts with `prefix`.
        return np.array(
            [i for i, text in enumerate(spec_texts) if text.startswith(prefix)],
            dtype=int,
        )

    idx_msrp = positions('MSRP')
    idx_mpg = positions('MPG')
    idx_seats = positions('Seating Capacity')

    # Each MSRP entry opens a new model, so a spec entry belongs to the model
    # of the last MSRP entry that precedes it.
    mpg_rows = [np.nonzero(i > idx_msrp)[0][-1] for i in idx_mpg]
    seat_rows = [np.nonzero(i > idx_msrp)[0][-1] for i in idx_seats]

    msrp = [
        re.search(r'(?<=MSRP \$)[\d,]*', spec_texts[i]).group(0).replace(',', '')
        for i in idx_msrp
    ]

    def combined_mpg(text):
        # Blend the first two numbers 55/45 - presumably city/highway
        # (TODO confirm which figure comes first on the page).
        first, second = re.findall(r'\d{1,2}', text)[:2]
        return float(first) * .55 + float(second) * .45

    mpg = [combined_mpg(spec_texts[i]) for i in idx_mpg]
    seats = [float(re.search(r'\d{1,2}', spec_texts[i]).group(0)) for i in idx_seats]

    dftk = pd.DataFrame({'name': names, 'msrp': msrp})

    # Fill plain arrays first and attach them as whole columns; assigning
    # through `dftk.mpg.iloc[...]` is chained indexing and does not reliably
    # write back into the frame.
    mpg_col = np.full(len(dftk), np.nan)
    mpg_col[np.asarray(mpg_rows, dtype=int)] = mpg
    seats_col = np.full(len(dftk), np.nan)
    seats_col[np.asarray(seat_rows, dtype=int)] = seats
    dftk['mpg'] = mpg_col
    dftk['seats'] = seats_col
    return dftk, links

In [None]:
def _first_cell_number(cells, prefix, pattern):
    """Return the float matched by `pattern` in the first cell starting with `prefix`, else NaN."""
    for text in cells:
        if not text.startswith(prefix):
            continue
        found = re.search(pattern, text)
        if found is None:
            return np.nan
        try:
            return float(found.group(0))
        except ValueError:
            # The pattern matched an empty or otherwise non-numeric string.
            return np.nan
    return np.nan


def auto_tk_modelpage(url):
    """Scrape the 0-60 time and curb weight from a model page.

    Parameters
    ----------
    url : str
        Address of the model page; handed to ``get_soup``.

    Returns
    -------
    (float, float)
        ``(zero60, weight)``.  Either value is NaN when the page has no
        matching table cell or no number can be parsed from it.  Errors
        raised by ``get_soup`` are not caught here.
    """
    print('Scraping {} ...'.format(url))
    soup = get_soup(url)
    cells = [cell.text for cell in soup.find_all('td')]

    # 0-60: the number directly following "mph " in the cell starting with "0-60".
    zero60 = _first_cell_number(cells, '0-60', r'(?<=mph )[\d\.]*')
    # Curb weight: first run of digits in the cell starting with "Curb".
    # NOTE(review): the digit run stops at the first non-digit, so a value
    # written with a thousands separator ("3,485") would parse as 3.0 -
    # confirm the site's number format.
    weight = _first_cell_number(cells, 'Curb', r'\d{1,}')
    return zero60, weight

In [None]:
def split_name(df_):
    """Add ``make`` and ``model`` columns derived from the ``name`` column.

    The name is split at its first space: the part before it becomes the
    make, the stripped remainder becomes the model.  A name without a space
    gets an empty model.  The frame is modified in place and also returned.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a string ``name`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df_`` object, with the two columns added.
    """
    # Plain str.partition is used instead of Series.str.split(' ', 1): the
    # positional `n` argument is rejected by newer pandas, and zipping the
    # split lists silently truncates when a name has no space.
    make_col = []
    model_col = []
    for full_name in df_['name']:
        make_part, _, model_part = full_name.partition(' ')
        make_col.append(make_part)
        model_col.append(model_part.strip())
    df_['make'] = make_col
    df_['model'] = model_col
    return df_

In [None]:
# Get information off the main makes page
makes = list(car_dict.keys())
autotk = dict()
for make in makes:
    url = 'http://autotk.com/{}/'.format(make.lower())
    dftk, links = auto_tk_makepage(url)
    if dftk is None:
        # The make page could not be scraped: this make gets no entry in
        # autotk, so autotk may hold fewer makes than `makes`.
        continue
    autotk[make] = [dftk,links]
    print('{} completed'.format(make))

# Reduce link lists to the unique model links, not including years:
# keep the positions of links shaped like "/<part>/<part>/", then every
# second hit (presumably each model link appears twice - TODO confirm).
# The list is ordered like the keys of autotk.
model_link_re = re.compile(r'^\/[^\/]*\/[^\/]*\/$')
unique_links = [
    np.nonzero([bool(model_link_re.search(x)) for x in autotk[y][1]])[0][0::2]
    for y in autotk
]

# Add the link column and the make/model columns. Iterate over the keys of
# autotk rather than indexing `makes`: a skipped make would shift every
# later index and is missing from autotk altogether.
for scraped_make, link_positions in zip(autotk, unique_links):
    make_frame, make_links = autotk[scraped_make]
    make_frame.loc[:,'link'] = [make_links[x] for x in link_positions]
    autotk[scraped_make][0] = split_name(make_frame)



In [None]:
# Visit every model page and record curb weight and 0-60 time.
# Iterate over the makes actually present in autotk (not over `makes` by
# position): makes whose page failed to scrape have no entry, so positional
# indexing into `makes` would address the wrong or a missing make.
for scraped_make in autotk:
    model_frame = autotk[scraped_make][0]
    model_frame.loc[:,'weight'] = np.nan
    model_frame.loc[:,'zero60'] = np.nan
    # Snapshot the (row label, link) pairs before writing into the frame.
    for row_label, model_link in list(model_frame['link'].items()):
        url = 'http://autotk.com{}'.format(model_link)
        zero60, weight = auto_tk_modelpage(url)
        model_frame.loc[row_label,'weight'] = weight
        model_frame.loc[row_label,'zero60'] = zero60

In [None]:
# Back up the scraped data before further in-place edits. dict.copy() is
# shallow and would share the very same DataFrames and link lists, so each
# entry's frame and list are copied individually.
autotk_bak = {
    make_name: [entry[0].copy(), list(entry[1])]
    for make_name, entry in autotk.items()
}

In [None]:
# Add a constant 'Make' column (capital M) to the Acura table only.
# NOTE(review): split_name writes a lower-case 'make' column, and no other
# make's table is given 'Make' here - confirm this column is still wanted.
autotk['Acura'][0]['Make'] = 'Acura'

In [None]:
# Stack the per-make tables into one frame, in the order they were scraped.
# Built from the keys of autotk rather than from `makes`, because a make
# whose page could not be scraped has no entry in autotk; a single concat
# also avoids re-copying the growing frame once per make.
autotk_final = pd.concat([entry[0] for entry in autotk.values()], sort=False)

In [None]:
# Save the per-make dictionary to disk.
with open('/Users/ahakso/Documents/gitDir/permileFlask/mysite/static/autotk.pkl','wb') as autotk_out:
    pickle.dump(autotk,autotk_out)

In [None]:
# Save the combined table to disk.
with open('/Users/ahakso/Documents/gitDir/permileFlask/mysite/static/autotk_final.pkl','wb') as final_out:
    pickle.dump(autotk_final,final_out)

In [None]:
# Restore the per-make dictionary from disk.
# NOTE: unpickling executes code from the file - only load trusted pickles.
with open('/Users/ahakso/Documents/gitDir/permileFlask/mysite/static/autotk.pkl','rb') as autotk_in:
    autotk = pickle.load(autotk_in)

In [None]:
# Restore the combined table from disk.
# NOTE: unpickling executes code from the file - only load trusted pickles.
with open('/Users/ahakso/Documents/gitDir/permileFlask/mysite/static/autotk_final.pkl','rb') as final_in:
    autotk_final = pickle.load(final_in)