A framework for web-scraping data on how long car models stay on the market, and for plotting their survival curves (to be implemented in a Dash web app)

In [75]:
import requests
import time
import pandas as pd
pd.set_option('display.max_rows', 500)
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

In [76]:
# Car makes sold for US audiences since 1999, one lowercase hyphenated
# identifier per make, kept in alphabetical order.
makes = [
    'acura', 'alfa-romeo', 'am-general', 'aston-martin', 'audi',
    'bentley', 'bmw', 'bugatti', 'buick',
    'cadillac', 'chevrolet', 'chrysler',
    'daewoo', 'dodge',
    'ferrari', 'fiat', 'fisker', 'ford',
    'genesis', 'gmc',
    'honda', 'hummer', 'hyundai',
    'infiniti', 'isuzu',
    'jaguar', 'jeep',
    'kia',
    'lamborghini', 'land-rover', 'lexus', 'lincoln', 'lotus',
    'maserati', 'maybach', 'mazda', 'mclaren', 'mercedes-benz', 'mercury',
    'mini', 'mitsubishi',
    'nissan',
    'oldsmobile',
    'panoz', 'plymouth', 'pontiac', 'porsche',
    'ram', 'rolls-royce',
    'saab', 'saleen', 'saturn', 'scion', 'smart', 'spyker', 'subaru', 'suzuki',
    'tesla', 'toyota',
    'volkswagen', 'volvo',
]


In [77]:
# Accumulator for the scraped hrefs; the scraping loop appends to it.
# Re-running this cell resets it to an empty list.
items = []

In [78]:
# For each make, scrape the links that describe the cars offered each year, 1999 to 2020.
# Every non-trivial href found in the first "well" div of a make/year page is appended to `items`.

for make in makes:
    for year in range(1999, 2021):
        URL = 'https://www.autobytel.com/'+make+'/'+str(year)+'/'
        # A timeout keeps one stalled connection from hanging the whole scrape.
        page = requests.get(URL, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        main = soup.find_all('div', class_ = "well")
        if not main:
            # No "well" div on this page, so there is nothing to index into. Presumably the
            # make was not sold that year (TODO confirm against the site). Report and move on
            # instead of raising IndexError on main[0].
            print(make, year, 'no listing found')
            time.sleep(1)
            continue
        for a in main[0].find_all('a', href=True):
            # single-character hrefs are skipped
            if len(a['href']) > 1:
                items.append(a['href'])
        print(make, year, 'complete')
        # pause between requests
        time.sleep(1)

acura 1999 complete
acura 2000 complete
acura 2001 complete


KeyboardInterrupt: 

In [79]:
# Build a one-column dataframe ('string') holding the raw scraped hrefs.
# Naming the column in the constructor also works when `items` is empty; assigning
# `df.columns` afterwards raises a length-mismatch error on a frame with zero columns.
df = pd.DataFrame(items, columns=['string'])

In [80]:
# Parse each scraped href into the make, model and year of a car offered in a given year.
# The segments at positions 1, 2 and 3 of the '/'-split path are taken as make, model and
# year; presumably hrefs look like '/<make>/<model>/<year>/...' (TODO confirm against the site).
parts = df['string'].dropna().str.lower().str.split('/')

# `.str[i]` yields NaN when a path has too few segments, where plain list indexing would
# raise IndexError. Such rows are then discarded by dropna().
df = pd.DataFrame({
    'make': parts.str[1],
    'model': parts.str[2],
    'year': parts.str[3],
})

# one row per distinct (make, model, year)
df = df.dropna().drop_duplicates()

In [81]:
# Build a dataframe of the life-span of each car: one row per (make, model) with the
# smallest and largest 'year' value seen for it.
# Named aggregation pins the output columns to year_amin / year_amax, which later cells
# read. Deriving the names from np.min / np.max depends on those functions' __name__,
# which is not the same in every NumPy version.
df1 = (
    df.groupby(['make', 'model'])
      .agg(year_amin=('year', 'min'), year_amax=('year', 'max'))
      .reset_index()
)

In [82]:
# cast the first-year and last-year columns to integers in a single pass
df1 = df1.astype({'year_amax': int, 'year_amin': int})

In [83]:
# Inputs for the survival analysis:
#   duration - last year minus first year the car appears
#   death    - 1 when the car was discontinued before the present year (last year < 2020), else 0
df1['duration'] = df1['year_amax'].sub(df1['year_amin'])
df1['death'] = df1['year_amax'].lt(2020).astype('int64')

In [84]:
# Fit a Kaplan-Meier survival curve to the car durations, using the 'death' column
# as the event-observed indicator.
kmf = KaplanMeierFitter()
kmf.fit(df1['duration'], event_observed=df1['death'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 8 total observations, 0 right-censored observations>

In [85]:
# show the fitted estimator's event table (removed / observed / censored / entrance / at_risk per event time)
kmf.event_table

Unnamed: 0_level_0,removed,observed,censored,entrance,at_risk
event_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2,2,0,8,8
2,6,6,0,0,6


In [86]:
# survival estimate at each whole-year duration from 0 through 21, as a one-column dataframe
df2 = kmf.predict(range(22)).to_frame()

In [87]:
# display the survival estimates by year
df2

Unnamed: 0,KM_estimate
0,0.75
1,0.75
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
