In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests import get

### Import functions unique to this project

In [3]:
from master_functions import get_car_urls
from master_functions import make_model_df

## PORSCHE - Create master dataframe for all models

This worksheet is intended to build a dataframe for all **Porsche** cars. There are some 'quirks' in how the data is structured at its source, https://www.fueleconomy.gov/, so more manual steps are taken below: check each model's data for issues, first combine the 'normal' models, and then add in those that required special attention.

**Step #1:** Create unique urls for every car model for the years 1984 - 2021<br>
- Uses `get_car_urls` from `master_functions`. Inputs: (car_make, [list of all models])

In [4]:
# Every Porsche model name to look up, grouped by family for readability.
# NOTE(review): later cells index into porsche_urls by position, which
# presumably follows the order of this list (44 names in, 44 urls out) --
# confirm in master_functions before reordering anything here.
porsche_urls = get_car_urls(
    'Porsche',
    [
        # Numbered models
        '718', '911', '918 Spyder', '924', '928', '944', '968',
        'Boxster',
        # Carrera family
        'Carrera',
        'Carrera 2 911 GT3',
        'Carrera 2 Cabriolet Kit',
        'Carrera 2 Coupe Kit',
        'Carrera 2 S Cabriolet',
        'Carrera 2 S Coupe',
        'Carrera 4',
        'Carrera 4 Cabriolet',
        'Carrera 4 Cabriolet Kit',
        'Carrera 4 Coupe',
        'Carrera 4 S',
        'Carrera 4 S Cabriolet',
        'Carrera 4 S Cabriolet Kit',
        'Carrera 4 S Coupe',
        'Carrera 4 S Kit',
        'Carrera 4 S Targa',
        'Carrera 4 Targa',
        'Carrera GT',
        # Other model lines
        'Cayenne', 'Cayman', 'Macan',
        'New 911 Carrera',
        'Panamera', 'Targa', 'Taycan',
        # Turbo family
        'Turbo',
        'Turbo 2 911 Gt2',
        'Turbo 4 911',
        'Turbo 911 Cab',
        'Turbo 4 911 Cab Kit',
        'Turbo 4 911 Kit',
        'Turbo 4 911 S',
        'Turbo 4 911 Turbo',
        'Turbo 4 911 Cab S',
        'Turbo GT2',
        'Turbo Kit',
    ],
)

-

**Step #2:** Get the length of the list created in Step 1. This is the number of urls you need to check with the function in Step 3.<br>

In [5]:
# Count the urls built in Step 1. Every index from 0 up to
# (this count - 1) needs to be checked in the test cell below.

len(porsche_urls)

44

-

**Step #3:** Check all of the urls you just created.<br>
- If a URL does not work, add it to the 'problem' URLs string in the cell below

In [52]:
# Test area: run make_model_df on one url at a time, [carmake]_urls[index],
# and check that the data appears correctly. Change the index by hand and
# re-run for each url; note any index that does not work as a 'problem' url.

make_model_df('Porsche',porsche_urls[43])

Unnamed: 0,year,make,model,capacity_liters,cylinders,transmission,trans_speed,fuel_type,gg_emissions,mpg
0,2003,Porsche,Turbo Kit,3.6,6,Manual,6,Premium Gasoline,555,16
1,2003,Porsche,Turbo Kit,3.6,6,Automatic,5,Premium Gasoline,555,16


-

Populate this section with any 'problem' URLs found in your test above.

In [53]:
# 'Problem' URLs: entries that did not work in the test cell above.
# They are recorded in a bare string (a note, not executed logic) so they
# can be left out when the 'normal' dataframes are built.
'''
porsche_urls[26]
porsche_urls[30]
porsche_urls[32]
'''

# Print the list length again to
# set the upper bound of the ranges in the next cell
len(porsche_urls)

44

-

**Step #4:** Create dfs for all 'okay' urls and place each into a master list
- Automate where possible, but some may need to be added one by one to avoid 'problem' urls

In [54]:
# Build one dataframe per 'normal' url and collect them in a master list.
# The 'problem' urls found in Step 3 are listed once, explicitly, and skipped
# here (they need special attention and are added separately), instead of
# being encoded implicitly in several hand-written index ranges.
problem_indices = {26, 30, 32}

porsche_dfs = [
    make_model_df('Porsche', url)
    for idx, url in enumerate(porsche_urls)
    if idx not in problem_indices
]
    



-

**Step #5:** Concatenate all of the 'normal' car model dfs into one master dataframe

In [55]:
# Combine the per-model dataframes into one master dataframe with a fresh
# 0..n-1 index. This rebinds porsche_dfs from a list to a DataFrame, so the
# guard keeps a second run of this cell from handing a DataFrame to
# pd.concat, which raises TypeError.
if isinstance(porsche_dfs, list):
    porsche_dfs = pd.concat(porsche_dfs, ignore_index=True)

porsche_dfs

Unnamed: 0,year,make,model,capacity_liters,cylinders,transmission,trans_speed,fuel_type,gg_emissions,mpg
0,2020.0,Porsche,718 Cayman GT4,4.0,6.0,Manual,6,Premium Gasoline,473.0,19.0
1,2020.0,Porsche,718 Spyder,4.0,6.0,Manual,6,Premium Gasoline,473.0,19.0
2,2019.0,Porsche,911 Carrera,3.0,6.0,Automatic,AM-S7,Premium Gasoline,355.0,25.0
3,2018.0,Porsche,911 Carrera,3.0,6.0,Automatic,AM-S7,Premium Gasoline,355.0,25.0
4,2017.0,Porsche,911 Carrera,3.0,6.0,Automatic,AM-S7,Premium Gasoline,355.0,25.0
...,...,...,...,...,...,...,...,...,...,...
210,2007.0,Porsche,Turbo 4 911 Turbo,3.6,6.0,Automatic,5,Premium Gasoline,494.0,18.0
211,2005.0,Porsche,Turbo 4 911 Turbo,3.6,6.0,Automatic,5,Premium Gasoline,555.0,16.0
212,2005.0,Porsche,Turbo 4 911 Turbo,3.6,6.0,Manual,6,Premium Gasoline,555.0,16.0
213,2003.0,Porsche,Turbo Kit,3.6,6.0,Manual,6,Premium Gasoline,555.0,16.0


-

**Step #6:** Pickle the master dataframe made in Step 5 (all car models with 'normal' dataframes)
- Will now be saved so further work on dataframe can start at this place

In [56]:
# Save the combined dataframe so further work can start from this point.
# Create the target folder first: open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError when 'pickles/' is absent.
os.makedirs('pickles', exist_ok=True)
with open('pickles/porsche_dfs.pickle', 'wb') as to_write:
    pickle.dump(porsche_dfs, to_write)

-

**Step #7:** Un-pickle the master dataframe saved in Step 6 (all car models with 'normal' dataframes)

In [57]:
# Reload the master dataframe from disk. This file is written by the
# pickling cell of this notebook; pickle.load should never be pointed at
# a file from an untrusted source.
with open('pickles/porsche_dfs.pickle', mode='rb') as pickle_in:
    porsche_dfs = pickle.load(pickle_in)