In [2]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pickle

### Import functions unique to this project

In [3]:
from master_functions import get_car_urls
from master_functions import make_model_df

## VOLVO - Create master dataframe for all models

This worksheet is intended to build a datafame for all **Volvo** cars.  There are some 'quirks' in how the data is structured from its source on https://www.fueleconomy.gov/, so more manual steps are taken below to check files for issues, combine first what is 'normal' and then add in those that required special attention.

**Step #1:** Create unique urls for every car model for the years 1984 - 2021<br>
- Uses `get_car_urls` from master function, Inputs: (car_make, [list of all models])

In [4]:
volvo_urls = get_car_urls('Volvo',
                          ['240','740/760','780',
                           '850','940','960',
                           '960 Wagon/V90','960/S90',
                           'C30','C70','Coupe','S40',
                           'S60','S70','S80','S90',
                           'V40','V50','V60','V70',
                           'V70','V70 XC','V90','XC40',
                           'XC60','XC70','SC90'
                          ])

-

**Step #2:** Get length of list created in Step 1.  This number will be how many times you run the function in Step 3 to check all of the urls<br>

In [5]:
# Verify number of urls and use this number
# to know how many urls need to 'check below'

len(volvo_urls)

27

-

**Step #3:** Check all of the urls you just created.<br>
- If does not work, add to 'problem' URLs string below this cell

In [34]:
# Test area for each url with [carmake]_urls[index]
# by seeing if data appears correctly

make_model_df('Volvo',volvo_urls[0])

Unnamed: 0,year,make,model,capacity_liters,cylinders,transmission,trans_speed,fuel_type,gg_emissions,mpg
0,1984,Volvo,240 DL/GL/Turbo,2.4,6,Manual,5,Diesel,392,26
1,1984,Volvo,240 DL/GL/Turbo Wagon,2.4,6,Manual,5,Diesel,392,26
2,1984,Volvo,240 DL/GL/Turbo,2.4,6,Automatic,3,Diesel,424,24
3,1984,Volvo,240 DL/GL/Turbo Wagon,2.4,6,Automatic,3,Diesel,424,24
4,1984,Volvo,240 DL/GL/Turbo,2.3,4,Manual,5,Regular Gasoline,386,23
5,1984,Volvo,240 DL/GL/Turbo,2.3,4,Manual,5,Regular Gasoline,386,23
6,1986,Volvo,240 DL/240 GL,2.3,4,Manual,4,Regular Gasoline,404,22
7,1986,Volvo,240 DL/240 GL Wagon,2.3,4,Manual,4,Regular Gasoline,404,22
8,1985,Volvo,240 DL/GL/turbo,2.3,4,Manual,4,Regular Gasoline,404,22
9,1984,Volvo,240 DL/GL/Turbo,2.3,4,Manual,5,Regular Gasoline,404,22


-

Populate this section so if there are any 'problem' URLs from your test above

In [35]:
#'Problem' URLs
'''
volvo_urls[12]
volvo_urls[15]
volvo_urls[18]
volvo_urls[24]
'''

#Print list length again to 
#set length of range in next cell
len(volvo_urls)

27

-

**Step #4:** Create dfs for all 'okay' urls and place each into a master list
- Automate where possible, but some may need to be added one by one to avoid 'problem' urls

In [37]:
# for 'normal' urls to make a df and add to master df list, automate it!

volvo_dfs = []

for x in range(0,12):
    volvo_dfs.append(make_model_df('Volvo',volvo_urls[x]))

for x in range(13,15):
    volvo_dfs.append(make_model_df('Volvo',volvo_urls[x]))

for x in range(16,18):
    volvo_dfs.append(make_model_df('Volvo',volvo_urls[x]))

for x in range(19,24):
    volvo_dfs.append(make_model_df('Volvo',volvo_urls[x]))

for x in range(25,27):
    volvo_dfs.append(make_model_df('Volvo',volvo_urls[x]))


-

**Step #5:** Concatenate all of the 'normal' car model dfs into one master dataframe

In [38]:
volvo_dfs = pd.concat(volvo_dfs, ignore_index=True)

volvo_dfs

Unnamed: 0,year,make,model,capacity_liters,cylinders,transmission,trans_speed,fuel_type,gg_emissions,mpg
0,1984.0,Volvo,240 DL/GL/Turbo,2.4,6.0,Manual,5,Diesel,392.0,26.0
1,1984.0,Volvo,240 DL/GL/Turbo Wagon,2.4,6.0,Manual,5,Diesel,392.0,26.0
2,1984.0,Volvo,240 DL/GL/Turbo,2.4,6.0,Automatic,3,Diesel,424.0,24.0
3,1984.0,Volvo,240 DL/GL/Turbo Wagon,2.4,6.0,Automatic,3,Diesel,424.0,24.0
4,1984.0,Volvo,240 DL/GL/Turbo,2.3,4.0,Manual,5,Regular Gasoline,386.0,23.0
...,...,...,...,...,...,...,...,...,...,...
165,2014.0,Volvo,XC70 FWD,3.2,6.0,Automatic,S6,Regular Gasoline,416.0,21.0
166,2015.0,Volvo,XC70 AWD,3.2,6.0,Automatic,S6,Regular Gasoline,420.0,21.0
167,2014.0,Volvo,XC70 AWD,3.2,6.0,Automatic,S6,Regular Gasoline,420.0,21.0
168,2011.0,Volvo,XC70 FWD,3.2,6.0,Automatic,S6,Regular Gasoline,423.0,21.0


-

**Step #6:** Pickle the dataframe made in Step 6 of all car's models with 'normal' dataframes
- Will now be saved so further work on dataframe can start at this place

In [39]:
with open('pickles/volvo_dfs.pickle', 'wb') as to_write:
    pickle.dump(volvo_dfs, to_write)

-

**Step #7:** Un-pickle the dataframe made in Step 7 of all car's models with 'normal' dataframes

In [40]:
with open('pickles/volvo_dfs.pickle','rb') as read_file:
    volvo_dfs = pickle.load(read_file)