In [1]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pickle

### Import functions unique to this project

In [2]:
from master_functions import get_car_urls
from master_functions import make_model_df

## AUDI - Create master dataframe for all Audi models

This worksheet builds a dataframe for all **Audi** cars.  There are some 'quirks' in how the data is structured at its source, https://www.fueleconomy.gov/, so several manual steps are taken below: each model's data is checked for issues, the 'normal' models are combined first, and the models that require special attention are added afterwards.

**Step #1:** Create unique urls for every Audi model for the years 1984 - 2021<br>
- Uses `get_car_urls` from `master_functions`. Inputs: (car_make, [list of all models])

In [3]:
# Build the list of URLs for the Audi models named below (one entry per model).
# The position of each model in this list is the index used as audi_urls[index]
# in the cells that follow, so do not reorder or insert entries without
# updating those indices as well.
audi_urls = get_car_urls('Audi',
                         ['100','200','4000','5000','80/90','A3',
                          'A4','A5','A6','A7','A8','Allroad',
                          'Cabriolet','Coupe GT',
                          'Coupe quattro',  # spelling corrected from 'Coupe quatrro'
                          'e-tron','Q3','Q5','Q7','Q8','Quattro',
                          'R8','R8 Spyder','RS','RS 3','RS 4',
                          'RS 5','RS 6','RS 7','RS Q8','S3','S4',
                          'S5','S6','S7','S8','SQ5','SQ7','SQ8',
                          'TT','TT Coupe','TT Roadster','TT RS',
                          'TTS','TTS Coupe','V8'
                         ])

-

**Step #2:** Get the length of the list created in Step 1.  This is the number of URLs that have to be checked, one at a time, with the function in Step 3.<br>

In [4]:
# Count the generated URLs. Every index from 0 to len(audi_urls) - 1
# has to be checked by hand in Step 3 below.

len(audi_urls)

46

-

**Step #3:** Check all of the urls you just created.<br>
- If it works, add its index to the `audi_okay_modelranks` list in Step 4
- If it does not work, add it to the 'Problem' URLs string in Step 4

In [5]:
# Manual test area: change the index in audi_urls[index] and re-run this cell
# once per URL, checking that the resulting dataframe looks correct.
# (Index 45, shown here, is the last entry in the model list: 'V8'.)

make_model_df('Audi',audi_urls[45])

Unnamed: 0,year,make,model,capacity_liters,cylinders,transmission,trans_speed,fuel_type,gg_emissions,mpg
0,1991,Audi,V8,3.6,8,Automatic,4,Regular Gasoline,592,15
1,1991,Audi,V8,3.6,8,Manual,5,Regular Gasoline,592,15
2,1990,Audi,V8,3.6,8,Manual,5,Regular Gasoline,592,15
3,1994,Audi,V8,4.2,8,Automatic,4,Premium Gasoline,592,15
4,1993,Audi,V8,4.2,8,Automatic,4,Premium Gasoline,592,15
5,1992,Audi,V8,4.2,8,Automatic,4,Premium Gasoline,592,15
6,1990,Audi,V8,3.6,8,Automatic,4,Regular Gasoline,635,14


-

**Step #4:** From Step 3 above you should have populated this section so URLs either are in the category of "problem" or "normal"

In [6]:
#'Problem' URLs
'''
audi_urls[5]
audi_urls[10]
audi_urls[11]
audi_urls[14]
audi_urls[15]
audi_urls[17]
'''

#'Normal' URLs
# Every index from 0 through 45 except the six 'problem' indices noted above.
audi_okay_modelranks = [
    rank for rank in range(46)
    if rank not in {5, 10, 11, 14, 15, 17}
]

-

**Step #5:** Create dfs for all 'okay' urls and place each into a master list
- Automate where possible, but some may need to be added one by one to avoid 'problem' urls

In [7]:
# Build one dataframe per URL, one statement at a time, for indices 0-17.
# Indices flagged as 'problem' URLs in Step 4 are kept as commented-out lines
# so that the gaps in the numbering are visible.
audi_0 = make_model_df('Audi',audi_urls[0])
audi_1 = make_model_df('Audi',audi_urls[1])
audi_2 = make_model_df('Audi',audi_urls[2])
audi_3 = make_model_df('Audi',audi_urls[3])
audi_4 = make_model_df('Audi',audi_urls[4])
# audi_5 = make_model_df('Audi',audi_urls[5]) # problem url left out
audi_6 = make_model_df('Audi',audi_urls[6])
audi_7 = make_model_df('Audi',audi_urls[7])
audi_8 = make_model_df('Audi',audi_urls[8])
audi_9 = make_model_df('Audi',audi_urls[9])
# audi_10 = make_model_df('Audi',audi_urls[10]) # problem url left out
# audi_11 = make_model_df('Audi',audi_urls[11]) # problem url left out
audi_12 = make_model_df('Audi',audi_urls[12])
audi_13 = make_model_df('Audi',audi_urls[13])
# NOTE(review): index 14 is listed under 'Problem' URLs in Step 4 and is not in
# audi_okay_modelranks, yet it is built here and used below - confirm which
# of the two is intended.
audi_14 = make_model_df('Audi',audi_urls[14])
# audi_15 = make_model_df('Audi',audi_urls[15]) # problem url left out
audi_16 = make_model_df('Audi',audi_urls[16])
# audi_17 = make_model_df('Audi',audi_urls[17]) # problem url left out



In [8]:
# Master list of the dataframes that were built one by one above.
audi_dfs = [
    audi_0, audi_1, audi_2, audi_3, audi_4,
    audi_6, audi_7, audi_8, audi_9,
    audi_12, audi_13, audi_14, audi_16,
]

# Number of dataframes collected so far.
len(audi_dfs)

13

In [9]:
# Indices 18 through 45 are all 'normal' URLs, so build their dataframes
# in a loop and add each one to the master list.
for model_rank in range(18, 46):
    model_df = make_model_df('Audi', audi_urls[model_rank])
    audi_dfs.append(model_df)

-

**Step #6:** Concatenate all of the 'normal' Audi model dfs into one master dataframe

In [10]:
# Combine the per-model dataframes into one master dataframe with a fresh
# 0..n-1 index. The name audi_dfs is rebound here from a list of dataframes to
# a single DataFrame; the guard keeps the cell safe to re-run, because passing
# an already-concatenated DataFrame to pd.concat raises a TypeError.
if not isinstance(audi_dfs, pd.DataFrame):
    audi_dfs = pd.concat(audi_dfs, ignore_index=True)

audi_dfs

Unnamed: 0,year,make,model,capacity_liters,cylinders,transmission,trans_speed,fuel_type,gg_emissions,mpg
0,1993.0,Audi,100,2.8,6.0,Manual,5,Premium Gasoline,468.0,19.0
1,1992.0,Audi,100,2.8,6.0,Manual,5,Premium Gasoline,468.0,19.0
2,1994.0,Audi,100,2.8,6.0,Manual,5,Premium Gasoline,468.0,19.0
3,1993.0,Audi,100,2.8,6.0,Automatic,4,Premium Gasoline,468.0,19.0
4,1992.0,Audi,100,2.8,6.0,Automatic,4,Premium Gasoline,468.0,19.0
...,...,...,...,...,...,...,...,...,...,...
269,1990.0,Audi,V8,3.6,8.0,Manual,5,Regular Gasoline,592.0,15.0
270,1994.0,Audi,V8,4.2,8.0,Automatic,4,Premium Gasoline,592.0,15.0
271,1993.0,Audi,V8,4.2,8.0,Automatic,4,Premium Gasoline,592.0,15.0
272,1992.0,Audi,V8,4.2,8.0,Automatic,4,Premium Gasoline,592.0,15.0


-

**Step #7:** Pickle the dataframe made in Step 6 of all Audi models with 'normal' dataframes
- Will now be saved so further work on dataframe can start at this place

In [11]:
# Save the combined dataframe to disk so later work can resume from here.
with open('pickles/audi_dfs.pickle', mode='wb') as pickle_out:
    pickle.dump(audi_dfs, pickle_out)

-

**Step #8:** Un-pickle the dataframe made in Step 7 of all Audi models with 'normal' dataframes

In [12]:
# Reload the saved dataframe into audi_dfs.
# pickle.load can execute arbitrary code, so only load files you created.
with open('pickles/audi_dfs.pickle', mode='rb') as pickle_in:
    audi_dfs = pickle.load(pickle_in)