In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from bs4 import BeautifulSoup


#Importing the HTML from the website
URL = 'https://www.goodcarbadcar.net/2019-us-vehicle-sales-figures-by-model/'
#A timeout stops the cell from hanging indefinitely if the site does not respond.
page = requests.get(URL, timeout=30)
#Fail loudly on an HTTP error (403/404/5xx) instead of parsing an error page.
page.raise_for_status()
#Naming the parser avoids bs4's "no parser was explicitly specified" warning and
#keeps the parse tree the same on every machine. 'html.parser' ships with Python;
#NOTE(review): if this notebook was developed with lxml installed, 'lxml' is the
#parser bs4 was silently picking before - switch to it if results differ.
soup = BeautifulSoup(page.content, 'html.parser')
#The first <table> on the page is the one used by the rest of the notebook.
table = soup.find_all('table')[0]



In [2]:
#Obtaining the table headers: the stripped text of every <th> cell in the table
headers = []
for header_cell in table.find_all('th'):
    headers.append(header_cell.get_text().strip())

#Walking the <tr> rows of the first <tbody>; each row becomes a list holding the
#stripped text of its <td> cells, and the rows are loaded into a pandas data frame
table_body = table.find_all('tbody')[0]
rows1 = table_body.find_all('tr')
data_rows1 = []
for table_row in rows1:
    cell_texts = [cell.get_text().strip() for cell in table_row.find_all('td')]
    data_rows1.append(cell_texts)
car_sales = pd.DataFrame(data_rows1, columns = headers)


In [3]:
#Keeping only the two columns needed for the analysis and renaming them to make
#them easier to understand. The result has the columns 'Make/Model' and
#'Annual Sales', in that order.
#.rename() returns a new data frame, so the assignment further down does not
#write into a slice of the scraped table (adding columns to the bare
#car_sales[['Model', 'YTD']] slice is what raises SettingWithCopyWarning).
car_sales = car_sales[['Model', 'YTD']].rename(
    columns={'Model': 'Make/Model', 'YTD': 'Annual Sales'}
)

#Capitalizing strings to make joining the data frames easier
car_sales['Make/Model'] = car_sales['Make/Model'].str.upper()

In [4]:
#Splitting the make and model names into two separate columns in the data frame.
#Alfa Romeo and Land Rover have 2 words in their make name, so I will split them differently.
#First word of every make whose name is two words long; add to this tuple if
#another two-word make shows up in the data.
TWO_WORD_MAKE_PREFIXES = ('ALFA', 'LAND')

splitted = car_sales['Make/Model'].str.split()
Makes = []
Models = []
for item in splitted:
    #Number of leading words that belong to the make; the rest is the model.
    make_length = 2 if item[0] in TWO_WORD_MAKE_PREFIXES else 1
    Makes.append(' '.join(item[:make_length]))
    #The model is kept as a list of words here; a later cell joins it back into a string.
    Models.append(item[make_length:])

#Building the new columns on car_sales' own index. A bare pd.Series(list) gets a
#fresh 0..n-1 index and is matched to rows by label, which silently misplaces
#values (or yields NaN) whenever car_sales is not on a default index.
car_sales['Make'] = pd.Series(Makes, index=car_sales.index)
car_sales['Model'] = pd.Series(Models, index=car_sales.index)

#Rearranging the column order
car_sales = car_sales[['Make/Model', 'Make', 'Model', 'Annual Sales']]

In [5]:
#Defining a function to convert a list to a string.
def ListToString(string):
    """Return the items of ``string`` joined into one str, separated by single spaces.

    Despite its name, ``string`` is the iterable of str pieces to join
    (in this notebook, the list of words that make up a model name).
    """
    return ' '.join(string)

#The model names are returned as a list after the split. I will use the above function to convert them back to strings.
new_models = [ListToString(model_words) for model_words in car_sales['Model']]
#Attaching car_sales' own index keeps every joined name on its own row; a bare
#pd.Series(new_models) would be matched by label against a fresh 0..n-1 index.
car_sales['Model'] = pd.Series(new_models, index=car_sales.index)


#Cleaning up a few individual entries that should be one word
#regex=False states that these are literal substrings; the default for `regex`
#differs between pandas versions, so spelling it out keeps the result (and the
#absence of a FutureWarning) the same everywhere.
MODEL_NAME_FIXES = (
    ('AMG GT', 'AMG'),
    ('MX-5 MIATA', 'MX-5'),
    ('PRIUS FAMILY', 'PRIUS'),
)
for long_name, short_name in MODEL_NAME_FIXES:
    car_sales['Model'] = car_sales['Model'].str.replace(long_name, short_name, regex=False)

#I will be analyzing fuel-powered cars, and Tesla makes electric cars. 
#Tesla would be an outlier in my analysis, so I will remove it.
#.copy() makes the filtered frame independent, so adding or changing columns
#later does not raise SettingWithCopyWarning.
car_sales = car_sales[car_sales['Make'] != 'TESLA'].copy()

In [6]:
#Final result: a cleaned data frame of fuel-powered cars, one row per car, with
#columns for the car's make, model, and 2019 annual sales.
#Showing the Lexus rows as a sample of that result.
car_sales.loc[car_sales['Make'].eq('LEXUS')]

Unnamed: 0,Make/Model,Make,Model,Annual Sales
93,LEXUS ES,LEXUS,ES,51336
124,LEXUS GS,LEXUS,GS,3378
127,LEXUS GX,LEXUS,GX,25945
139,LEXUS IS,LEXUS,IS,14920
147,LEXUS LC,LEXUS,LC,1219
150,LEXUS LS,LEXUS,LS,5528
151,LEXUS LX,LEXUS,LX,4718
175,LEXUS NX,LEXUS,NX,58715
212,LEXUS RC,LEXUS,RC,4591
220,LEXUS RX,LEXUS,RX,111036
