In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from scipy import stats
import streamlit as st
import plotly.express as px

from IPython.display import display

In [2]:
# Read the raw vehicle listings from the CSV file one directory above the notebook.
df_og = pd.read_csv(filepath_or_buffer='../vehicles_us.csv')


In [3]:
# Preview the raw DataFrame as loaded, before any cleaning.
# The rendered output is abbreviated (note the "..." row) rather than listing every row.
display(df_og)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71


In [4]:
# Summarize the raw DataFrame: column names, non-null counts, dtypes and memory usage.
# The non-null counts reveal which columns contain missing values.
df_og.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [5]:
# Count fully duplicated rows two ways:
#   - default keep='first': repeats after the first occurrence
#   - keep=False: every row that belongs to a duplicate group
repeat_rows = df_og.duplicated().sum()
rows_in_duplicate_groups = df_og.duplicated(keep=False).sum()

# Print both counts separated by one blank line
print(repeat_rows, rows_in_duplicate_groups, sep='\n\n')


0

0


There are no duplicate rows in this dataset.

In [6]:
# Count the missing (NaN) entries in each column of the raw DataFrame.
missing_per_column = df_og.isna().sum()
print(missing_per_column)

price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64


Several columns in this dataset contain missing values.  We could either remove the rows with null values right away, or first take a deeper look at the parts of the data that have no null values.  Let's choose the latter for now.  

The model column has zero missing values, but it actually contains both the make and the model of the vehicle.  We can use the `str.split()` method to separate the two components.

In [7]:
# Fork the original DataFrame into an independent copy that will be modified.
# A plain assignment (df_a1 = df_og) only creates a second name for the same
# object, so the column added below would also be written into df_og.
df_a1 = df_og.copy()

# Keep the combined "make model" string under its own column name
df_a1['make_n_model'] = df_a1['model']

# Remove the original combined 'model' column; a model-only column is rebuilt below
df_a1 = df_a1.drop(columns=['model'])

# Split on the first space only (n=1): column 0 holds the first word (the make),
# column 1 holds everything after it (the model)
new = df_a1['make_n_model'].str.split(" ", n=1, expand=True)

# Separate make column taken from the split result
df_a1['make'] = new[0]

# Separate model column taken from the split result
df_a1['model'] = new[1]

# Combine make and vehicle type into a single label, e.g. "ford pickup".
# Vectorized string concatenation replaces the row-wise .agg(' '.join, axis=1);
# the result is the same here because neither column has missing values.
df_a1['make_n_type'] = df_a1['make'] + ' ' + df_a1['type']

In [8]:
# Preview the DataFrame after splitting 'model' into separate make / model columns.
display(df_a1)

Unnamed: 0,price,model_year,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make_n_model,make,model,make_n_type
0,9400,2011.0,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19,bmw x5,bmw,x5,bmw SUV
1,25500,,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50,ford f-150,ford,f-150,ford pickup
2,5500,2013.0,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79,hyundai sonata,hyundai,sonata,hyundai sedan
3,1500,2003.0,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9,ford f-150,ford,f-150,ford pickup
4,14900,2017.0,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28,chrysler 200,chrysler,200,chrysler sedan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37,nissan maxima,nissan,maxima,nissan sedan
51521,2700,2002.0,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22,honda civic,honda,civic,honda sedan
51522,3950,2009.0,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32,hyundai sonata,hyundai,sonata,hyundai sedan
51523,7455,2013.0,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71,toyota corolla,toyota,corolla,toyota sedan


In [9]:
# Confirm the new columns (make_n_model, make, model, make_n_type) exist and check their non-null counts.
df_a1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   condition     51525 non-null  object 
 3   cylinders     46265 non-null  float64
 4   fuel          51525 non-null  object 
 5   odometer      43633 non-null  float64
 6   transmission  51525 non-null  object 
 7   type          51525 non-null  object 
 8   paint_color   42258 non-null  object 
 9   is_4wd        25572 non-null  float64
 10  date_posted   51525 non-null  object 
 11  days_listed   51525 non-null  int64  
 12  make_n_model  51525 non-null  object 
 13  make          51525 non-null  object 
 14  model         51525 non-null  object 
 15  make_n_type   51525 non-null  object 
dtypes: float64(4), int64(2), object(10)
memory usage: 6.3+ MB


There is now both a make column and a model column.

In [10]:
# Highest listed price, to gauge how far the upper tail of the price distribution extends.
df_a1.loc[:, 'price'].max()

375000

In [11]:
# Histogram of all listed prices (no price filtering applied yet)
fig1 = px.histogram(
    data_frame=df_a1,
    x='price',
    title="Distribution of Used Vehicle Prices",
)

# Render the plotly express histogram inline
fig1.show()

# For the Render dashboard, 'app.py' shows this same figure with:
# st.plotly_chart(fig1)

In [12]:
# Keep only listings priced strictly below 100,000.
is_under_price_cap = df_a1['price'].lt(100000)
df_a2 = df_a1[is_under_price_cap]

In [13]:
# Histogram of prices for the listings kept in df_a2, grouped into at most 20 bins
fig2 = px.histogram(
    data_frame=df_a2,
    x='price',
    nbins=20,
    title="Distribution of Vehicle Prices",
)

# Render the plotly express histogram inline
fig2.show()

# For the Render dashboard, 'app.py' shows this same figure with:
# st.plotly_chart(fig2)

In [14]:
# Build the cleaned frame in one chain:
#   1. discard the 'is_4wd' column
#   2. keep only the rows that have no null value in any remaining column
df_b1 = (
    df_a2
    .drop(columns=['is_4wd'])
    .dropna(axis='index')
)

In [15]:
# Store model year as an integer instead of a float (nulls were already dropped)
df_b1 = df_b1.astype({'model_year': int})


In [16]:
# Store cylinder count as an integer instead of a float (nulls were already dropped)
df_b1 = df_b1.astype({'cylinders': int})


In [17]:

# Parse the posting date strings into datetime values
df_b1 = df_b1.assign(date_posted=pd.to_datetime(df_b1['date_posted']))

In [18]:
# Preview the cleaned DataFrame: 'is_4wd' removed, rows with nulls dropped, dtypes converted.
display(df_b1)

Unnamed: 0,price,model_year,condition,cylinders,fuel,odometer,transmission,type,paint_color,date_posted,days_listed,make_n_model,make,model,make_n_type
2,5500,2013,like new,4,gas,110000.0,automatic,sedan,red,2019-02-07,79,hyundai sonata,hyundai,sonata,hyundai sedan
4,14900,2017,excellent,4,gas,80903.0,automatic,sedan,black,2019-04-02,28,chrysler 200,chrysler,200,chrysler sedan
5,14990,2014,excellent,6,gas,57954.0,automatic,sedan,black,2018-06-20,15,chrysler 300,chrysler,300,chrysler sedan
6,12990,2015,excellent,4,gas,79212.0,automatic,sedan,white,2018-12-27,73,toyota camry,toyota,camry,toyota sedan
7,15990,2013,excellent,6,gas,109473.0,automatic,SUV,black,2019-01-07,68,honda pilot,honda,pilot,honda SUV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51518,3750,2005,excellent,6,gas,110200.0,automatic,sedan,silver,2018-08-10,63,ford taurus,ford,taurus,ford sedan
51520,9249,2013,like new,6,gas,88136.0,automatic,sedan,black,2018-10-03,37,nissan maxima,nissan,maxima,nissan sedan
51521,2700,2002,salvage,4,gas,181500.0,automatic,sedan,white,2018-11-14,22,honda civic,honda,civic,honda sedan
51522,3950,2009,excellent,4,gas,128000.0,automatic,sedan,blue,2018-11-15,32,hyundai sonata,hyundai,sonata,hyundai sedan


In [19]:
# Verify that no nulls remain and that the dtype conversions above took effect.
df_b1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29906 entries, 2 to 51523
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         29906 non-null  int64         
 1   model_year    29906 non-null  int64         
 2   condition     29906 non-null  object        
 3   cylinders     29906 non-null  int64         
 4   fuel          29906 non-null  object        
 5   odometer      29906 non-null  float64       
 6   transmission  29906 non-null  object        
 7   type          29906 non-null  object        
 8   paint_color   29906 non-null  object        
 9   date_posted   29906 non-null  datetime64[ns]
 10  days_listed   29906 non-null  int64         
 11  make_n_model  29906 non-null  object        
 12  make          29906 non-null  object        
 13  model         29906 non-null  object        
 14  make_n_type   29906 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(9)

In [20]:
# List the distinct vehicle conditions in alphabetical order
condition_labels = df_b1['condition'].unique()
print(sorted(condition_labels))

['excellent', 'fair', 'good', 'like new', 'new', 'salvage']


In [22]:
# Create scatterplot of price against odometer reading, colored by vehicle type.
# Each point is one listing taken directly from df_b1 (nothing is averaged),
# so the title describes a price-vs-odometer relationship rather than an average.
fig3 = px.scatter(df_b1, x='odometer', y='price', title='Price vs. Odometer Reading', color='type')

# Show figure of plotly express scatterplot
fig3.show()