In [1]:
import pandas as pd
import streamlit as st
import plotly.express as px

## Project requirements

1. project structure
    - README.md
    - app.py
    - <name_of_your_dataset>.csv
    - notebooks
         - EDA.ipynb
    - .streamlit
    - config.toml 

2. web accessibility through the browser
    - use Streamlit and Render
    
3. contains at least one of each of the following:
    - at least one header with text (st.header)
    - at least one histogram
    - at least one scatterplot

In [2]:
data =  pd.read_csv('vehicles_us.csv')
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


## Data Cleanup - missing values, duplicates, change type
1. Convert missing values for cylinders to the median of all non-missing cylinder values
2. Convert missing values for odometer to the mean of all non-missing odometer values
3. Convert missing 'is_4wd' (either 0 - no, or 1 - yes) to median non-missing is_4wd values
4. Convert missing paint_color to 'unknown' string

In [3]:
# Missing Values: convert model_year, cylinders, odometer, and is_4wd to more suitable values

data['cylinders'] = data['cylinders'].fillna(data['cylinders'].median())
data['odometer'] = data['odometer'].fillna(data['odometer'].mean())
data['is_4wd'] = data['is_4wd'].fillna(data['is_4wd'].median()).astype(int)
data['model_year'] = data['model_year'].fillna(data['model_year'].median())
data['paint_color'] = data['paint_color'].fillna('unknown')


<b> There was a reviewer comment to adjust model_year, but I don't really use it in my graphs, so I decided not to modify it. </b>

In [4]:
# Format Change: convert date_posted to date_time
data['date_posted'] = pd.to_datetime(data['date_posted'], format = '%Y-%m-%d')


In [5]:
# Remove Duplicates:
data = data.drop_duplicates(subset = None, keep = 'first')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  int64         
 1   model_year    51525 non-null  float64       
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     51525 non-null  float64       
 5   fuel          51525 non-null  object        
 6   odometer      51525 non-null  float64       
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   51525 non-null  object        
 10  is_4wd        51525 non-null  int32         
 11  date_posted   51525 non-null  datetime64[ns]
 12  days_listed   51525 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int32(1), int64(2), object(6)
memory usage: 5.3+ MB


In [7]:
# creating title for page

st.header("Pricing Analysis")

2022-12-08 09:57:10.252 
  command:

    streamlit run C:\Users\teres\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

In [8]:
# create options for histogram elements by creating selectbox
list_for_hist = ['is_4wd','model','condition']
choice_for_hist = st.selectbox('Choose Options', list_for_hist)

# create histogram using plotly-express
fig1 = px.histogram(data, x ="price", color = choice_for_hist)
fig1.update_layout(
title = '<b>Price Analysis -- {}</b>'.format(choice_for_hist))

# embedding into streamlit
st.plotly_chart(fig1)

DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

In [9]:
# create distribution for scatterplot
list_for_scatter = ['model', 'condition', 'odometer']
choice_for_scatter = st.selectbox('Choose Factors: ',list_for_scatter)

# create scatterplot using plotly-express
fig2 = px.scatter(data,x = choice_for_scatter, y = 'price')
fig2.update_layout(
title = '<b>Price based on {}</b>'.format(choice_for_scatter))
st.plotly_chart(fig2)

DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)