In [132]:
# 1. Import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st


In [133]:
# Load the vehicle listings from the CSV file; the relative path is resolved
# against the current working directory.
df = pd.read_csv(filepath_or_buffer="vehicles_us.csv")

In [134]:
# Preview the first ten rows to see the column layout; several cells are
# already empty (NaN) in this small sample.
df.head(10)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,1.0,2018-06-20,15
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,1.0,2019-01-07,68
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17


In [135]:
# Size of the raw frame as a (rows, columns) tuple.
df.shape

(51525, 13)

In [136]:
# Column dtypes as inferred by read_csv; note that date_posted is loaded as a
# plain object column rather than a datetime.
df.dtypes

price             int64
model_year      float64
model            object
condition        object
cylinders       float64
fuel             object
odometer        float64
transmission     object
type             object
paint_color      object
is_4wd          float64
date_posted      object
days_listed       int64
dtype: object

In [137]:
# Number of missing values per column; the cleaning cell further down drops or
# fills the columns that show gaps here.
df.isna().sum()

price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

In [138]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,days_listed
count,51525.0,47906.0,46265.0,43633.0,25572.0,51525.0
mean,12132.46492,2009.75047,6.125235,115553.461738,1.0,39.55476
std,10040.803015,6.282065,1.66036,65094.611341,0.0,28.20427
min,1.0,1908.0,3.0,0.0,1.0,0.0
25%,5000.0,2006.0,4.0,70000.0,1.0,19.0
50%,9000.0,2011.0,6.0,113000.0,1.0,33.0
75%,16839.0,2014.0,8.0,155000.0,1.0,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,271.0


In [139]:
# Summary of the object-typed columns: non-null count, number of distinct
# values, most frequent value and its frequency.
df.describe(include="object")

Unnamed: 0,model,condition,fuel,transmission,type,paint_color,date_posted
count,51525,51525,51525,51525,51525,42258,51525
unique,100,6,5,3,13,12,354
top,ford f-150,excellent,gas,automatic,SUV,white,2019-03-17
freq,2796,24773,47288,46902,12405,10029,186


In [140]:
# Clean the listings in a single chained expression so that no column is ever
# assigned onto the filtered frame returned by dropna (the pattern that triggers
# pandas' SettingWithCopyWarning).
#   * rows missing model_year, odometer, cylinders or paint_color are dropped;
#   * is_4wd only ever holds 1.0 or NaN in this data (describe() shows
#     min == max == 1), so a missing value is encoded as 0 and the column is
#     stored as an integer flag.
df = (
    df
    .dropna(subset=['model_year', 'odometer', 'cylinders', 'paint_color'])
    .assign(is_4wd=lambda listings: listings['is_4wd'].fillna(0).astype(int))
)

In [141]:
# Labels of every numeric column in the cleaned frame; reused by the
# distribution plots and the correlation heatmap below.
numeric_cols = df.select_dtypes(include=["number"]).columns

In [142]:
st.title("Exploratory Data Analysis of Vehicles Dataset")

# Distribution plots: one 30-bin histogram per numeric column, each with a
# box plot drawn in the margin.
st.subheader("Distributions of Numeric Columns")
for column in numeric_cols:
    fig = px.histogram(
        data_frame=df,
        x=column,
        nbins=30,
        marginal="box",
        title=f"Distribution of {column}",
    )
    st.plotly_chart(fig)
    



In [143]:
# Ten most frequent models as a two-column frame (model, count) ready for plotting.
# The columns are relabelled explicitly because the names produced by
# value_counts().reset_index() differ between pandas versions.
top_models = (
    df['model']
    .value_counts()
    .head(10)
    .reset_index()
    .set_axis(['model', 'count'], axis=1)
)

In [144]:
# Bar chart of the ten most common models and their listing counts.
fig_1 = px.bar(
    data_frame=top_models,
    x='model',
    y='count',
    title="Top 10 Most Common Models",
)
st.plotly_chart(fig_1)



DeltaGenerator()

In [145]:
# Scatter plot of price against odometer reading; points are drawn
# half-transparent so dense regions remain readable.
st.subheader("Price vs Odometer")
fig = px.scatter(
    data_frame=df,
    x='odometer',
    y='price',
    opacity=0.5,
    title="Price vs Odometer",
)
st.plotly_chart(fig)




DeltaGenerator()

In [146]:
# Box plot of price, one box per value of the condition column.
st.subheader("Price by Vehicle Condition")
fig = px.box(
    data_frame=df,
    x='condition',
    y='price',
    title="Price by Vehicle Condition",
)
st.plotly_chart(fig)



DeltaGenerator()

In [147]:
# Heatmap of the pairwise correlations between the numeric columns, with the
# coefficient printed in every cell.
st.subheader("Correlation Heatmap")
corr = df[numeric_cols].corr()
fig = px.imshow(
    img=corr,
    text_auto=True,
    color_continuous_scale="RdBu_r",
    title="Correlation Heatmap",
)
st.plotly_chart(fig)



DeltaGenerator()

In [148]:
# Stand-alone 30-bin histogram of the days_listed column.
st.subheader("Distribution of Days Listed")
fig = px.histogram(
    data_frame=df,
    x='days_listed',
    nbins=30,
    title="Distribution of Days Listed",
)
st.plotly_chart(fig)



DeltaGenerator()

In [149]:
# Derive the manufacturer as the first whitespace-separated token of `model`
# (e.g. "ford f-150" -> "ford").
# The vectorised .str accessor replaces the per-row Python lambda and yields NaN
# instead of raising IndexError when a model string is empty or whitespace-only.
# assign() returns a new frame, so nothing is written into a filtered copy.
df = df.assign(manufacturer=df['model'].str.split().str[0])

In [150]:
st.header('Data viewer')
show_manuf_1k_ads = st.checkbox('Include manufacturers with less than 1000 ads')
if not show_manuf_1k_ads:
    # Number of ads of each row's manufacturer, aligned with df's index.
    ads_per_manufacturer = df.groupby('manufacturer')['manufacturer'].transform('size')
    # The checkbox talks about manufacturers with *less than* 1000 ads, so a
    # manufacturer with exactly 1000 ads must be kept when the box is unchecked
    # (the previous `> 1000` test dropped it).
    df = df[ads_per_manufacturer >= 1000]



In [151]:
# Show the listings table (already narrowed by the checkbox filter when it is
# unchecked), then break vehicle types down by manufacturer.
st.dataframe(df)

st.header('Vehicle types by manufacturer')
types_by_manufacturer_fig = px.histogram(df, x='manufacturer', color='type')
st.write(types_by_manufacturer_fig)

# Heading for the model-year histogram that is rendered by the following cell.
st.header('Histogram of `condition` vs `model_year`')




DeltaGenerator()

In [152]:
# Histogram of model_year with the bars coloured by condition.
year_by_condition_fig = px.histogram(df, x='model_year', color='condition')
st.write(year_by_condition_fig)



In [None]:
st.header('Compare price distribution between manufacturers')


def _default_index(options, preferred):
    """Return the position of `preferred` in `options`, or 0 when it is absent."""
    return options.index(preferred) if preferred in options else 0


# dropna() keeps sorted() from comparing strings with NaN should a row have no
# manufacturer.
manufac_list = sorted(df['manufacturer'].dropna().unique())

# Both select boxes fall back to the first entry when their preferred default
# is not available; previously only the second one was guarded and a missing
# 'chevrolet' raised ValueError.
manufacturer_1 = st.selectbox(
    'Select manufacturer 1',
    manufac_list,
    index=_default_index(manufac_list, 'chevrolet'),
)
manufacturer_2 = st.selectbox(
    'Select manufacturer 2',
    manufac_list,
    index=_default_index(manufac_list, 'hyundai'),
)

# Keep only the listings of the two selected manufacturers.
mask_filter = (df['manufacturer'] == manufacturer_1) | (df['manufacturer'] == manufacturer_2)
df_filtered = df[mask_filter]

# With normalisation on, bars show percentages instead of raw counts so that
# manufacturers with very different numbers of ads can be compared.
normalize = st.checkbox('Normalize histogram', value=True)
histnorm = 'percent' if normalize else None

# Overlaid (not stacked) price histograms, one colour per manufacturer.
st.write(px.histogram(df_filtered,
                      x='price',
                      nbins=30,
                      color='manufacturer',
                      histnorm=histnorm,
                      barmode='overlay'))

