In [1]:
import pandas as pd
import plotly.express as px
import altair as alt
import streamlit as st


# Load the Dataset Files

try:
    us_vehicles = pd.read_csv('vehicles_us.csv')
except FileNotFoundError:
    print("CSV file not found")

# Overview of Data Structure and Types
display("Dataset info:")
display(us_vehicles.info())

# This missing_values_below_row_count function finds columns that have missing values (empty or NaN) 
# How it works: 
# 1. Counts missing values in each column 
# 2. Finds columns that have any missing values 
# 3. Shows these columns and how many values are missing

def missing_values_below_row_count(us_vehicles):
    missing_values_count = us_vehicles.isnull().sum()
    print("Column with Missing Values:")
    for col in missing_values_count[missing_values_count > 0].index:
        display(f"**{col}**: {missing_values_count[col]} nulls")

missing_values_below_row_count(us_vehicles)

# Previewing First 10 Rows of the Dataframe
display("\nFirst Few Rows of Data:")
display(us_vehicles.head(10))

# Check if certain columns have decimal numbers 
# How it works: 
# 1. Look at 'model_year', 'cylinders', and 'is_4wd columns 
# 2. Remove missing values 
# 3. Use decimal_flags to find numbers with decimal points 
# 4. Tell us if the column can be changed to whole numbers safely

for col in ['model_year', 'cylinders', 'is_4wd']:
    decimal_flags = us_vehicles[col].dropna().apply(lambda x: x % 1 !=0)
    if any(decimal_flags):
        display(f"{col} has decimal flags")
    else:
        display(f"{col} can be safely converted to int64")

# Converting numeric columns to Int64 based on our earlier loop check that showed no decimal values
us_vehicles[['model_year', 'cylinders', 'is_4wd']] = us_vehicles[['model_year', 'cylinders', 'is_4wd']].astype('Int64')

# Check if model_year, cylinders, and is_4wd are now Int64 type
display(us_vehicles[['model_year', 'cylinders', 'is_4wd']].dtypes)

# Display unique colors and their counts
display(us_vehicles['paint_color'].value_counts())

# Display list of unique colors
display(us_vehicles['paint_color'].unique())

#Convert to lower case and remove spaces
us_vehicles['paint_color'] = us_vehicles['paint_color'].str.lower().str.strip()

# Recheck cleaned unique colors and their counts
display(us_vehicles['paint_color'].value_counts())

# Recheck cleaned list of unique colors
display(us_vehicles['paint_color'].unique())

#Convert data type from object to category
us_vehicles['paint_color'] = us_vehicles['paint_color'].astype('str').astype('category')

# Check if paint_color is now category type
display(us_vehicles['paint_color'].dtype)

type_counts = us_vehicles['type'].value_counts()
display(type_counts)

# Recheck cleaned first Few Rows
display("\nFirst Few Rows of Cleaned Data:")
display(us_vehicles.head(10))

# Basic Statisical Analysis
display("Statistical Summary:")
display(us_vehicles.describe())

# Checking for Missing Values in Each Column
display("Missing Dataset info:")
display(us_vehicles.isnull().sum())

columns_to_use = ['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'transmission', 'type', 'paint_color']
us_vehicles = us_vehicles[columns_to_use]

# Recheck Statisical Analysis Review 
display("Statistical Summary:")
display(us_vehicles.describe())

def analyze_vehicle_charts(us_vehicles):

    # Price distribution histogram
    hist_price_by_type = px.histogram(us_vehicles, x='price', color='type', nbins=30,
                        title='Vehicle Price Distribution by Type')
    st.plotly_chart(hist_price_by_type)
    
    # Price vs Mileage scatter plot
    scatter_price_vs_mileage = px.scatter(us_vehicles, x='odometer', y='price', color='type',
                        title='Mileage Impact on Price Across Vehicle Types')
    st.plotly_chart(scatter_price_vs_mileage)
    
    # Technical attributes scatter matrix
    matrix_technical_features = px.scatter_matrix(us_vehicles,
        dimensions=['fuel', 'odometer', 'transmission', 'cylinders'],
        color='price')
    st.plotly_chart(matrix_technical_features)
    

    return hist_price_by_type, scatter_price_vs_mileage, matrix_technical_features     

'Dataset info:'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


None

Column with Missing Values:


'**model_year**: 3619 nulls'

'**cylinders**: 5260 nulls'

'**odometer**: 7892 nulls'

'**paint_color**: 9267 nulls'

'**is_4wd**: 25953 nulls'

'\nFirst Few Rows of Data:'

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,1.0,2018-06-20,15
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,1.0,2019-01-07,68
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17


'model_year can be safely converted to int64'

'cylinders can be safely converted to int64'

'is_4wd can be safely converted to int64'

model_year    Int64
cylinders     Int64
is_4wd        Int64
dtype: object

paint_color
white     10029
black      7692
silver     6244
grey       5037
blue       4475
red        4421
green      1396
brown      1223
custom     1153
yellow      255
orange      231
purple      102
Name: count, dtype: int64

array([nan, 'white', 'red', 'black', 'blue', 'grey', 'silver', 'custom',
       'orange', 'yellow', 'brown', 'green', 'purple'], dtype=object)

paint_color
white     10029
black      7692
silver     6244
grey       5037
blue       4475
red        4421
green      1396
brown      1223
custom     1153
yellow      255
orange      231
purple      102
Name: count, dtype: int64

array([nan, 'white', 'red', 'black', 'blue', 'grey', 'silver', 'custom',
       'orange', 'yellow', 'brown', 'green', 'purple'], dtype=object)

CategoricalDtype(categories=['black', 'blue', 'brown', 'custom', 'green', 'grey', 'nan',
                  'orange', 'purple', 'red', 'silver', 'white', 'yellow'],
, ordered=False, categories_dtype=object)

type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64

'\nFirst Few Rows of Cleaned Data:'

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,1.0,2018-06-20,15
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,1.0,2019-01-07,68
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17


'Statistical Summary:'

Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,days_listed
count,51525.0,47906.0,46265.0,43633.0,25572.0,51525.0
mean,12132.46492,2009.75047,6.125235,115553.461738,1.0,39.55476
std,10040.803015,6.282065,1.66036,65094.611341,0.0,28.20427
min,1.0,1908.0,3.0,0.0,1.0,0.0
25%,5000.0,2006.0,4.0,70000.0,1.0,19.0
50%,9000.0,2011.0,6.0,113000.0,1.0,33.0
75%,16839.0,2014.0,8.0,155000.0,1.0,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,271.0


'Missing Dataset info:'

price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color         0
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

'Statistical Summary:'

Unnamed: 0,price,model_year,cylinders,odometer
count,51525.0,47906.0,46265.0,43633.0
mean,12132.46492,2009.75047,6.125235,115553.461738
std,10040.803015,6.282065,1.66036,65094.611341
min,1.0,1908.0,3.0,0.0
25%,5000.0,2006.0,4.0,70000.0
50%,9000.0,2011.0,6.0,113000.0
75%,16839.0,2014.0,8.0,155000.0
max,375000.0,2019.0,12.0,990000.0
