In [7]:
import pandas as pd
import plotly.express as px

# --- Step 2: Load the dataset ---
# Relative location of the vehicle listings CSV; adjust it if the file
# lives somewhere else.
csv_path = '../vehicles_us.csv'
df = pd.read_csv(csv_path)
# Preview the first rows as the cell's rendered output.
df.head()


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [2]:
# --- Step 3: Basic overview ---
# A notebook cell only renders its *last* bare expression. A bare
# `df.describe()` in the middle of the cell is computed and then discarded,
# so the summary statistics never appear. Routing it through display()
# (available without an import inside an IPython kernel) makes it visible.
df.info()  # prints column names, non-null counts and dtypes itself
display(df.describe())  # summary statistics table
df.isna().sum()  # missing values per column; rendered as the cell's result



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

In [3]:
# --- Step 4: Clean a bit (optional) ---
# Remove exact duplicate rows, then fill the gaps in the two numeric
# columns below with that column's median value.
df = df.drop_duplicates()
for numeric_col in ('model_year', 'odometer'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())


In [1]:
!pip install nbformat





In [7]:
!pip install nbformat

Collecting nbformat
  Downloading nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat)
  Downloading fastjsonschema-2.21.2-py3-none-any.whl.metadata (2.3 kB)
Downloading nbformat-5.10.4-py3-none-any.whl (78 kB)
Downloading fastjsonschema-2.21.2-py3-none-any.whl (24 kB)
Installing collected packages: fastjsonschema, nbformat
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [nbformat]
[1A[2KSuccessfully installed fastjsonschema-2.21.2 nbformat-5.10.4


In [2]:
import nbformat

# Confirm which nbformat release the kernel actually picked up.
installed_nbformat_version = nbformat.__version__
print(installed_nbformat_version)


5.10.4


In [5]:
# Histogram of listing prices in 50 bins.
# This cell needs `px` (plotly.express) and `df` from the cells at the top of
# the notebook. The NameError recorded under it is stale output, apparently
# from running it before the import cell; it is not a problem with the call.
# The x-axis label matches the 'Price ($)' wording used by the scatter plots
# further down instead of showing the raw column name.
fig_price = px.histogram(
    df,
    x='price',
    nbins=50,
    title='Distribution of Car Prices',
    labels={'price': 'Price ($)'},
)
fig_price.show()

NameError: name 'px' is not defined

In [8]:
# Histogram of listing prices in 50 bins.
# The x-axis label matches the 'Price ($)' wording used by the scatter plots
# further down instead of showing the raw column name.
fig_price = px.histogram(
    df,
    x='price',
    nbins=50,
    title='Distribution of Car Prices',
    labels={'price': 'Price ($)'},
)
fig_price.show()

In [9]:
# --- Step 6: Create scatter plots ---
# Both figures pass `labels=` so the axes read as human-friendly names rather
# than raw column names, using the same wording as the labelled scatter plots
# later in the notebook.

# Mileage against asking price, one color per vehicle type.
fig_scatter_price_odometer = px.scatter(
    df,
    x='odometer',
    y='price',
    color='type',
    title='Price vs Odometer (by Car Type)',
    labels={'odometer': 'Mileage (Odometer)', 'price': 'Price ($)'},
)
fig_scatter_price_odometer.show()

# Model year against asking price, one color per fuel type.
fig_year_price = px.scatter(
    df,
    x='model_year',
    y='price',
    color='fuel',
    title='Car Price vs Model Year (by Fuel Type)',
    labels={'model_year': 'Model Year', 'price': 'Price ($)'},
)
fig_year_price.show()

In [10]:
# --- Step 7: Creative insight examples ---
# Average price per *model*, not per manufacturer: the grouping key is the
# `model` column, whose values are full model strings such as "ford f-150",
# and the chart title says "Models". The variable names keep their original
# "make" spelling so any other cell that refers to them keeps working.
avg_price_by_make = df.groupby('model')['price'].mean().reset_index()

# Keep only the 20 models with the highest mean price for the bar chart.
top_models_by_price = avg_price_by_make.sort_values('price', ascending=False).head(20)

fig_make_price = px.bar(
    top_models_by_price,
    x='model',
    y='price',
    title='Top 20 Models by Average Price',
    # The y-axis is a mean, so label it as such rather than plain "price".
    labels={'model': 'Model', 'price': 'Average Price ($)'},
)
fig_make_price.show()


In [11]:
# --- Step 8: Summary ---
# Emit a closing status line for the exploratory pass.
completion_message = "EDA complete — histograms and scatter plots created successfully!"
print(completion_message)

EDA complete — histograms and scatter plots created successfully!


In [None]:
## Scatter Plots

The following scatter plots show how key factors relate to car price.
The first plots mileage (odometer) against price, colored by condition;
the second plots model year against price, colored by transmission type.


In [12]:
# Mileage against asking price, with one color per listed condition.
# Points are drawn semi-transparent (opacity 0.6).
odometer_axis_labels = {'odometer': 'Mileage (Odometer)', 'price': 'Price ($)'}
fig_scatter_odometer = px.scatter(
    df, x='odometer', y='price', color='condition',
    opacity=0.6,
    labels=odometer_axis_labels,
    title='Price vs Odometer (by Condition)',
)
fig_scatter_odometer.show()


In [13]:
# Model year against asking price, with one color per transmission type.
# Points are drawn semi-transparent (opacity 0.7).
year_axis_labels = {'model_year': 'Model Year', 'price': 'Price ($)'}
fig_scatter_year = px.scatter(
    df, x='model_year', y='price', color='transmission',
    opacity=0.7,
    labels=year_axis_labels,
    title='Model Year vs Price (by Transmission Type)',
)
fig_scatter_year.show()
