# Car Sales Advertisement Analysis Dashboard

- This analysis explores a dataset of car sales advertisements to uncover trends and insights. 

- Its aim is to preprocess the data, address its missing values, and identify key patterns in it.

In [79]:
# Importing the necessary packages and loading the dataset for the analysis

import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np
from scipy.stats import zscore

# Render Plotly figures with the "vscode" renderer
pio.renderers.default = "vscode"

# Load the advertisement data from the relative path ../vehicles_us.csv
df = pd.read_csv(filepath_or_buffer="../vehicles_us.csv")

In [80]:
# Performing initial assessment of data: column dtypes and non-null counts,
# summary statistics, and the first few rows

df.info()
# Only the last expression of a cell is rendered automatically, so the summary
# statistics have to be displayed explicitly or they are silently discarded.
display(df.describe())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [81]:
# Checking columns in the dataset (displays the column Index of df)

df.columns

Index(['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'transmission', 'type', 'paint_color', 'is_4wd',
       'date_posted', 'days_listed'],
      dtype='object')

# Data Preprocessing

**Model Year**: missing values are filled by grouping the data by model and using the median year of each group.

**Cylinders**: missing cylinder values are filled by grouping the data by model and using the median of each group.

**Odometer**: missing odometer readings are filled with the median of each (model year, model) group; any readings that are still missing afterwards are filled with the overall median.

In [98]:
# Make sure every column the imputation steps rely on is present

required_columns = ['model', 'model_year', 'cylinders', 'odometer']
missing_columns = [name for name in required_columns if name not in df.columns]
if len(missing_columns) > 0:
    raise ValueError(f"Missing columns: {missing_columns}")

In [99]:
# Check for empty groups before applying transformation

grouped = df.groupby(['model_year', 'model'])['odometer']

In [100]:
# Fill NaNs with a default value if needed

df['odometer'].fillna(df['odometer'].median(), inplace=True)

In [101]:
# Fill missing odometer values

df['odometer'] = grouped.transform(lambda x: x.fillna(x.median() if not x.empty else np.nan))

In [102]:
# If there are still missing values, fill with the overall median

overall_median = df['odometer'].median()

df['odometer'] = df['odometer'].fillna(overall_median)

In [103]:
# Confirm that the odometer column has no missing values left

remaining_missing = df['odometer'].isna().sum()
print(remaining_missing)  # Should be 0

0


In [104]:
# Fill missing model_year values with the median model_year of the same model.
# The built-in 'median' transform is vectorized, and fillna only touches
# missing entries, so existing values are left exactly as they are.

model_year_median = df.groupby('model')['model_year'].transform('median')
df['model_year'] = df['model_year'].fillna(model_year_median)

In [105]:
# Fill missing cylinders values with the median cylinder count of the same model.
# The built-in 'median' transform is vectorized, and fillna only touches
# missing entries, so existing values are left exactly as they are.

cylinders_median = df.groupby('model')['cylinders'].transform('median')
df['cylinders'] = df['cylinders'].fillna(cylinders_median)

In [106]:
# Fill missing odometer values with the median of the same (model_year, model)
# group. fillna only touches missing entries, so a row whose model_year is
# still missing (and therefore has no group) keeps its existing reading
# instead of being overwritten with NaN.

odometer_group_median = df.groupby(['model_year', 'model'])['odometer'].transform('median')
df['odometer'] = df['odometer'].fillna(odometer_group_median)

In [107]:
# Summarise price and model_year before any outlier removal

price_year_summary = df[['price', 'model_year']].describe()
print(price_year_summary)

               price    model_year
count   51525.000000  51525.000000
mean    12132.464920   2009.793954
std     10040.803015      6.099296
min         1.000000   1908.000000
25%      5000.000000   2007.000000
50%      9000.000000   2011.000000
75%     16839.000000   2014.000000
max    375000.000000   2019.000000


In [110]:
# Keep only the rows whose price and model_year both have a Z-score with
# absolute value below 3

price_within_range = np.abs(zscore(df['price'])) < 3
year_within_range = np.abs(zscore(df['model_year'])) < 3
df_filtered = df[price_within_range & year_within_range]

In [111]:
# Verify results: summarise the filtered frame. The outlier filter stored its
# result in df_filtered and left df untouched, so df_filtered is the frame
# to inspect here.

print(df_filtered.describe())

              price    model_year     cylinders       odometer   is_4wd  \
count  50382.000000  50382.000000  50382.000000   50382.000000  24903.0   
mean   11651.771466   2010.025336      6.090985  116462.809486      1.0   
std     8581.544704      5.221046      1.655600   61983.583479      0.0   
min        1.000000   1992.000000      3.000000       0.000000      1.0   
25%     4999.000000   2007.000000      4.000000   73800.000000      1.0   
50%     8995.000000   2011.000000      6.000000  115022.000000      1.0   
75%    16000.000000   2014.000000      8.000000  153585.500000      1.0   
max    42000.000000   2019.000000     12.000000  990000.000000      1.0   

        days_listed  
count  50382.000000  
mean      39.539181  
std       28.208868  
min        0.000000  
25%       19.000000  
50%       33.000000  
75%       53.000000  
max      271.000000  


In [112]:
# Warn when the outlier filter removed every row

no_rows_left = df_filtered.empty
if no_rows_left:
    print("Filtered data is empty. Consider adjusting Z-score thresholds.")

# Data Cleaning 

Removing outliers in model year and price (rows whose Z-score has an absolute value of 3 or more) enhances the clarity of the plots below.

In [113]:
# Removing outliers based on Z-scores: reuse the frame that was already
# filtered (|Z-score| < 3 for both price and model_year) instead of applying
# the same filter to df again. Re-filtering recomputes the Z-scores on
# whatever df currently holds, so every re-run of this cell would trim more rows.

df = df_filtered.copy()

In [114]:
# Distribution of listing prices after outlier removal; title and axis label
# are set so the figure can be read on its own
fig = px.histogram(
    df,
    x="price",
    title="Distribution of listing prices",
    labels={"price": "Price"},
)
fig.show()

In [115]:
# Re-checking columns in the dataset: the outlier filter selects rows only,
# so the set of columns is expected to be unchanged

df.columns

Index(['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'transmission', 'type', 'paint_color', 'is_4wd',
       'date_posted', 'days_listed'],
      dtype='object')

In [116]:
# Relationship between listing price and model year after outlier removal;
# title and axis labels are set so the figure can be read on its own
fig = px.scatter(
    df,
    x="price",
    y="model_year",
    title="Model year vs. price",
    labels={"price": "Price", "model_year": "Model year"},
)
fig.show()

# Conclusion

- After preprocessing, trends in car prices and model years were identified.

- The cleaned dataset reveals that newer models typically have higher prices, with specific models showing consistent pricing patterns. 