**Car_clustering data preprocessing**

In [32]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler

Load dataset

In [26]:
# Load the dataset
df = pd.read_csv('car_price.csv')

# Display basic info
print(f"Dataset shape: {df.shape}")
print("Columns:", df.columns.tolist())

# Show a few example rows
df.head()


Dataset shape: (205, 26)
Columns: ['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price']


Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


Data Cleaning

In [27]:
df['CarName'] = df['CarName'].replace({
    'maxda rx3': 'mazda rx3',
    'maxda glc deluxe': 'mazda glc deluxe',
    'porcshe panamera': 'porsche panamera',
    'toyuota tercel': 'toyota tercel',
    'vokswagen rabbit': 'volkswagen rabbit'
})


One-hot 

In [28]:
df['CarBrand'] = df['CarName'].str.split().str[0].str.lower()
df = pd.get_dummies(df, columns=['CarBrand', 'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem'], dtype=int)
df.head()


Unnamed: 0,car_ID,symboling,CarName,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,...,cylindernumber_twelve,cylindernumber_two,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,1,3,alfa-romero giulia,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
1,2,3,alfa-romero stelvio,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
2,3,1,alfa-romero Quadrifoglio,94.5,171.2,65.5,52.4,2823,152,2.68,...,0,0,0,0,0,0,0,1,0,0
3,4,2,audi 100 ls,99.8,176.6,66.2,54.3,2337,109,3.19,...,0,0,0,0,0,0,0,1,0,0
4,5,2,audi 100ls,99.4,176.6,66.4,54.3,2824,136,3.19,...,0,0,0,0,0,0,0,1,0,0


Z-score

In [29]:
columns_to_check = [
    'wheelbase', 'enginesize', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price'
]
for col in columns_to_check:
    if col in df.columns:
        z_scores = np.abs(zscore(df[col]))
        median_value = df[col].median()
        df.loc[z_scores > 3, col] = median_value
        print(f"Processed column: {col}")
df.head()

Processed column: wheelbase
Processed column: enginesize
Processed column: stroke
Processed column: compressionratio
Processed column: horsepower
Processed column: peakrpm
Processed column: citympg
Processed column: highwaympg
Processed column: price


Unnamed: 0,car_ID,symboling,CarName,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,...,cylindernumber_twelve,cylindernumber_two,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,1,3,alfa-romero giulia,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
1,2,3,alfa-romero stelvio,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
2,3,1,alfa-romero Quadrifoglio,94.5,171.2,65.5,52.4,2823,152,2.68,...,0,0,0,0,0,0,0,1,0,0
3,4,2,audi 100 ls,99.8,176.6,66.2,54.3,2337,109,3.19,...,0,0,0,0,0,0,0,1,0,0
4,5,2,audi 100ls,99.4,176.6,66.4,54.3,2824,136,3.19,...,0,0,0,0,0,0,0,1,0,0


Normalization

In [33]:
print("Columns:", df.columns.tolist())
num_cols = [
    'symboling', 'wheelbase','carlength','carwidth','carheight',
    'curbweight','enginesize','boreratio','stroke','compressionratio',
    'horsepower','peakrpm','citympg','highwaympg','price'
]

scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head()


Columns: ['car_ID', 'symboling', 'CarName', 'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price', 'CarBrand_alfa-romero', 'CarBrand_audi', 'CarBrand_bmw', 'CarBrand_buick', 'CarBrand_chevrolet', 'CarBrand_dodge', 'CarBrand_honda', 'CarBrand_isuzu', 'CarBrand_jaguar', 'CarBrand_mazda', 'CarBrand_mercury', 'CarBrand_mitsubishi', 'CarBrand_nissan', 'CarBrand_peugeot', 'CarBrand_plymouth', 'CarBrand_porcshce', 'CarBrand_porsche', 'CarBrand_renault', 'CarBrand_saab', 'CarBrand_subaru', 'CarBrand_toyota', 'CarBrand_toyouta', 'CarBrand_volkswagen', 'CarBrand_volvo', 'CarBrand_vw', 'fueltype_diesel', 'fueltype_gas', 'aspiration_std', 'aspiration_turbo', 'doornumber_four', 'doornumber_two', 'carbody_convertible', 'carbody_hardtop', 'carbody_hatchback', 'carbody_sedan', 'carbody_wagon', 'drivewheel_4wd', 'drivewheel_fwd', 'drivewheel_rwd', 'enginelocation_front', 'engine

Unnamed: 0,car_ID,symboling,CarName,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,...,cylindernumber_twelve,cylindernumber_two,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,1,1.0,alfa-romero giulia,0.068966,0.413433,0.316667,0.083333,0.411171,0.398844,0.664286,...,0,0,0,0,0,0,0,1,0,0
1,2,1.0,alfa-romero stelvio,0.068966,0.413433,0.316667,0.083333,0.411171,0.398844,0.664286,...,0,0,0,0,0,0,0,1,0,0
2,3,0.6,alfa-romero Quadrifoglio,0.272414,0.449254,0.433333,0.383333,0.517843,0.526012,0.1,...,0,0,0,0,0,0,0,1,0,0
3,4,0.8,audi 100 ls,0.455172,0.529851,0.491667,0.541667,0.329325,0.277457,0.464286,...,0,0,0,0,0,0,0,1,0,0
4,5,0.8,audi 100ls,0.441379,0.529851,0.508333,0.541667,0.518231,0.433526,0.464286,...,0,0,0,0,0,0,0,1,0,0


Save processed data

In [34]:
df.to_csv("car_price_preprocessed.csv", index=False)