In [157]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [158]:
# Load the raw training data. The path is relative to the notebook's working
# directory; forward slashes are used because they resolve on Windows, Linux
# and macOS alike, whereas a backslash path only resolves on Windows.
df = pd.read_csv("../data/raw/train.csv")

In [159]:
# Preview the first rows of the raw frame to see the columns and value formats.
df.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [160]:
# Summarise column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  str    
 3   Manufacturer      19237 non-null  str    
 4   Model             19237 non-null  str    
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  str    
 7   Leather interior  19237 non-null  str    
 8   Fuel type         19237 non-null  str    
 9   Engine volume     19237 non-null  str    
 10  Mileage           19237 non-null  str    
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  str    
 13  Drive wheels      19237 non-null  str    
 14  Doors             19237 non-null  str    
 15  Wheel             19237 non-null  str    
 16  Color             19237 non-null  str    
 17  Airb

In [161]:
# Feature matrix: the raw frame without the identifier, the target and the
# columns that are excluded from the features.
X = df.drop(columns=['ID', 'Price', 'Prod. year', 'Wheel', 'Doors'])

# Target, selected by name and kept as a one-column DataFrame. Selecting by
# label (rather than by position) keeps this correct even if the column
# order of the CSV changes.
y = df[['Price']]

In [162]:
# Raw values look like "186005 km": split on whitespace, keep the first
# token (the number) and cast it to an integer.
mileage_tokens = X['Mileage'].str.split()
X['Mileage'] = mileage_tokens.str.get(0).astype(int)

In [163]:
# A missing levy is written as '-' in the raw data. Substitute the string
# '0' (not the integer 0) so the column stays homogeneous text right up to
# the single integer cast, instead of briefly mixing str and int values.
X['Levy'] = X['Levy'].replace('-', '0').astype(int)

In [164]:
# 1 when the engine description mentions "Turbo" (any casing), else 0;
# missing descriptions count as "no turbo".
# Must run while 'Engine volume' still holds the raw text.
turbo_mask = X['Engine volume'].str.contains('Turbo', case=False, na=False)
X['is_turbo'] = turbo_mask.astype(int)

In [165]:
# Keep only the leading number of the engine description and store it as a
# float. expand=False makes extract() return a Series for the single capture
# group, so a Series (not a one-column DataFrame) is assigned to the column.
X['Engine volume'] = (
    X['Engine volume']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)

In [166]:
# Display the feature frame after the numeric clean-up steps above.
X

Unnamed: 0,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Color,Airbags,is_turbo
0,1399,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,Silver,12,0
1,1018,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,Black,8,0
2,0,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,Black,2,0
3,862,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,White,0,0
4,446,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,Silver,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,0,MERCEDES-BENZ,CLK 200,Coupe,Yes,CNG,2.0,300000,4.0,Manual,Rear,Silver,5,1
19233,831,HYUNDAI,Sonata,Sedan,Yes,Petrol,2.4,161600,4.0,Tiptronic,Front,Red,8,0
19234,836,HYUNDAI,Tucson,Jeep,Yes,Diesel,2.0,116365,4.0,Automatic,Front,Grey,4,0
19235,1288,CHEVROLET,Captiva,Jeep,Yes,Diesel,2.0,51258,4.0,Automatic,Front,Black,4,0


In [167]:
# Sub-frame holding the remaining text (categorical) columns of X.
# Listing both 'object' and 'string' follows the pandas string-migration
# guide: it selects text columns under pandas 2 (object dtype) and pandas 3
# (dedicated string dtype), where include='object' alone emits a warning.
cat_col = X.select_dtypes(include=['object', 'string'])

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_col= X.select_dtypes(include='object')


In [168]:
# Sort every categorical column into an encoding bucket by its number of
# distinct values:
#   up to 5   -> OHE_cols
#   6 to 15   -> MED_cols
#   above 15  -> FED_cols
OHE_cols, MED_cols, FED_cols = [], [], []

for col in cat_col:
    n_levels = X[col].nunique()

    if n_levels > 15:
        FED_cols.append(col)
    elif n_levels > 5:
        MED_cols.append(col)
    else:
        OHE_cols.append(col)

In [169]:
# Report which columns landed in which encoding bucket.
for label, columns in (
    ("One-Hot Encoding Columns:", OHE_cols),
    ("Mean/Target Encoding Columns:", MED_cols),
    ("Frequency Encoding Columns:", FED_cols),
):
    print(label, columns)

One-Hot Encoding Columns: ['Leather interior', 'Gear box type', 'Drive wheels']
Mean/Target Encoding Columns: ['Category', 'Fuel type']
Frequency Encoding Columns: ['Manufacturer', 'Model', 'Color']


### One-Hot Encoding

In [170]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the low-cardinality columns; the encoded matrix is
# converted to a dense array so it can be wrapped in a DataFrame.
enc = OneHotEncoder()
encoded = enc.fit_transform(X[OHE_cols])
ohe_array = encoded.toarray()

# Name the indicator columns after the source column and category, and keep
# X's index so the rows line up when the frames are joined.
ohe_cols = enc.get_feature_names_out(OHE_cols)
ohe_df = pd.DataFrame(data=ohe_array, index=X.index, columns=ohe_cols)

# Swap the original categorical columns for their indicator columns.
X_without_ohe = X.drop(columns=OHE_cols)
X = pd.concat([X_without_ohe, ohe_df], axis=1)

### Target Encoding

In [173]:
for col in MED_cols:
    # Replace each category with the mean Price of its group. The value is
    # computed entirely from the untouched raw frame and aligned to X by
    # index, so executing this cell again gives the same result (mapping the
    # already-encoded column through the category-keyed means would instead
    # turn every value into NaN).
    # NOTE(review): the means are computed over all rows, including each
    # row's own target; if a train/validation split happens later this leaks
    # target information - consider fitting the means on the training part
    # only.
    X[col] = df.groupby(col)['Price'].transform('mean')

### Frequency Encoding

In [171]:
for col in FED_cols:
    # Replace each category with the number of rows that carry it. Counts
    # and lookup both read the untouched raw frame (as the target-encoding
    # cell does), so executing this cell again gives the same result
    # instead of silently re-encoding the counts themselves.
    freq = df[col].value_counts()
    X[col] = df[col].map(freq)