In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Load the test-set CSV; the relative path is resolved against the working directory.
df = pd.read_csv("Test_set.csv")


In [53]:
# Report the number of missing values per column, then the frame's (rows, columns).
print("NaN Values in Data\n--------------------------")
print(df.isna().sum())
print(f"-------- Data shape-------\n        {df.shape}\n--------------------------")


NaN Values in Data
--------------------------
Unnamed: 0             0
Make                   0
Model                  0
Price                  0
Year                   0
Kilometer              0
Fuel Type              0
Transmission           0
Location               0
Color                  0
Owner                  0
Seller Type            0
Engine                15
Max Power             15
Max Torque            15
Drivetrain            27
Length                10
Width                 10
Height                10
Seating Capacity      10
Fuel Tank Capacity    19
dtype: int64
-------- Data shape-------
        (359, 21)
--------------------------


In [54]:
# Display the column labels of the frame as loaded.
df.columns

Index(['Unnamed: 0', 'Make', 'Model', 'Price', 'Year', 'Kilometer',
       'Fuel Type', 'Transmission', 'Location', 'Color', 'Owner',
       'Seller Type', 'Engine', 'Max Power', 'Max Torque', 'Drivetrain',
       'Length', 'Width', 'Height', 'Seating Capacity', 'Fuel Tank Capacity'],
      dtype='object')

In [55]:
# Parse the free-text spec columns into numeric features.
# expand=False makes str.extract return a Series for a single capture group;
# rows where the pattern does not match come back as NaN.
number_pattern = r"(\d+\.?\d*)"   # first integer or decimal number in the string
rpm_pattern = r"@\s*(\d+)"        # digits following '@', tolerating zero or more spaces

df["Engine_cc"] = df["Engine"].str.extract(number_pattern, expand=False).astype(float)
df["Power_bhp"] = df["Max Power"].str.extract(number_pattern, expand=False).astype(float)
df["Power_rpm"] = df["Max Power"].str.extract(rpm_pattern, expand=False).astype(float)
df["Torque_Nm"] = df["Max Torque"].str.extract(number_pattern, expand=False).astype(float)
df["Torque_rpm"] = df["Max Torque"].str.extract(rpm_pattern, expand=False).astype(float)

# First whitespace-separated token of the Model string.
df["Base_Model"] = df["Model"].str.split().str[0]

# The raw text columns are no longer needed once parsed; rebind instead of
# mutating in place so the frame object shown by earlier cells is not altered.
cols_to_drop = ["Engine", "Max Power", "Max Torque"]
df = df.drop(columns=cols_to_drop)


In [56]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Build one fill value per column: the median for numeric columns and the
# first mode for object columns, then apply them all in a single fillna.
fill_values = {name: df[name].median() for name in numeric_cols}
fill_values.update({name: df[name].mode()[0] for name in categorical_cols})
df = df.fillna(value=fill_values)

# ***Feature Engineering***

In [57]:
# Short aliases for the base measurements so each formula reads on one line.
bhp = df["Power_bhp"]
torque = df["Torque_Nm"]
power_rpm = df["Power_rpm"]
torque_rpm = df["Torque_rpm"]
litres = df["Engine_cc"] / 1000   # per liter
rpm_ratio = power_rpm / torque_rpm

# New columns are appended in the order listed here.
# NOTE(review): despite its name, Power_weight_ratio divides by Height.
df = df.assign(
    Power_weight_ratio=bhp / df["Height"],
    power_torque_ratio=bhp / torque,
    estimated_acceleration=1 / (bhp * 0.75 + torque * 0.25),
    torque_to_engine=torque / df["Engine_cc"],
    engine_stress=rpm_ratio,
    specific_power=bhp / litres,
    specific_torque=torque / litres,
    rpm_efficiency=power_rpm - torque_rpm,
    norm_power=bhp / power_rpm,
    norm_torque=torque / torque_rpm,
    performance_index=bhp * 0.6 + torque * 0.4 + rpm_ratio * 10,
)


In [58]:
# Display the column labels, which now include the engineered features.
df.columns

Index(['Unnamed: 0', 'Make', 'Model', 'Price', 'Year', 'Kilometer',
       'Fuel Type', 'Transmission', 'Location', 'Color', 'Owner',
       'Seller Type', 'Drivetrain', 'Length', 'Width', 'Height',
       'Seating Capacity', 'Fuel Tank Capacity', 'Engine_cc', 'Power_bhp',
       'Power_rpm', 'Torque_Nm', 'Torque_rpm', 'Base_Model',
       'Power_weight_ratio', 'power_torque_ratio', 'estimated_acceleration',
       'torque_to_engine', 'engine_stress', 'specific_power',
       'specific_torque', 'rpm_efficiency', 'norm_power', 'norm_torque',
       'performance_index'],
      dtype='object')

In [59]:
# Remove the parsed base measurements, keeping only the features derived from them.
raw_measurement_cols = ['Engine_cc', 'Power_bhp', 'Power_rpm', 'Torque_Nm', 'Torque_rpm']
df = df.drop(columns=raw_measurement_cols)

df.columns

Index(['Unnamed: 0', 'Make', 'Model', 'Price', 'Year', 'Kilometer',
       'Fuel Type', 'Transmission', 'Location', 'Color', 'Owner',
       'Seller Type', 'Drivetrain', 'Length', 'Width', 'Height',
       'Seating Capacity', 'Fuel Tank Capacity', 'Base_Model',
       'Power_weight_ratio', 'power_torque_ratio', 'estimated_acceleration',
       'torque_to_engine', 'engine_stress', 'specific_power',
       'specific_torque', 'rpm_efficiency', 'norm_power', 'norm_torque',
       'performance_index'],
      dtype='object')

In [62]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,...,power_torque_ratio,estimated_acceleration,torque_to_engine,engine_stress,specific_power,specific_torque,rpm_efficiency,norm_power,norm_torque,performance_index
0,1700,Maruti Suzuki,Ritz Vdi BS-IV,275000,2013,85785,Diesel,Manual,Ambala Cantt,Silver,...,0.384211,0.00978,0.152244,2.0,58.49359,152.24359,2000.0,0.01825,0.095,139.8
1,1701,Mercedes-Benz,SLK-Class SLK 200 K,2400000,2010,34105,Petrol,Automatic,Mumbai,Grey,...,0.6875,0.005442,0.13363,2.285714,91.870824,133.63029,2250.0,0.04125,0.137143,217.857143
2,1702,Honda,City 1.5 V MT,269000,2010,73000,Petrol,Manual,Delhi,Silver,...,0.808219,0.008,0.097528,2.285714,78.824315,97.52839,2250.0,0.0295,0.083429,152.057143
3,1703,Hyundai,Elite i20 Asta 1.2 [2016-2017],650000,2016,100000,Petrol,Manual,Hyderabad,Grey,...,0.713043,0.01108,0.096074,1.5,68.504595,96.073517,2000.0,0.013667,0.02875,110.2
4,1704,Hyundai,Santro Sportz CNG [2018-2020],545000,2019,43000,CNG,Manual,Mumbai,Silver,...,0.690476,0.015504,0.077348,1.222222,53.406998,77.348066,1000.0,0.010545,0.018667,80.622222


In [63]:
# Count how many rows each Make has.
df["Make"].value_counts()

Make
Maruti Suzuki    80
Hyundai          53
Toyota           32
Audi             25
BMW              24
Mercedes-Benz    22
Honda            22
Mahindra         20
Tata             10
Ford              9
Renault           9
Volkswagen        9
Land Rover        8
Skoda             7
Jeep              6
Kia               5
Volvo             3
Chevrolet         3
MG                2
Porsche           2
MINI              2
Jaguar            2
Rolls-Royce       1
Lexus             1
Maserati          1
Datsun            1
Name: count, dtype: int64

In [64]:
# Brand tiers, each written as a "|"-separated list of Make names.
Luxury_Brand="Mercedes-Benz|Audi|BMW|Maserati|Rolls-Royce|Lamborghini|Ferrari|MINI|Porsche|Land Rover|Jaguar|Lexus"
premuim_Brand="Toyota|Kia|MG|Jeep|Volkswagen|Skoda|Honda"
economy_Brand = "Maruti Suzuki|Hyundai|Mahindra|Tata|Ford|Renault|Nissan|Datsun|Chevrolet|Mitsubishi|Ssangyong|Isuzu|Fiat|Volvo"

# Compare whole Make names against each tier instead of running the tier string
# as a regex: a regex search matches substrings anywhere in the value, and
# returns NaN for missing values, which is not a valid boolean mask.
# isin() always yields a boolean mask; surrounding whitespace is stripped first.
make_names = df["Make"].str.strip()
cond=tuple(
    make_names.isin(tier.split("|"))
    for tier in (Luxury_Brand, premuim_Brand, economy_Brand)
)
# np.select uses the first condition that holds for a row; rows matching no
# tier receive the default label "other".
cate=["Luxury_Brand","Premium_Brand","Economy_Brand"]
df["Brand_Category"]=np.select(cond,cate,default="other")

In [65]:
# Preview the first five rows, now including the Brand_Category column.
df.head()

Unnamed: 0.1,Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,...,estimated_acceleration,torque_to_engine,engine_stress,specific_power,specific_torque,rpm_efficiency,norm_power,norm_torque,performance_index,Brand_Category
0,1700,Maruti Suzuki,Ritz Vdi BS-IV,275000,2013,85785,Diesel,Manual,Ambala Cantt,Silver,...,0.00978,0.152244,2.0,58.49359,152.24359,2000.0,0.01825,0.095,139.8,Economy_Brand
1,1701,Mercedes-Benz,SLK-Class SLK 200 K,2400000,2010,34105,Petrol,Automatic,Mumbai,Grey,...,0.005442,0.13363,2.285714,91.870824,133.63029,2250.0,0.04125,0.137143,217.857143,Luxury_Brand
2,1702,Honda,City 1.5 V MT,269000,2010,73000,Petrol,Manual,Delhi,Silver,...,0.008,0.097528,2.285714,78.824315,97.52839,2250.0,0.0295,0.083429,152.057143,Premium_Brand
3,1703,Hyundai,Elite i20 Asta 1.2 [2016-2017],650000,2016,100000,Petrol,Manual,Hyderabad,Grey,...,0.01108,0.096074,1.5,68.504595,96.073517,2000.0,0.013667,0.02875,110.2,Economy_Brand
4,1704,Hyundai,Santro Sportz CNG [2018-2020],545000,2019,43000,CNG,Manual,Mumbai,Silver,...,0.015504,0.077348,1.222222,53.406998,77.348066,1000.0,0.010545,0.018667,80.622222,Economy_Brand


In [66]:
le = LabelEncoder()

# Replace every object column with a numeric code: the encoder's integer label,
# shifted by one and divided by ten (0.1, 0.2, 0.3, ...).
# NOTE(review): the encoder is re-fit on this frame alone for each column, so
# the codes depend on which labels occur here - confirm they line up with the
# encoding used for the training data.
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
    codes = le.fit_transform(df[col]).astype(float)
    df[col] = (codes + 1) / 10


In [67]:
# Remove the Color and Location columns from the output.
df = df.drop(columns=["Color", "Location"])

In [68]:
# Write the processed frame to Testset.csv, omitting the index.
df.to_csv("Testset.csv", index=False)