In [53]:
import pandas as pd
from datetime import datetime

In [54]:
# Load the cleaned dataset (relative path, resolved against the working directory).
df = pd.read_csv(
    filepath_or_buffer=r"SBS_Processed_Datasets/02_cleaned_output.csv",
)

In [55]:
# Inspect column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7327 entries, 0 to 7326
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   city             7327 non-null   object 
 1   transmission     7327 non-null   object 
 2   owner_no         7327 non-null   int64  
 3   brand            7327 non-null   object 
 4   model            7327 non-null   object 
 5   model_year       7327 non-null   int64  
 6   variant_name     7327 non-null   object 
 7   price            7327 non-null   float64
 8   registered_year  7327 non-null   int64  
 9   fuel_type        7327 non-null   object 
 10  kms_driven       7327 non-null   float64
 11  mileage_kmpl     7327 non-null   float64
 12  engine_cc        7327 non-null   int64  
dtypes: float64(3), int64(4), object(6)
memory usage: 744.3+ KB


In [56]:
# Preview the first rows of the loaded frame (DataFrame.head default row count).
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,mileage_kmpl,engine_cc
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000.0,23.1,998
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706.0,17.0,1497
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949.0,23.84,1199
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794.0,19.1,1197
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000.0,23.65,1248


FEATURE ENGINEERING 

In [57]:
# Step 1: age-related features.
#
# NOTE(review): the reference year is read from the system clock, so every
# column derived from it changes with the date the notebook is executed.
current_year = datetime.now().year

# Years elapsed since registration and since the model year.
df['car_age'] = df['registered_year'].rsub(current_year)
df['model_age'] = df['model_year'].rsub(current_year)

# Gap (in years) between the model year and the registration year.
df['registration_lag'] = df['registered_year'].sub(df['model_year'])

# Ratio of car_age to (model_age + 1); the +1 keeps the denominator non-zero
# when model_age is 0.
df['normalized_model_age'] = df['car_age'].div(df['model_age'].add(1))


In [58]:
# Step 2: price per kilometre driven.
#
# Division by a zero kms_driven yields +/-inf (or NaN for 0/0); both cases
# are mapped to 0 so the column stays finite.
# NOTE(review): this feature is computed from `price` — confirm it is not
# used as a predictor if `price` is the modelling target.
df['price_per_km'] = (
    df['price']
    .div(df['kms_driven'])
    .replace([float('inf'), float('-inf')], 0)
    .fillna(0)
)

In [59]:
# Step 3: mileage scaled by the column maximum, so the largest value maps to 1.
df['mileage_normalized'] = df['mileage_kmpl'].div(df['mileage_kmpl'].max())

In [60]:
# Step 4: binary flag for heavily driven cars.
#
# Despite the name, the flag is based on distance driven (kms_driven), not on
# fuel economy: 1 when kms_driven is strictly above the threshold, else 0.
high_mileage_threshold = 150_000
df['high_mileage'] = df['kms_driven'].gt(high_mileage_threshold).astype(int)

In [61]:
# Step 5: binary flag — 1 when owner_no is greater than 1, else 0.
df['multiple_owners'] = df['owner_no'].gt(1).astype(int)


In [62]:
# Step 6: brand-level price feature.
#
# Despite its name, `brand_popularity` holds the mean `price` of each brand,
# not a count of listings.
# NOTE(review): the mean is computed from `price` over the full frame —
# confirm this is acceptable if `price` is the modelling target.
brand_avg_price = df['price'].groupby(df['brand']).mean().to_dict()

# Broadcast each brand's mean price back onto its rows.
df['brand_popularity'] = df['brand'].map(brand_avg_price)

# Quick look at the new column next to the brand it was derived from.
print(df.loc[:, ['brand', 'brand_popularity']].head())


     brand  brand_popularity
0   maruti     503886.023982
1     ford     538652.329749
2     tata     617492.501340
3  hyundai     575329.531686
4   maruti     503886.023982


In [63]:
# Step 7: average distance per year; car_age is offset by 1 so the divisor
# is non-zero when car_age is 0.
df['kms_per_year'] = df['kms_driven'].div(df['car_age'].add(1))


In [64]:
# Preview the first two rows to check the newly added feature columns.
df.head(2)

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,...,car_age,model_age,registration_lag,normalized_model_age,price_per_km,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,...,9,9,0,0.9,3.333333,0.648876,0,1,503886.023982,12000.0
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,...,6,6,0,0.857143,24.796673,0.477528,0,1,538652.329749,4672.285714


In [65]:
# Summary statistics of the numeric columns, including the engineered features.
df.describe()

Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,price_per_km,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
count,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0
mean,1.38092,2016.213321,626085.0,2016.279924,60405.79,19.595483,1341.502525,7.720076,7.786679,0.066603,0.849895,24.173851,0.550435,0.007643,0.305992,626085.0,7227.724674
std,0.647962,3.887329,356122.0,3.898677,44386.06,3.484031,381.741087,3.898677,3.887329,0.32325,0.085593,165.732636,0.097866,0.087095,0.460858,208520.2,5903.800914
min,1.0,1985.0,28000.0,1985.0,101.0,7.08,624.0,1.0,1.0,0.0,0.105263,0.248711,0.198876,0.0,0.0,73333.33,50.5
25%,1.0,2014.0,372000.0,2014.0,33431.0,17.38,1197.0,5.0,5.0,0.0,0.833333,5.416667,0.488202,0.0,0.0,503886.0,4566.25
50%,1.0,2017.0,551000.0,2017.0,56672.0,19.16,1199.0,7.0,7.0,0.0,0.875,10.045467,0.538202,0.0,0.0,575329.5,6581.333333
75%,2.0,2019.0,800000.0,2019.0,80000.0,21.74,1497.0,10.0,10.0,0.0,0.909091,20.643265,0.610674,0.0,1.0,621019.8,9095.306818
max,5.0,2023.0,1775000.0,2023.0,2000022.0,35.6,5000.0,39.0,39.0,16.0,0.975,13069.306931,1.0,1.0,1.0,1725000.0,250002.75


In [66]:
# Preview the first two rows again.
# NOTE(review): the frame has not been modified since the previous
# df.head(2) preview, so this cell repeats it — candidate for removal.
df.head(2)

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,...,car_age,model_age,registration_lag,normalized_model_age,price_per_km,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,...,9,9,0,0.9,3.333333,0.648876,0,1,503886.023982,12000.0
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,...,6,6,0,0.857143,24.796673,0.477528,0,1,538652.329749,4672.285714


In [67]:
# List all column names to confirm the engineered features are present before export.
df.columns

Index(['city', 'transmission', 'owner_no', 'brand', 'model', 'model_year',
       'variant_name', 'price', 'registered_year', 'fuel_type', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'price_per_km', 'mileage_normalized',
       'high_mileage', 'multiple_owners', 'brand_popularity', 'kms_per_year'],
      dtype='object')

In [68]:
# Persist the feature-engineered frame to CSV without writing the index column.
#
# The path uses a forward slash (as the read_csv call does) rather than a
# backslash: a backslash is only a directory separator on Windows, and on
# POSIX systems it would become part of the file name, so the file would be
# written to the working directory instead of SBS_Processed_Datasets/.
df.to_csv("SBS_Processed_Datasets/03_feature_engineered_output.csv", index=False)