In [36]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import joblib
import json

In [37]:
# Load the feature-engineered dataset (the "03" stage output) from CSV into `df`
df = pd.read_csv(r"SBS_Processed_Datasets/03_feature_engineered_output.csv")

In [38]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7327 entries, 0 to 7326
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   city                  7327 non-null   object 
 1   transmission          7327 non-null   object 
 2   owner_no              7327 non-null   int64  
 3   brand                 7327 non-null   object 
 4   model                 7327 non-null   object 
 5   model_year            7327 non-null   int64  
 6   variant_name          7327 non-null   object 
 7   price                 7327 non-null   float64
 8   registered_year       7327 non-null   int64  
 9   fuel_type             7327 non-null   object 
 10  kms_driven            7327 non-null   float64
 11  mileage_kmpl          7327 non-null   float64
 12  engine_cc             7327 non-null   int64  
 13  car_age               7327 non-null   int64  
 14  model_age             7327 non-null   int64  
 15  registration_lag     

In [39]:
# Summary statistics for the numeric columns
df.describe()


Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
count,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0,7327.0
mean,1.38092,2016.213321,626085.0,2016.279924,60405.79,19.595483,1341.502525,7.720076,7.786679,0.066603,0.849895,0.550435,0.007643,0.305992,626085.0,7227.724674
std,0.647962,3.887329,356122.0,3.898677,44386.06,3.484031,381.741087,3.898677,3.887329,0.32325,0.085593,0.097866,0.087095,0.460858,208520.2,5903.800914
min,1.0,1985.0,28000.0,1985.0,101.0,7.08,624.0,1.0,1.0,0.0,0.105263,0.198876,0.0,0.0,73333.33,50.5
25%,1.0,2014.0,372000.0,2014.0,33431.0,17.38,1197.0,5.0,5.0,0.0,0.833333,0.488202,0.0,0.0,503886.0,4566.25
50%,1.0,2017.0,551000.0,2017.0,56672.0,19.16,1199.0,7.0,7.0,0.0,0.875,0.538202,0.0,0.0,575329.5,6581.333333
75%,2.0,2019.0,800000.0,2019.0,80000.0,21.74,1497.0,10.0,10.0,0.0,0.909091,0.610674,0.0,1.0,621019.8,9095.306818
max,5.0,2023.0,1775000.0,2023.0,2000022.0,35.6,5000.0,39.0,39.0,16.0,0.975,1.0,1.0,1.0,1725000.0,250002.75


In [40]:
# Preview the first two rows before any encoding
df.head(2)

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,kms_bins,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000.0,23.1,998,9,9,0,0.9,High,0.648876,0,1,503886.023982,12000.0
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706.0,17.0,1497,6,6,0,0.857143,Low,0.477528,0,1,538652.329749,4672.285714


In [41]:
# Suggest an encoding scheme for every object-dtype column, based on its cardinality
categorical_dtypes = df.select_dtypes(include=['object']).columns.to_list()


def _recommend_encoding(unique_count):
    """Return the encoding suggested for a column with `unique_count` distinct values.

    Up to 10 distinct values -> one-hot, up to 100 -> label, anything larger -> target.
    """
    if unique_count > 100:
        return "target-encoding"
    if unique_count > 10:
        return "label-encoding"
    return "one-hot-encoding"


# One cardinality per categorical column, in the same order as `categorical_dtypes`
unique_counts = [df[column].nunique() for column in categorical_dtypes]

enc_recommendation = {
    "column_names": list(categorical_dtypes),
    "unique_values_count": unique_counts,
    "recommended_encoding": [_recommend_encoding(count) for count in unique_counts],
}

encoding_recommended_df = pd.DataFrame(enc_recommendation)

In [42]:
# Show the recommendations ordered by encoding type, so columns sharing a scheme sit together
encoding_recommended_df.sort_values(by=['recommended_encoding'])

Unnamed: 0,column_names,unique_values_count,recommended_encoding
2,brand,32,label-encoding
0,city,6,one-hot-encoding
1,transmission,2,one-hot-encoding
5,fuel_type,4,one-hot-encoding
6,kms_bins,5,one-hot-encoding
3,model,234,target-encoding
4,variant_name,1739,target-encoding


ONE HOT ENCODING

In [43]:
# List the columns available before one-hot encoding
df.columns

Index(['city', 'transmission', 'owner_no', 'brand', 'model', 'model_year',
       'variant_name', 'price', 'registered_year', 'fuel_type', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'kms_bins', 'mileage_normalized',
       'high_mileage', 'multiple_owners', 'brand_popularity', 'kms_per_year'],
      dtype='object')

In [44]:
# One-hot encode the low-cardinality columns: 'city', 'transmission', 'fuel_type' and 'kms_bins'.
# Keys are the source columns, values are the prefixes used for the generated dummy columns.
one_hot_prefixes = {
    'city': 'city',
    'transmission': 'transmission',
    'fuel_type': 'fuel_type',
    'kms_bins': 'kms',
}
df = pd.get_dummies(
    df,
    columns=list(one_hot_prefixes.keys()),
    prefix=list(one_hot_prefixes.values()),
    drop_first=False,  # keep every category as its own column
)

LABEL ENCODING

In [45]:
# BRAND ENCODING
# Fit a LabelEncoder on the brand names and store the resulting integer codes in `brand_encoded`.
# `LabelEncoder` and `joblib` come from the imports cell at the top of the notebook.
label_encoder_brand = LabelEncoder()
df["brand_encoded"] = label_encoder_brand.fit_transform(df["brand"])

# Replace the raw text column with its encoded counterpart (reassign instead of mutating in place)
df = df.drop(columns=['brand'])

# Persist the fitted encoder so the same brand -> code mapping can be reloaded from disk
joblib.dump(label_encoder_brand, 'PKL_Files/brand.pkl')
print("LabelEncoder for brand saved successfully!")


LabelEncoder for brand saved successfully!


In [46]:
# Peek at the first few integer codes assigned to the brands
df['brand_encoded'].head()

0    18
1     6
2    28
3     9
4    18
Name: brand_encoded, dtype: int64

TARGET ENCODING

In [47]:
# Target (mean) encoding for the high-cardinality columns `variant_name` and `model`:
# each category is replaced by the mean `price` of the rows that share it.
# NOTE(review): the means are computed over every row currently in `df`, so each row's own
# price contributes to its encoded value; if a train/test split happens downstream this
# leaks the target into the features — confirm.
variant_target_mean = df.groupby('variant_name')['price'].mean()
model_target_mean = df.groupby('model')['price'].mean()

# Append the encoded columns, then discard the raw text columns they replace
df = df.assign(
    variant_name_encoded=df['variant_name'].map(variant_target_mean),
    model_encoded=df['model'].map(model_target_mean),
).drop(columns=['variant_name', 'model'])

# Keep the category -> mean-price lookups as plain dicts and write them to disk
variant_name_mapping = variant_target_mean.to_dict()
model_mapping = model_target_mean.to_dict()

joblib.dump(variant_name_mapping, 'PKL_Files/variant_name_mapping.pkl')
joblib.dump(model_mapping, 'PKL_Files/model_mapping.pkl')

['PKL_Files/model_mapping.pkl']

In [48]:
# Preview the two target-encoded columns (mean price per variant / per model)
df[['variant_name_encoded', 'model_encoded']].head()

Unnamed: 0,variant_name_encoded,model_encoded
0,424566.346883,440245.033113
1,787214.285714,683109.090909
2,470370.37037,521147.368421
3,433500.0,432471.698113
4,797000.0,837482.758621


In [49]:
# List the columns after all three encoding steps
df.columns

Index(['owner_no', 'model_year', 'price', 'registered_year', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'mileage_normalized', 'high_mileage',
       'multiple_owners', 'brand_popularity', 'kms_per_year', 'city_bangalore',
       'city_chennai', 'city_delhi', 'city_hyderabad', 'city_jaipur',
       'city_kolkata', 'transmission_automatic', 'transmission_manual',
       'fuel_type_cng', 'fuel_type_diesel', 'fuel_type_lpg',
       'fuel_type_petrol', 'kms_High', 'kms_Low', 'kms_Moderate', 'kms_Unused',
       'kms_Very_Low', 'brand_encoded', 'variant_name_encoded',
       'model_encoded'],
      dtype='object')

FEATURE SCALING

In [50]:
# Preview the first two rows of the fully encoded frame before scaling
df.head(2)

Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year,city_bangalore,city_chennai,city_delhi,city_hyderabad,city_jaipur,city_kolkata,transmission_automatic,transmission_manual,fuel_type_cng,fuel_type_diesel,fuel_type_lpg,fuel_type_petrol,kms_High,kms_Low,kms_Moderate,kms_Unused,kms_Very_Low,brand_encoded,variant_name_encoded,model_encoded
0,3,2015,400000.0,2015,120000.0,23.1,998,9,9,0,0.9,0.648876,0,1,503886.023982,12000.0,True,False,False,False,False,False,False,True,False,False,False,True,True,False,False,False,False,18,424566.346883,440245.033113
1,2,2018,811000.0,2018,32706.0,17.0,1497,6,6,0,0.857143,0.477528,0,1,538652.329749,4672.285714,True,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,6,787214.285714,683109.090909


In [51]:

# Print the column list of the encoded frame before choosing which columns to scale
print(df.columns)      # Check the new column names

Index(['owner_no', 'model_year', 'price', 'registered_year', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'mileage_normalized', 'high_mileage',
       'multiple_owners', 'brand_popularity', 'kms_per_year', 'city_bangalore',
       'city_chennai', 'city_delhi', 'city_hyderabad', 'city_jaipur',
       'city_kolkata', 'transmission_automatic', 'transmission_manual',
       'fuel_type_cng', 'fuel_type_diesel', 'fuel_type_lpg',
       'fuel_type_petrol', 'kms_High', 'kms_Low', 'kms_Moderate', 'kms_Unused',
       'kms_Very_Low', 'brand_encoded', 'variant_name_encoded',
       'model_encoded'],
      dtype='object')


In [53]:
# Numeric features to standardise. `price`, the year columns, the binary / one-hot flags
# and the *_encoded columns are not in this list and therefore stay on their original scale.
columns_to_scale = [
    'owner_no',
    'kms_driven',
    'mileage_kmpl',
    'engine_cc',
    'car_age',
    'model_age',
    'registration_lag',
    'normalized_model_age',
    'mileage_normalized',
    'brand_popularity',
    'kms_per_year',
]

# Fit the scaler on the original values and write the standardised values into a copy,
# so that `df` itself remains unscaled.
# NOTE(review): the fitted `scaler` is not written to disk in this cell, unlike the encoders
# above — confirm it is persisted elsewhere if new data must be scaled the same way.
scaler = StandardScaler().fit(df[columns_to_scale])
df_scaled = df.copy()
df_scaled[columns_to_scale] = scaler.transform(df[columns_to_scale])

# Sanity check: every scaled column should report mean ~0 and std ~1
print(df_scaled[columns_to_scale].describe())

# Write both versions to disk: the standardised frame and the untouched one
df_scaled.to_csv(r'SBS_Processed_Datasets/04_1_scaled_dataframe.csv', index=False)
df.to_csv(r'SBS_Processed_Datasets/04_2_Non_scaled_dataframe.csv', index=False)


           owner_no    kms_driven  mileage_kmpl     engine_cc       car_age  \
count  7.327000e+03  7.327000e+03  7.327000e+03  7.327000e+03  7.327000e+03   
mean   1.299478e-16 -7.758075e-17 -7.215010e-16  2.405003e-16 -2.521375e-17   
std    1.000068e+00  1.000068e+00  1.000068e+00  1.000068e+00  1.000068e+00   
min   -5.879142e-01 -1.358735e+00 -3.592488e+00 -1.879681e+00 -1.723799e+00   
25%   -5.879142e-01 -6.077726e-01 -6.359400e-01 -3.785613e-01 -6.977398e-01   
50%   -5.879142e-01 -8.412652e-02 -1.250026e-01 -3.733217e-01 -1.847102e-01   
75%    9.554923e-01  4.414798e-01  6.155695e-01  4.073653e-01  5.848341e-01   
max    5.585712e+00  4.370175e+01  4.593992e+00  9.584368e+00  8.023762e+00   

          model_age  registration_lag  normalized_model_age  \
count  7.327000e+03      7.327000e+03          7.327000e+03   
mean  -3.879038e-17     -5.721581e-17         -4.674240e-16   
std    1.000068e+00      1.000068e+00          1.000068e+00   
min   -1.745966e+00     -2.060558e-0

In [55]:
# Column list of the unscaled `df` (scaling was applied to the separate `df_scaled` copy)
df.columns 

Index(['owner_no', 'model_year', 'price', 'registered_year', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'mileage_normalized', 'high_mileage',
       'multiple_owners', 'brand_popularity', 'kms_per_year', 'city_bangalore',
       'city_chennai', 'city_delhi', 'city_hyderabad', 'city_jaipur',
       'city_kolkata', 'transmission_automatic', 'transmission_manual',
       'fuel_type_cng', 'fuel_type_diesel', 'fuel_type_lpg',
       'fuel_type_petrol', 'kms_High', 'kms_Low', 'kms_Moderate', 'kms_Unused',
       'kms_Very_Low', 'brand_encoded', 'variant_name_encoded',
       'model_encoded'],
      dtype='object')