In [76]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [64]:
# Load the feature-engineered CSV into `df`.
FEATURES_CSV = "SBS_Processed_Datasets/03_feature_engineered_output.csv"
df = pd.read_csv(FEATURES_CSV)

In [65]:
# Summarise the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8198 entries, 0 to 8197
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                8198 non-null   object 
 1   transmission        8198 non-null   object 
 2   owner_no            8198 non-null   int64  
 3   brand               8198 non-null   object 
 4   model               8198 non-null   object 
 5   model_year          8198 non-null   int64  
 6   variant_name        8198 non-null   object 
 7   price               8198 non-null   float64
 8   registered_year     8198 non-null   int64  
 9   fuel_type           8198 non-null   object 
 10  kms_driven          8198 non-null   int64  
 11  engine_cc           8198 non-null   int64  
 12  mileage_kmpl        8198 non-null   float64
 13  car_age             8198 non-null   int64  
 14  mileage_normalized  8198 non-null   float64
 15  brand_popularity    8198 non-null   float64
 16  price_

In [66]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl,car_age,mileage_normalized,brand_popularity,price_per_km,age_group,high_mileage,multiple_owners
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000,998,23.1,9,2.566667,506751.98589,17316.017316,old,1,1
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706,1497,17.0,6,2.833333,732400.0,47705.882353,mid-age,0,1
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949,1199,23.84,6,3.973333,771862.830588,24538.590604,mid-age,1,0
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794,1197,19.1,10,1.91,607984.020283,24188.481675,old,0,0
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000,1248,23.65,9,2.627778,506751.98589,33403.805497,old,1,0


In [67]:
# Suggest an encoding method for every categorical column, based on how many
# distinct values the column holds.
categorical_dtypes = list(df.select_dtypes(include=['object']).columns)  # object-dtype columns only


def _suggest_encoding(cardinality):
    """Return the encoding name recommended for a column with `cardinality` distinct values.

    Up to 10 distinct values -> one-hot, up to 100 -> label, anything larger -> target.
    """
    if cardinality > 100:
        return "target-encoding"
    if cardinality > 10:
        return "label-encoding"
    return "one-hot-encoding"


# Distinct-value count per categorical column, in the same order as `categorical_dtypes`.
cardinalities = [df[name].nunique() for name in categorical_dtypes]

enc_recommendation = {
    "column_names": list(categorical_dtypes),
    "unique_values_count": cardinalities,
    "recommended_encoding": [_suggest_encoding(count) for count in cardinalities],
}

encoding_recommended_df = pd.DataFrame(enc_recommendation)

In [72]:
# Display the recommendations ordered by the suggested encoding method.
encoding_recommended_df.sort_values("recommended_encoding")

Unnamed: 0,column_names,unique_values_count,recommended_encoding
2,brand,33,label-encoding
0,city,6,one-hot-encoding
1,transmission,2,one-hot-encoding
5,fuel_type,5,one-hot-encoding
6,age_group,3,one-hot-encoding
3,model,312,target-encoding
4,variant_name,2100,target-encoding


ONE HOT ENCODING

In [73]:
# Low-cardinality columns to one-hot encode. `age_group` is left out on purpose:
# its categories are ordered, so it is handled in the label-encoding step instead.
one_hot_encoding_columns = ['city','transmission','fuel_type']

# The first run replaces each source column with `<column>_<level>` dummy columns,
# so blindly re-running this cell would raise KeyError. Encode only the columns
# that are still raw, and fail loudly for a column that is neither raw nor
# already encoded (e.g. a typo in the list above).
pending_one_hot_columns = [name for name in one_hot_encoding_columns if name in df.columns]
unknown_one_hot_columns = [
    name
    for name in one_hot_encoding_columns
    if name not in df.columns
    and not any(str(existing).startswith(f"{name}_") for existing in df.columns)
]
if unknown_one_hot_columns:
    raise KeyError(f"Columns not found for one-hot encoding: {unknown_one_hot_columns}")

if pending_one_hot_columns:
    # drop_first=True drops the first level of every encoded column.
    df = pd.get_dummies(df, columns=pending_one_hot_columns, drop_first=True)

In [75]:
# Inspect the age_group labels, which were kept out of the one-hot encoding step.
df["age_group"]

0           old
1       mid-age
2       mid-age
3           old
4           old
         ...   
8193        new
8194        old
8195        old
8196        old
8197    mid-age
Name: age_group, Length: 8198, dtype: object

LABEL ENCODING

In [77]:
# AGE: the groups are ordered (new < mid-age < old), so encode them with an
# explicit mapping that preserves that order.
age_group_transform = {
    "new": 0,
    "mid-age": 1,
    "old": 2
}

# Skip when the column is already numeric: mapping the integer codes through the
# dictionary a second time would silently turn every value into NaN on a re-run.
if not pd.api.types.is_numeric_dtype(df["age_group"]):
    unmapped_age_groups = set(df["age_group"].dropna().unique()) - set(age_group_transform)
    if unmapped_age_groups:
        # Series.map would convert unknown labels to NaN without any warning.
        raise ValueError(
            f"Unmapped age_group values: {sorted(map(str, unmapped_age_groups))}"
        )
    df["age_group"] = df["age_group"].map(age_group_transform)

# BRAND: LabelEncoder (imported from sklearn.preprocessing) assigns integer codes
# following the sorted order of the brand labels.
# Fit only while the column still holds the raw labels, so a re-run neither
# re-encodes the integer codes nor replaces the fitted encoder, whose `classes_`
# are read in the next cell to recover the brand <-> code mapping.
if not pd.api.types.is_numeric_dtype(df["brand"]):
    label_encoder = LabelEncoder()
    df["brand"] = label_encoder.fit_transform(df["brand"])

In [None]:
# Show which integer code the fitted encoder assigned to each brand, for later reference.
dict(enumerate(label_encoder.classes_))

{0: 'audi',
 1: 'bmw',
 2: 'chevrolet',
 3: 'citroen',
 4: 'datsun',
 5: 'fiat',
 6: 'ford',
 7: 'hindustan motors',
 8: 'honda',
 9: 'hyundai',
 10: 'isuzu',
 11: 'jaguar',
 12: 'jeep',
 13: 'kia',
 14: 'land rover',
 15: 'lexus',
 16: 'mahindra',
 17: 'mahindra renault',
 18: 'mahindra ssangyong',
 19: 'maruti',
 20: 'mercedes-benz',
 21: 'mg',
 22: 'mini',
 23: 'mitsubishi',
 24: 'nissan',
 25: 'opel',
 26: 'porsche',
 27: 'renault',
 28: 'skoda',
 29: 'tata',
 30: 'toyota',
 31: 'volkswagen',
 32: 'volvo'}

In [80]:
# Preview the frame after the one-hot and label encoding steps.
df.head()

Unnamed: 0,owner_no,brand,model,model_year,variant_name,price,registered_year,kms_driven,engine_cc,mileage_kmpl,...,city_chennai,city_delhi,city_hyderabad,city_jaipur,city_kolkata,transmission_manual,fuel_type_diesel,fuel_type_electric,fuel_type_lpg,fuel_type_petrol
0,3,19,maruti celerio,2015,vxi,400000.0,2015,120000,998,23.1,...,False,False,False,False,False,True,False,False,False,True
1,2,6,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,32706,1497,17.0,...,False,False,False,False,False,True,False,False,False,True
2,1,29,tata tiago,2018,1.2 revotron xz,585000.0,2018,11949,1199,23.84,...,False,False,False,False,False,True,False,False,False,True
3,1,9,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,17794,1197,19.1,...,False,False,False,False,False,True,False,False,False,True
4,1,19,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,60000,1248,23.65,...,False,False,False,False,False,True,True,False,False,False
