In [26]:
import pandas as pd

In [27]:
df = pd.read_csv(r"SBS_Processed_Datasets/01_structured_output.csv")

In [28]:
df.head(2)

Unnamed: 0,city,transmission,ownerNo,oem,model,modelYear,variantName,price,Registration Year,Fuel Type,Kms Driven,Engine Displacement,Mileage
0,Bangalore,Manual,3,Maruti,Maruti Celerio,2015,VXI,₹ 4 Lakh,2015,Petrol,"1,20,000 Kms",998 cc,23.1 kmpl
1,Bangalore,Manual,2,Ford,Ford Ecosport,2018,1.5 Petrol Titanium BSIV,₹ 8.11 Lakh,Feb 2018,Petrol,"32,706 Kms",1497 cc,17 kmpl


In [29]:
# Rename_columns
df.rename(columns={"ownerNo": "owner_no",
                   "oem": "brand",
                   "modelYear": "model_year",
                   "variantName": "variant_name",
                   "Registration Year": "registered_year",
                   "Fuel Type": "fuel_type",
                   "Kms Driven": "kms_driven",
                   "Engine Displacement": "engine_cc",
                   "Transmission": "transmission",
                   "Year of Manufacture": "manufacture_year",
                   "Mileage": "mileage_kmpl"},inplace=True)

In [30]:
# Strip_spaces_in_column_names
df.columns = df.columns.str.strip()

In [31]:
# Strip all white/trailing spaces & if string convert it to lowercase for consistency
df = df.apply(lambda col: col.str.strip().str.lower() if col.dtype == 'object' else col,axis=0)

In [32]:
df.isnull().sum()

city                 0
transmission         0
owner_no             0
brand                0
model                0
model_year           0
variant_name         0
price                0
registered_year    104
fuel_type            0
kms_driven           4
engine_cc            8
mileage_kmpl       574
dtype: int64

In [33]:
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,₹ 4 lakh,2015,petrol,"1,20,000 kms",998 cc,23.1 kmpl
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,₹ 8.11 lakh,feb 2018,petrol,"32,706 kms",1497 cc,17 kmpl
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,₹ 5.85 lakh,sept 2018,petrol,"11,949 kms",1199 cc,23.84 kmpl
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,₹ 4.62 lakh,dec 2014,petrol,"17,794 kms",1197 cc,19.1 kmpl
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,₹ 7.90 lakh,2015,diesel,"60,000 kms",1248 cc,23.65 kmpl


In [34]:
def convert_price_to_numerical_value(x):
    price = x.replace("₹","").replace(",","").lower().strip()

    if "thousand" in price or "thousands" in price:
        return float(price.replace("thousand" , "").replace("thousands", "").strip()) * 1_000
    elif "lakh" in price or "lakhs" in price:
        return float(price.replace("lakh","").replace("lakhs","").strip()) * 10_0000
    elif "crore" in price or "crores" in price:
        return float(price.replace("crore","").replace("crores","").strip()) * 10_00_000
    else:
        return float(price)

In [35]:
# remove 'kms' and make it numerical standard
df["kms_driven"] = df["kms_driven"].replace({"kms":"" , ",":""},regex=True).str.strip()

# engine displacement column remove unwanted strings
df["engine_cc"] = df["engine_cc"].str.replace("cc","").str.strip().astype(float)

# remove "kmph" in mileage_kmph column
df["mileage_kmpl"] = df["mileage_kmpl"].replace({"kmpl":"" , "km/kg": ""}, regex=True).str.strip().astype(float)

# remove unwanted strings and keep only year
df["registered_year"] = df["registered_year"].str.extract(r'\b(\d{4})\b').astype(float) #float will handle Nan value

# convert price colums values to its numerical format
df["price"] = df["price"].apply(lambda x: convert_price_to_numerical_value(x))

In [36]:
df[(df["kms_driven"].isnull()) | (df["engine_cc"].isnull())] # try to fill those values using grouping and mean

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl
487,bangalore,manual,3,maruti,maruti gypsy,1995,mg410w st,400000.0,,petrol,10000.0,,
490,bangalore,manual,3,maruti,maruti gypsy,1995,mg410w ht,400000.0,,petrol,10000.0,,
574,bangalore,automatic,1,mahindra,mahindra e2o plus,2017,p6,550000.0,2017.0,electric,20000.0,,110.0
5510,hyderabad,manual,1,mercedes-benz,mercedes-benz c-class,2002,180 elegance,225000.0,2002.0,petrol,135000.0,,
5683,hyderabad,manual,3,tata,tata indigo,2007,lx,20161.0,2007.0,diesel,,1405.0,17.0
6430,jaipur,manual,1,maruti,maruti wagon r,2015,vxi bs iv,350000.0,2015.0,petrol,,998.0,20.51
8856,bangalore,manual,3,maruti,maruti gypsy,1995,mg410w st,400000.0,,petrol,10000.0,,
8859,bangalore,manual,3,maruti,maruti gypsy,1995,mg410w ht,400000.0,,petrol,10000.0,,
8943,bangalore,automatic,1,mahindra,mahindra e2o plus,2017,p6,550000.0,2017.0,electric,20000.0,,110.0
13879,hyderabad,manual,1,mercedes-benz,mercedes-benz c-class,2002,180 elegance,225000.0,2002.0,petrol,135000.0,,


In [37]:
# try filling Nan engine_cc values
df["engine_cc"] = df.groupby(['brand','model','model_year','fuel_type'])['engine_cc'].transform(lambda x: x.fillna(x.mean())) # only one value filled so dropping left Nan rows
# drop rows where kms driven is Nan
df.dropna(subset=["kms_driven","engine_cc"],inplace=True)

# drop rows where registered_year are null
df.dropna(subset=["registered_year"],inplace=True) # dropped cos it skew data when compared to age and kms driven

In [38]:
#drop duplicate records
df.drop_duplicates(inplace=True)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8198 entries, 0 to 8368
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   city             8198 non-null   object 
 1   transmission     8198 non-null   object 
 2   owner_no         8198 non-null   int64  
 3   brand            8198 non-null   object 
 4   model            8198 non-null   object 
 5   model_year       8198 non-null   int64  
 6   variant_name     8198 non-null   object 
 7   price            8198 non-null   float64
 8   registered_year  8198 non-null   float64
 9   fuel_type        8198 non-null   object 
 10  kms_driven       8198 non-null   object 
 11  engine_cc        8198 non-null   float64
 12  mileage_kmpl     7924 non-null   float64
dtypes: float64(4), int64(2), object(7)
memory usage: 896.7+ KB


In [40]:
df.isnull().sum()

city                 0
transmission         0
owner_no             0
brand                0
model                0
model_year           0
variant_name         0
price                0
registered_year      0
fuel_type            0
kms_driven           0
engine_cc            0
mileage_kmpl       274
dtype: int64

In [41]:
# convert to numeric datatypes
df["engine_cc"] = pd.to_numeric(df["engine_cc"],errors="coerce").fillna(0).astype(int)
df["registered_year"] = df["registered_year"].fillna(0).astype(int)
df["kms_driven"] = pd.to_numeric(df["kms_driven"],errors="coerce").fillna(0).astype(int)
df['mileage_kmpl'] = pd.to_numeric(df['mileage_kmpl'], errors='coerce').fillna(0).astype(float)

# fill mileage_kmpl using imputation
df['mileage_kmpl'] = df.groupby(['brand','model','model_year','engine_cc','fuel_type'])['mileage_kmpl'].transform(lambda x: x.fillna(x.mean()))

In [42]:
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000,998,23.1
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706,1497,17.0
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949,1199,23.84
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794,1197,19.1
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000,1248,23.65


In [43]:
df.isnull().sum()

city               0
transmission       0
owner_no           0
brand              0
model              0
model_year         0
variant_name       0
price              0
registered_year    0
fuel_type          0
kms_driven         0
engine_cc          0
mileage_kmpl       0
dtype: int64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8198 entries, 0 to 8368
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   city             8198 non-null   object 
 1   transmission     8198 non-null   object 
 2   owner_no         8198 non-null   int64  
 3   brand            8198 non-null   object 
 4   model            8198 non-null   object 
 5   model_year       8198 non-null   int64  
 6   variant_name     8198 non-null   object 
 7   price            8198 non-null   float64
 8   registered_year  8198 non-null   int64  
 9   fuel_type        8198 non-null   object 
 10  kms_driven       8198 non-null   int64  
 11  engine_cc        8198 non-null   int64  
 12  mileage_kmpl     8198 non-null   float64
dtypes: float64(2), int64(5), object(6)
memory usage: 896.7+ KB


In [45]:
df.describe()

Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,engine_cc,mileage_kmpl
count,8198.0,8198.0,8198.0,8198.0,8198.0,8198.0,8198.0
mean,1.359966,2016.563796,920665.6,2016.630886,58982.85,1424.701879,18.63037
std,0.629937,3.786773,1060997.0,3.794191,74620.78,475.92015,5.269102
min,1.0,2002.0,28000.0,2002.0,101.0,0.0,0.0
25%,1.0,2014.0,400000.0,2014.0,30016.75,1197.0,16.82
50%,1.0,2017.0,606000.0,2017.0,53696.0,1248.0,18.9
75%,2.0,2019.0,950000.0,2019.0,80000.0,1498.0,21.4
max,5.0,2023.0,9600000.0,2023.0,5500000.0,5000.0,140.0


In [46]:
df["variant_name"].nunique()

2100

In [47]:
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000,998,23.1
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706,1497,17.0
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949,1199,23.84
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794,1197,19.1
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000,1248,23.65


In [48]:
# Export it
df.to_csv(r"SBS_Processed_Datasets/02_cleaned_output.csv", index=False)