In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the car details CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/car_details.csv')

In [None]:
# Summary statistics of the loaded frame (pandas' default describe() output).
df.describe()

# Data Cleaning

In [None]:
# Count fully duplicated rows. The row-wise boolean mask is not evaluated on
# its own because only the last expression of a cell is displayed.
# NOTE(review): duplicates are only counted here; no drop_duplicates call is
# visible in this notebook - confirm that is intended.
df.duplicated().sum()

In [None]:
# Rows missing any of these columns are removed from df in place.
required_columns = [
    "Engine",
    "Max Power",
    "Max Torque",
    "Drivetrain",
    "Seating Capacity",
    "Fuel Tank Capacity",
]
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
import re

def extract_base_model(model_str):
    """Return the first two words of a model name, ignoring bracketed notes.

    Every ``[...]`` segment (such as ``[2014-2018]``) is removed before the
    name is split on whitespace.

    Parameters
    ----------
    model_str : str
        Raw model description.

    Returns
    -------
    str
        The first two words joined by a single space, the only word when
        there is just one, or an empty string when nothing is left after
        removing bracketed segments and whitespace.
    """
    without_brackets = re.sub(r'\[.*?\]', '', model_str)
    # split() with no argument already ignores leading/trailing whitespace,
    # and slicing an empty or one-word list is safe, so blank input yields ''
    # instead of raising IndexError.
    return ' '.join(without_brackets.split()[:2])

# Derive a title-cased "Base Model" column from the raw model description.
df['Base Model'] = df['Model'].apply(extract_base_model).str.title().str.strip()
# drop() returns a new frame, so the result has to be assigned back for the
# "Model" column to actually be removed.
df = df.drop(columns="Model")

# FEATURE CREATION

In [None]:
# Preview the "Base Model" column.
df["Base Model"]

In [None]:
# Age in years, measured against a fixed reference year of 2025.
reference_year = 2025
df['Car Age'] = reference_year - df['Year']

In [None]:
def age_segment(Age):
    """Classify a car's age in years into a resale segment.

    Bands are inclusive on both ends: 0-6 -> 'New', 7-11 -> 'Mid-Age',
    12-16 -> 'Old'. Any value that matches no band (negative, above 16,
    NaN, or a non-integer lying between two bands) yields
    'Not for Re-Sale'.
    """
    bands = (
        (0, 6, 'New'),
        (7, 11, 'Mid-Age'),
        (12, 16, 'Old'),
    )
    for lowest, highest, label in bands:
        if lowest <= Age <= highest:
            return label
    return 'Not for Re-Sale'

# Label every row with the segment that corresponds to its "Car Age".
df['Age Segment'] = df['Car Age'].map(age_segment)

In [None]:
def price_segment(price):
    """Classify a price into a market segment.

    Bands:
      * below 100,000                      -> 'Very Low'
      * 100,000 up to 5,000,000            -> 'Low to Mid Range'
      * above 5,000,000 up to 10,000,000   -> 'High Range'
      * above 10,000,000                   -> 'Luxury'

    The bands are contiguous, so a price strictly between 5,000,000 and
    5,100,000 is 'High Range' instead of falling through to 'Luxury'.

    A NaN price fails every comparison and therefore returns 'Luxury'.
    """
    if price < 100000:
        return 'Very Low'
    if price <= 5000000:
        return 'Low to Mid Range'
    if price <= 10000000:
        return 'High Range'
    return 'Luxury'

# Label every row with its price segment, then print how many rows fall
# into each segment.
df['Price Segment'] = df['Price'].map(price_segment)

segment_counts = df['Price Segment'].value_counts()
print(segment_counts)

In [None]:
# Row labels of the entries whose "Owner" value is 'UnRegistered Car'.
is_unregistered = df['Owner'] == 'UnRegistered Car'
df.loc[is_unregistered].index

In [None]:
# Frequency of each distinct "Owner" label.
df["Owner"].value_counts()

In [None]:
# Rename the "Make" column to "Company" in place.
column_renames = {"Make": "Company"}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Display the frame in its current state.
df

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
df.info()

In [None]:
# Collapse the "CNG + CNG" label to "CNG" and remove the spaces around "+"
# in the petrol/CNG label; every other value is left unchanged.
fuel_label_fixes = {
    'CNG + CNG': 'CNG',
    'Petrol + CNG': 'Petrol+CNG',
}
df['Fuel Type'] = df['Fuel Type'].replace(fuel_label_fixes)

In [None]:
# Frequency of each distinct "Fuel Type" label.
df['Fuel Type'].value_counts()

In [None]:
# Store the first whitespace-separated token of "Engine" as an integer
# "Engine CC" column, then discard the original column in place.
engine_first_token = df['Engine'].str.split().str[0]
df['Engine CC'] = engine_first_token.astype(int)
df.drop(columns='Engine', inplace=True)

In [None]:
# First number (integer or decimal) found in a free-text measurement.
number_pattern = r'(\d+\.?\d*)'
# Approximate kgm -> Nm conversion factor (the exact value is 9.80665).
kgm_to_nm = 9.8

# expand=False makes str.extract return a Series instead of a one-column
# DataFrame; values that cannot be parsed become NaN.
max_power = pd.to_numeric(
    df['Max Power'].str.extract(number_pattern, expand=False), errors='coerce'
)

torque = pd.to_numeric(
    df['Max Torque'].str.extract(number_pattern, expand=False), errors='coerce'
)
torque_in_kgm = df['Max Torque'].str.contains('kgm', case=False, na=False)
# Overwrite the text column directly so it keeps its position in the frame;
# values flagged as kgm are converted, all others are kept as extracted.
df['Max Torque'] = np.where(torque_in_kgm, torque * kgm_to_nm, torque)

# Replace the text "Max Power" column with its numeric version; the numeric
# column is appended at the end of the frame.
df = df.drop(columns=['Max Power'])
df['Max Power'] = max_power

In [None]:
# drop() returns a new frame, so the result has to be assigned back for the
# "Year" column to actually be removed.
df = df.drop(columns="Year")

# OUTLIER REMOVAL

In [None]:
# Distinct "Car Age" values currently present.
df["Car Age"].unique()

In [None]:
# Remove every row whose "Car Age" is exactly 37, mutating df in place.
outlier_age = 37
outlier_index = df.loc[df["Car Age"] == outlier_age].index
df.drop(index=outlier_index, inplace=True)

In [None]:
# Distinct "Car Age" values, to check the result of the row removal.
df["Car Age"].unique()

# ENCODING


In [None]:
# Number of missing values per column before encoding.
df.isna().sum()

In [None]:
# Integer codes for the categorical columns. Series.map turns every label
# that is missing from its mapping into NaN, so such labels are reported
# before encoding instead of disappearing silently.
owner_map = {
    'First': 1,
    'Second': 2,
    'Third': 3
}
trans_map = {
    'Manual': 1,
    'Automatic': 2
}

for column, mapping in (('Owner', owner_map), ('Transmission', trans_map)):
    labels = df[column]
    unmapped = labels[labels.notna() & ~labels.isin(list(mapping))].unique()
    if len(unmapped) > 0:
        print(f"Warning: {column} labels without a code become NaN: {sorted(map(str, unmapped))}")
    df[column] = labels.map(mapping)

# One-hot encode the fuel type; drop_first=True omits the indicator column of
# the first category.
fuel_dummies = pd.get_dummies(df['Fuel Type'], prefix='Fuel', drop_first=True, dtype=int)
df = pd.concat([df.drop('Fuel Type', axis=1), fuel_dummies], axis=1)

In [None]:
# Display the frame after encoding.
df

In [None]:
# Replace the index with a fresh default one, in place; drop=True discards
# the old index instead of keeping it as a column.
df.reset_index(drop=True, inplace=True)

In [None]:
# Persist the cleaned frame. With its default arguments to_csv also writes
# the index as an unnamed first column.
# NOTE(review): consider index=False if readers of this file do not expect
# that extra column - confirm against whatever consumes the CSV.
df.to_csv("data/cleaned_cars.csv")

In [None]:
# Column labels of the cleaned frame.
df.columns

# PRICE PREDICTION

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Features and Target
features = [
    'Kilometer', 'Fuel_Diesel', 'Fuel_Hybrid', 'Fuel_LPG', 'Fuel_Petrol', 'Fuel_Petrol+CNG',
    'Transmission', 'Owner', 'Seating Capacity', 'Engine CC', 'Max Power', 'Car Age', 'Fuel Tank Capacity'
]
X = df[features]
y = np.log1p(df['Price'])  # the model is trained on log1p(price)

# Train-test split: 20% held out, fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Final Random Forest model with best params
model = RandomForestRegressor(
    n_estimators=158,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42
)

# Fit model
model.fit(X_train, y_train)

# Predict and Evaluate
y_pred = model.predict(X_test)

# These four metrics are in log1p(price) units, because both y_test and
# y_pred are log-transformed. The MSE is computed once and reused for RMSE.
mse = mean_squared_error(y_test, y_pred)
print("MAE:", round(mean_absolute_error(y_test, y_pred), 3))
print("MSE:", round(mse, 3))
print("RMSE:", round(np.sqrt(mse), 3))
print("R² Score:", round(r2_score(y_test, y_pred), 3))

# expm1 is the inverse of log1p; undo the transform so the error is also
# reported in the original price units.
price_true = np.expm1(y_test)
price_pred = np.expm1(y_pred)
print("MAE (price):", round(mean_absolute_error(price_true, price_pred), 3))
print("RMSE (price):", round(np.sqrt(mean_squared_error(price_true, price_pred)), 3))

In [None]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before the fitted model is written.
model_path = 'model/car_price_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)