In [142]:
# Importing required libraries for data handling, preprocessing, and modelingimport kagglehub
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder , OneHotEncoder , StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np

In [118]:
# Uncomment these lines to download the dataset from Kaggle.
# path = kagglehub.dataset_download("milanvaddoriya/old-car-price-prediction")
# print(f"path to file is : "+path)

In [120]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv("car_price.csv")

In [122]:
# Function to convert price from 'Lakh' or 'Crore' format to numerical rupees
def convert_price_to_rupiah(price_str):
    price_str = price_str.strip()
    if 'Lakh' in price_str:
        return float(price_str.replace(' Lakh', '')) * 1e5
    elif 'Crore' in price_str:
        return float(price_str.replace(' Crore', '')) * 1e7
    else:
        return None  # یا 0 یا np.nan، بسته به انتخابت

# Function to convert kilometers string to integer
def convert_kms(kms_str):
    return int(kms_str.replace(',', '').replace(' kms', '').strip())
    
# Function to convert engine size string to integer (cc)
def convert_engine(engine_str):
    return int(engine_str.replace(' cc', '').strip())

In [124]:
# Apply data cleaning and extract useful features from raw columns
df['car_prices_in_rupee'] = df['car_prices_in_rupee'].apply(convert_price_to_rupiah)
df['kms_driven'] = df['kms_driven'].apply(convert_kms)
df['engine'] = df['engine'].apply(convert_engine)

# Extract brand, model, and variant information from car name
df['brand'] = df['car_name'].apply(lambda x : x.split()[0])
df['model'] = df['car_name'].apply(lambda x : x.split()[1])
df['Variant'] = df['car_name'].apply(lambda x : x.split()[2] if  x.split()[2] else  None)

# Convert seat column to integer (extract first word)
df['Seats'] = df['Seats'].apply(lambda x: int(x.split()[0]) if x.split() else None)


In [126]:
# Encode the 'ownership' categorical column into numeric format using OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
df['ownership_encoded'] = ordinal_encoder.fit_transform(df[['ownership']])

In [128]:
# One-hot encode selected categorical features and add the encoded columns to the main DataFrame
hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
hot_encoded = hot_encoder.fit_transform(df[['fuel_type', 'transmission' , 'brand', 'model', 'Variant']])
hot_encoded_df = pd.DataFrame(hot_encoded, columns=hot_encoder.get_feature_names_out(['fuel_type', 'transmission' , 'brand', 'model', 'Variant']))
df = pd.concat([df, hot_encoded_df], axis=1)

In [130]:
# Normalize numerical features using StandardScaler for better model performance
scaler = StandardScaler()
df[['car_prices_in_rupee', 'kms_driven', 'engine','manufacture','Seats']] = scaler.fit_transform(df[['car_prices_in_rupee', 'kms_driven', 'engine','manufacture','Seats']])

In [132]:
# Handle missing values by imputing the most frequent value in each column
column_names = df.columns
imputer = SimpleImputer(strategy='most_frequent')
df = imputer.fit_transform(df)
df = pd.DataFrame(df,columns=column_names)

In [134]:
# Drop unnecessary or already encoded categorical columns from the DataFrame
df = df.drop(columns=['fuel_type', 'transmission','ownership','Unnamed: 0' , 'brand','model','Variant' ,'car_name'])

In [136]:
# Splitting the dataset into features (X) and target variable (y) for machine learning.
X = df.drop(columns=['car_prices_in_rupee'])
y = df['car_prices_in_rupee']

In [138]:
# Reducing the feature set with PCA to retain 95% variance and adding the target variable back to the transformed data.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
X_pca_df = pd.DataFrame(X_pca, columns=columns)
X_pca_df['car_prices_in_rupee'] = y.reset_index(drop=True)

In [83]:
# Splitting the reduced dataset (X_pca_df) and target variable (y) into training and testing sets with 80-20 split.
X = X_pca_df
X_train, X_test, y_train, y_test = train_test_split(X_pca_df,y, test_size=0.2, random_state=42)

In [146]:
# Training an MLP regressor and evaluating it on the test set.
model = MLPRegressor(hidden_layer_sizes=(32, 16), activation='relu', max_iter=500, random_state=42)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# mae = mean_absolute_error(y_test, y_pred)
model = MLPRegressor(hidden_layer_sizes=(32, 16), activation='relu', max_iter=500, random_state=42)
cv_scores = cross_val_score(model, X_pca_df, y, cv=5, scoring='neg_mean_absolute_error')
print("MAE in fold: ", -cv_scores)  
print("Ave MAE: ", -np.mean(cv_scores))
print("std MAE: ", np.std(cv_scores))


MAE in fold:  [0.03793883 0.03871534 0.03535333 0.03635826 0.03495839]
Ave MAE:  0.036664829824405235
std MAE:  0.0014528336353137848


In [148]:
# Cross-validation with 5 folds, using R² for evaluation
cv_scores_r2 = cross_val_score(model, X_pca_df, y, cv=5, scoring='r2')
print("R² for each fold: ", cv_scores_r2)
print("Mean R²: ", np.mean(cv_scores_r2))
print("Standard deviation of R²: ", np.std(cv_scores_r2))


R² for each fold:  [0.99671498 0.99679764 0.99784996 0.99751809 0.99715159]
Mean R²:  0.9972064514364959
Standard deviation of R²:  0.00042962893152326704


In [152]:
# Cross-validation with 5 folds, using MSE for evaluation
cv_scores_mse = cross_val_score(model, X_pca_df, y, cv=5, scoring='neg_mean_squared_error')
print("MSE for each fold: ", -cv_scores_mse)  # Negate the negative values
print("Mean MSE: ", -np.mean(cv_scores_mse))  # Mean of MSE values
print("Standard deviation of MSE: ", np.std(-cv_scores_mse))  # Standard deviation of MSE

MSE for each fold:  [0.00300864 0.00310794 0.00224548 0.00252532 0.00261763]
Mean MSE:  0.0027009996262474306
Standard deviation of MSE:  0.00031797930802489007
