In [14]:
# for data manip
import numpy as np
import pandas as pd
import scipy.sparse as sp

# for preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from skrub import TableVectorizer

# modeling
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor
from pygam import LinearGAM, s, f
from pygam.terms import TermList
from sklearn.neural_network import MLPRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.svm import SVR
from pyearth import Earth
from sklearn.neighbors import KNeighborsRegressor

# for pipeline
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin

# for utilities
from functools import reduce
import operator

In [2]:
df = pd.read_csv('Used_Car_Price_Prediction.csv')
df

Unnamed: 0,car_name,yr_mfr,fuel_type,kms_run,sale_price,city,times_viewed,body_type,transmission,variant,...,total_owners,broker_quote,original_price,car_rating,ad_created_on,fitness_certificate,emi_starts_from,booking_down_pymnt,reserved,warranty_avail
0,maruti swift,2015,petrol,8063,386399,noida,18715,hatchback,manual,lxi opt,...,2,397677,404177.0,great,2021-04-04T07:09:18.583,True,8975,57960,False,False
1,maruti alto 800,2016,petrol,23104,265499,noida,2676,hatchback,manual,lxi,...,1,272935,354313.0,great,2021-03-22T14:07:32.833,True,6167,39825,False,False
2,hyundai grand i10,2017,petrol,23402,477699,noida,609,hatchback,manual,sports 1.2 vtvt,...,1,469605,,great,2021-03-20T05:36:31.311,True,11096,71655,False,False
3,maruti swift,2013,diesel,39124,307999,noida,6511,hatchback,manual,vdi,...,1,294262,374326.0,great,2021-01-21T12:59:19.299,True,7154,46200,False,False
4,hyundai grand i10,2015,petrol,22116,361499,noida,3225,hatchback,manual,magna 1.2 vtvt,...,1,360716,367216.0,great,2021-04-01T13:33:40.733,True,8397,54225,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7395,honda amaze,2018,diesel,53486,604299,ghaziabad,2756,sedan,,1.5 v cvt i-dtec,...,1,630810,787750.0,great,2021-02-07T08:05:30.443,True,14036,90645,True,False
7396,maruti ignis,2018,petrol,8854,562599,chennai,640,hatchback,manual,delta 1.2 k12,...,1,549440,,great,2021-03-31T10:21:56.289,True,13068,84390,False,False
7397,honda amaze,2015,petrol,46300,400499,pune,795,sedan,manual,1.2 smt i vtec,...,1,383419,,great,2021-03-04T12:40:38.652,True,9303,60075,True,False
7398,maruti alto k10,2016,petrol,27245,284099,new delhi,1155,hatchback,manual,lxi,...,1,286515,369885.0,great,2021-03-16T13:31:39.766,True,6599,42615,False,False


In [3]:
print(df.isna().sum().sum())

4723


In [4]:
df.isna().sum()

car_name                  0
yr_mfr                    0
fuel_type                 0
kms_run                   0
sale_price                0
city                      0
times_viewed              0
body_type               103
transmission            556
variant                   0
assured_buy               0
registered_city          10
registered_state         10
is_hot                    0
rto                       0
source                  126
make                      0
model                     0
car_availability        620
total_owners              0
broker_quote              0
original_price         3280
car_rating                9
ad_created_on             1
fitness_certificate       8
emi_starts_from           0
booking_down_pymnt        0
reserved                  0
warranty_avail            0
dtype: int64

In [5]:
df = df.drop(['original_price','broker_quote', 'booking_down_pymnt', 'emi_starts_from'], axis=1) 

In [6]:
df.columns

Index(['car_name', 'yr_mfr', 'fuel_type', 'kms_run', 'sale_price', 'city',
       'times_viewed', 'body_type', 'transmission', 'variant', 'assured_buy',
       'registered_city', 'registered_state', 'is_hot', 'rto', 'source',
       'make', 'model', 'car_availability', 'total_owners', 'car_rating',
       'ad_created_on', 'fitness_certificate', 'reserved', 'warranty_avail'],
      dtype='object')

In [7]:
full_df_num_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[full_df_num_cols].corr()

print(correlation_matrix)

                yr_mfr   kms_run  sale_price  times_viewed  total_owners
yr_mfr        1.000000 -0.395842    0.518973      0.059617     -0.301315
kms_run      -0.395842  1.000000   -0.104727     -0.114795      0.133000
sale_price    0.518973 -0.104727    1.000000      0.091579     -0.131306
times_viewed  0.059617 -0.114795    0.091579      1.000000     -0.001361
total_owners -0.301315  0.133000   -0.131306     -0.001361      1.000000


In [8]:
X = df.drop(columns=["sale_price"])
y = df["sale_price"] 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=26       
)

In [11]:
pipe = Pipeline([
    ("tv", TableVectorizer()),
    ("knn", KNNImputer(n_neighbors=5)),
    ("catboost", CatBoostRegressor(
        depth=8, learning_rate=0.05, n_estimators=800,
        loss_function="RMSE", verbose=False
    ))
])

In [12]:
pipe.fit(X_train, y_train)

In [13]:
y_pred = pipe.predict(X_test)

In [14]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 69184.340, R²: 0.943


In [15]:
param_grid = {
    "catboost__depth": [8, 10],
    "catboost__learning_rate": [0.03, 0.05],
    "catboost__n_estimators": [500],
    "catboost__l2_leaf_reg": [1, 3, 5]
}

In [16]:
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",   
    cv=5,             
    n_jobs=-1, 
    verbose=False
)

In [17]:
warnings.filterwarnings(
    "ignore",
    message="Found unknown categories in columns",
    category=UserWarning,
    module="sklearn.preprocessing._encoders"
)

warnings.filterwarnings(
    "ignore",
    message="Downcasting behavior in `replace` is deprecated",
    category=FutureWarning,
    module="skrub._clean_null_strings"
)

In [None]:
grid.fit(X_train, y_train)

In [19]:
print("Best RMSE:", -grid.best_score_)
print("Best parameters:", grid.best_params_)

Best RMSE: 84630.04177327888
Best parameters: {'catboost__depth': 8, 'catboost__l2_leaf_reg': 1, 'catboost__learning_rate': 0.05, 'catboost__n_estimators': 500}


In [20]:
best_pipe_cat = grid.best_estimator_
y_pred = best_pipe_cat.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 68917.859, R²: 0.943


In [21]:
num_cols = X.select_dtypes(include=[np.number]).columns
cat_cols = X.select_dtypes(exclude=[np.number]).columns

In [22]:
num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scale", StandardScaler())
])

In [23]:
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore",
                          min_frequency=0.01,
                          sparse_output=True))
])

In [24]:
pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],)

In [25]:
pipe = Pipeline([
    ("pre", pre),
    ("ridge", Ridge(solver="sag", random_state=26))
])

In [26]:
param_grid = {
    "pre__num__poly__degree": [1, 2], 
    "ridge__alpha": [1.0, 10.0, 100.0]
}

In [27]:
search = GridSearchCV(
    pipe, param_grid, cv=KFold(n_splits=5, shuffle=True, random_state=26),
    scoring="neg_root_mean_squared_error",
    n_jobs=1,   
    verbose=1
)

In [28]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [29]:
print("Best RMSE:", -search.best_score_)
print("Best params:", search.best_params_)

Best RMSE: 144558.7630434305
Best params: {'pre__num__poly__degree': 2, 'ridge__alpha': 10.0}


In [30]:
best_pipe = search.best_estimator_
y_pred = best_pipe.predict(X_test)

In [31]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 156899.543, R²: 0.707


In [30]:
pre = ColumnTransformer([
    ('num', Pipeline([
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler())
    ]), selector(dtype_include=np.number)),
    ('cat', Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ]), selector(dtype_exclude=np.number))
])

In [31]:
X_tr = pre.fit_transform(X_train)
X_te = pre.transform(X_test)
y_tr = np.asarray(y_train).ravel()
n_cont = pre.named_transformers_['num'].n_features_in_
n_cat  = X_tr.shape[1] - n_cont

In [33]:
term_pieces = [s(i, n_splines=10) for i in range(n_cont)] + \
              [f(n_cont + j) for j in range(n_cat)]
terms = reduce(operator.add, term_pieces)
gam = LinearGAM(terms).gridsearch(
    X_tr, y_tr,
    lam=np.logspace(-4, 4, 10),
)


[38;2;0;255;0m100%[39m [38;2;0;255;0m(10 of 10)[39m |########################| Elapsed Time: 2:59:49 Time:  2:59:497:31


In [35]:
cat_start = n_cont
cat_end = n_cont + n_cat

X_te[:, cat_start:cat_end] = np.nan_to_num(
    X_te[:, cat_start:cat_end], nan=0.0, posinf=0.0, neginf=0.0
)

mins = X_tr[:, cat_start:cat_end].min(axis=0)
maxs = X_tr[:, cat_start:cat_end].max(axis=0)
X_te[:, cat_start:cat_end] = np.clip(X_te[:, cat_start:cat_end], mins, maxs)

y_pred = gam.predict(X_te)


In [38]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 93490.406, R²: 0.896


In [18]:
num_sel = selector(dtype_include=np.number)
cat_sel = selector(dtype_exclude=np.number)


num_pipe = Pipeline([
    ("impute_num", KNNImputer(n_neighbors=5)),
    ("scale", StandardScaler())
])

cat_pipe = Pipeline([
    ("impute_cat", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

tab = ColumnTransformer([
    ("num", num_pipe, num_sel),
    ("cat", cat_pipe, cat_sel),
])

mlp = MLPRegressor(
    hidden_layer_sizes=(256, 128),
    activation="relu",
    solver="adam",
    learning_rate_init=1e-3,
    alpha=1e-3,            
    batch_size=256,
    max_iter=1000,
    early_stopping=True,
    n_iter_no_change=20,
    validation_fraction=0.15,
    random_state=26
)

pipe = Pipeline([
    ("tab", tab),
    ("reg", TransformedTargetRegressor(
        regressor=mlp,
        transformer=StandardScaler()
    ))
])

In [19]:
pipe.fit(X_train, y_train)

In [20]:
y_pred = pipe.predict(X_test)

In [21]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 71143.759, R²: 0.940


In [29]:
num_sel = selector(dtype_include=np.number)
cat_sel = selector(dtype_exclude=np.number)

num_pipe = Pipeline([
    ("impute_num", KNNImputer(n_neighbors=5)),
    ("scale", StandardScaler())
])

cat_pipe = Pipeline([
    ("impute_cat", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

tab = ColumnTransformer([
    ("num", num_pipe, num_sel),
    ("cat", cat_pipe, cat_sel),
])

to_dense = FunctionTransformer(
    lambda X: X.toarray() if sp.issparse(X) else X, accept_sparse=True
)

svr = SVR(kernel="rbf", C=10.0, epsilon=0.1, gamma="scale")

pipe_svr = Pipeline([
    ("tab", tab),
    ("dense", to_dense),  
    ("reg", TransformedTargetRegressor(
        regressor=svr,
        transformer=StandardScaler()
    ))
])

In [30]:
pipe_svr.fit(X_train, y_train)

In [31]:
y_pred = pipe_svr.predict(X_test)

In [32]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 81217.178, R²: 0.922


In [10]:
pipe = Pipeline([
    ("tv", TableVectorizer()),
    ("knn", KNNImputer(n_neighbors=5)),
    ("mars", Earth(
        max_degree=2,        
        minspan_alpha=0.5,   
        endspan_alpha=0.5,   
        penalty=3,       
        enable_pruning=True
    ))
])

In [11]:
pipe.fit(X_train, y_train)

  pruning_passer.run()
  coef, resid = np.linalg.lstsq(B, weighted_y[:, i])[0:2]


In [12]:
y_pred = pipe.predict(X_test)

In [13]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 112428.256, R²: 0.850


In [15]:
knn_pipe = Pipeline([
    ("tv", TableVectorizer()),                  
    ("impute", KNNImputer(n_neighbors=5)),      
    ("scale", StandardScaler()),                
    ("knn", KNeighborsRegressor(
        n_neighbors=10,
        weights="distance",     
        metric="minkowski", p=2
    ))
])

In [17]:
knn_pipe.fit(X_train, y_train)

In [18]:
y_pred = knn_pipe.predict(X_test)

In [19]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}")

RMSE: 165061.952, R²: 0.676
