In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import mean_squared_error,r2_score

In [3]:
df = pd.read_csv("Used_Bikes.csv")
df.dropna(inplace=True)

In [4]:
df.head()

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


In [5]:
df['owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth Owner Or More'], dtype=object)

In [6]:
df.isnull().sum()

bike_name     0
price         0
city          0
kms_driven    0
owner         0
age           0
power         0
brand         0
dtype: int64

In [7]:
df.columns

Index(['bike_name', 'price', 'city', 'kms_driven', 'owner', 'age', 'power',
       'brand'],
      dtype='object')

In [8]:
numeric_cols = ['kms_driven', 'age', 'power']
numerics_df = df[numeric_cols].reset_index(drop=True)

In [9]:
encoder_owner = OneHotEncoder(sparse_output = False,drop = None)
encoder_city = OneHotEncoder(sparse_output=False, drop=None)
encoder_brand = OneHotEncoder(sparse_output=False, drop=None)

In [10]:
owner_en = encoder_owner.fit_transform(df[['owner']])
city = encoder_city.fit_transform(df[['city']])

In [11]:
owner_df = pd.DataFrame(owner_en,columns=encoder_owner.get_feature_names_out(['owner']))
city_df = pd.DataFrame(city,columns=encoder_city.get_feature_names_out(['city']))

In [12]:
df_encoded = pd.concat([numerics_df, owner_df, city_df], axis=1)

In [13]:
df_encoded.head(10)

Unnamed: 0,kms_driven,age,power,owner_First Owner,owner_Fourth Owner Or More,owner_Second Owner,owner_Third Owner,city_24 Pargana,city_Abohar,city_Adalaj,...,city_Viramgam,city_Virar,city_Virudhunagar,city_Visakhapatnam,city_Vizianagaram,city_Warangal,city_Wardha,city_Yamuna Nagar,city_Yemmiganur,city_Zirakpur
0,17654.0,3.0,110.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11000.0,4.0,350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,110.0,8.0,675.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16329.0,4.0,180.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000.0,3.0,150.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,25000.0,6.0,150.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,8200.0,3.0,160.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,12645.0,3.0,100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9190.0,3.0,350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,19000.0,7.0,500.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
x = df_encoded
y = df['price'].values

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.7)

In [20]:
start = time.time()
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [21]:
model = RandomForestRegressor(n_jobs=-1)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(x_train, y_train)
end = time.time()

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [22]:
y_pred = grid_search.predict(x_test)
y_train_pred = grid_search.predict(x_train)
r2_train = r2_score(y_train,y_train_pred)
r2 = r2_score(y_test, y_pred)

In [25]:
print(grid_search.best_params_)
print("Training time: {:.2f} seconds".format(end - start))
print("R² Train Score:", r2_train*100)
print("R² Score:", r2*100)

{'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}
Training time: 243.43 seconds
R² Train Score: 98.18882881002159
R² Score: 91.7809305207859


In [26]:
df['price_category'] = pd.qcut(df['price'],q=4,labels=[0,1,2,3])
y_cl = df['price_category'].astype(int)

In [27]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [28]:
x_train_cls, x_test_cls, y_train_cls, y_test_cls = train_test_split(x_scaled, y_cl, test_size=0.7)

In [29]:
start1 = time.time()
log = LogisticRegression(max_iter=100000)
log.fit(x_train_cls, y_train_cls)
end1 = time.time()

In [30]:
y_train_pred_log = log.predict(x_train_cls)
y_test_pred_log = log.predict(x_test_cls)
train_accuracy = accuracy_score(y_train_cls, y_train_pred_log)
test_accuracy = accuracy_score(y_test_cls, y_test_pred_log)

In [31]:
print("Training time: {:.2f} seconds".format(end1 - start1))
print("Accuracy Train Score:", train_accuracy * 100)
print("Accuracy Test Score:", test_accuracy * 100)

Training time: 1.65 seconds
Accuracy Train Score: 91.15785174596692
Accuracy Test Score: 89.6035704909425
