In [1]:
# IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import joblib

In [3]:
data = pd.read_csv(r"C:\Users\ashok\Downloads\bikes (1).csv")

In [5]:
print(data.head())
print(data.info()) 

                         model_name  model_year      kms_driven        owner  \
0     Bajaj Avenger Cruise 220 2017        2017        17000 Km  first owner   
1  Royal Enfield Classic 350cc 2016        2016        50000 Km  first owner   
2               Hyosung GT250R 2012        2012        14795 Km  first owner   
3        Bajaj Dominar 400 ABS 2017        2017  Mileage 28 Kms  first owner   
4             Jawa Perak 330cc 2020        2020         2000 Km  first owner   

      location       mileage      power   price  
0    hyderabad  \n\n 35 kmpl     19 bhp   63500  
1    hyderabad  \n\n 35 kmpl  19.80 bhp  115000  
2    hyderabad  \n\n 30 kmpl     28 bhp  300000  
3  pondicherry   \n\n 28 Kms  34.50 bhp  100000  
4    bangalore         \n\n      30 bhp  197500  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7857 entries, 0 to 7856
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   model_name  7857 non-null   

In [17]:
# Ensure 'kms_driven' is a string and remove non-numeric characters
data['kms_driven'] = data['kms_driven'].astype(str).str.replace(r'\D', '', regex=True)

# Convert to numeric, setting errors='coerce' to handle any remaining non-numeric values
data['kms_driven'] = pd.to_numeric(data['kms_driven'], errors='coerce')

# Check if conversion worked
print(data['kms_driven'].dtype)

# Fill missing values using the median (without inplace=True)
data['kms_driven'] = data['kms_driven'].fillna(data['kms_driven'].median())

# Verify that all values are numeric
print(data['kms_driven'].head())
print(data['kms_driven'].isna().sum())  # Should be 0 if everything is fixed



int64
0    170000
1    500000
2    147950
3       280
4     20000
Name: kms_driven, dtype: int64
0


In [19]:
# Extract numeric values correctly and ensure conversion to numeric
data['mileage'] = data['mileage'].astype(str).str.extract(r'(\d+\.?\d*)')[0]  # Extract as Series
data['mileage'] = pd.to_numeric(data['mileage'], errors='coerce')  # Convert to numeric

# Assign the filled values back to the column (avoid inplace=True)
data['mileage'] = data['mileage'].fillna(data['mileage'].median())

# Verify output
print(data['mileage'].dtype)  # Should be float64
print(data['mileage'].head())  # Check first few values
print(data['mileage'].isna().sum())  # Should be 0 if all NaNs are handled



float64
0    35.0
1    35.0
2    30.0
3    28.0
4    40.0
Name: mileage, dtype: float64
0


In [23]:
# Extract numeric values correctly and ensure conversion to numeric
data['power'] = data['power'].astype(str).str.extract(r'(\d+\.?\d*)')[0]  # Extract as Series
data['power'] = pd.to_numeric(data['power'], errors='coerce')  # Convert to numeric

# Assign the filled values back to the column (avoid inplace=True)
data['power'] = data['power'].fillna(data['power'].median())

# Verify output
print(data['power'].dtype)  # Should be float64
print(data['power'].head())  # Check first few values
print(data['power'].isna().sum())  # Should be 0 if all NaNs are handled


float64
0    19.0
1    19.8
2    28.0
3    34.5
4    30.0
Name: power, dtype: float64
0


In [27]:
# Factorize location (encoding categorical data)
data['location'] = pd.factorize(data['location'])[0]  

In [29]:
# Check data types to ensure correct conversion
print(data[['mileage', 'power', 'location']].dtypes)

mileage     float64
power       float64
location      int64
dtype: object


In [31]:
scaler = StandardScaler()
data[['kms_driven', 'mileage', 'power']] = scaler.fit_transform(data[['kms_driven', 'mileage', 'power']])

In [33]:
# SPLITTING DATA INTO FEATURES (X) & TARGET (y)
X = data.drop('price', axis=1)  # 'price' is the target variable
y = data['price']

In [35]:
# ENCODING CATEGORICAL VARIABLES
if 'owner' in X.columns:
    le = LabelEncoder()
    X['owner'] = le.fit_transform(X['owner'])

categorical_cols = [col for col in ['model_name', 'owner'] if col in X.columns]
if categorical_cols:
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [37]:
# TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
# TRAINING RANDOM FOREST MODEL
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [41]:
# PREDICTIONS
y_pred_rf = rf_model.predict(X_test)

In [43]:
# MODEL EVALUATION
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, y_pred_rf)

In [45]:
print("Random Forest Performance:")
print(f"RMSE: {rf_rmse}")
print(f"R² Score: {rf_r2}")

Random Forest Performance:
RMSE: 86010.02612835006
R² Score: 0.6526278720642907


In [47]:
joblib.dump(rf_model, 'bike_price_random_forest.pkl')

['bike_price_random_forest.pkl']