In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# STEP 1: Load the raw dataset
RAW_CSV_PATH = r"D:\MY WORK\Used_Bikes\Used_Bikes.csv"
ENCODED_CSV_PATH = r"C:\Users\Administrator\Desktop\New folder\encoded_cleaned_used_bikes2.csv"
df = pd.read_csv(RAW_CSV_PATH)

# STEP 2: Remove duplicate rows
df.drop_duplicates(inplace=True)

# STEP 3: Make sure the regression target is numeric BEFORE any encoding.
# If 'price' had been read as text, label-encoding it would replace real prices
# with arbitrary category codes and silently corrupt the target.
df['price'] = pd.to_numeric(df['price'], errors='coerce')
# Rows whose price is missing or unparseable cannot be used for training.
df.dropna(subset=['price'], inplace=True)

# STEP 4: Label Encode all remaining categorical columns.
# A fresh encoder is fitted per column and kept in `label_encoders`, so every
# column's mapping stays available for encoding new inputs or decoding values.
# NOTE(review): other quantity-like columns (e.g. kms_driven, age, power) are
# still label-encoded if they arrive as text - confirm against the raw CSV.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # convert to string for safe handling
    label_encoders[col] = le

# STEP 5: Guard - force the expected columns to a numeric dtype. After the steps
# above they should already be numeric, so this mainly verifies that the columns
# exist (a missing one raises KeyError).
expected_cols = ['bike_name', 'price', 'city', 'kms_driven', 'owner', 'age', 'power', 'brand']
df[expected_cols] = df[expected_cols].apply(pd.to_numeric, errors='coerce')

# OPTIONAL: Save the cleaned and encoded data (you can skip if not needed)
output_dir = os.path.dirname(ENCODED_CSV_PATH)
if output_dir:
    # Create the target folder so this optional export cannot abort the pipeline.
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(ENCODED_CSV_PATH, index=False)

# Split features and target: every column except 'price' is a feature.
X = df.drop(columns='price')
y = df['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor (seeded so the fitted model is reproducible).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate the model.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is in the same unit as the target (₹), unlike MSE which is in ₹ squared.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Print evaluation results. Only MAE and RMSE carry the currency symbol because
# only they are expressed in the target's own unit.
print(f"MAE: ₹{mae:.2f}")
print(f"MSE: {mse:.2f} (₹²)")
print(f"RMSE: ₹{rmse:.2f}")
print(f"R² Score: {r2:.2f}")

from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load

# Persist the `model` that was already fitted (and evaluated) earlier in this
# script. Training a second, unseeded forest here would overwrite it with a
# different, non-reproducible model, so the saved file would no longer match
# the reported metrics.
MODEL_PATH = 'random_forest_model.joblib'

# Save the model
dump(model, MODEL_PATH)

# Later or in another script: Load the model
# NOTE: joblib files are pickle-based - only load files from a trusted source.
loaded_model = load(MODEL_PATH)

# Use the loaded model
y_pred = loaded_model.predict(X_test)
print(y_pred)


MAE: ₹14096.89
MSE: ₹2359659612.42
R² Score: 0.88
[110000. 110000.  66000. ...  52500.  95000.  79450.]
