In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the CSV file into a DataFrame.
car = pd.read_csv(filepath_or_buffer='quikr_car.csv')

In [3]:
# Keep only the rows whose 'year' value is a numeric string (.str.isnumeric()).
year_is_numeric = car['year'].str.isnumeric()
car = car.loc[year_is_numeric]

In [4]:
# Convert the 'year' strings to integers.
year_as_int = car['year'].astype(int)
car['year'] = year_as_int

In [5]:
# Keep only the rows whose 'Price' is not the text "Ask For Price".
has_price = car['Price'] != "Ask For Price"
car = car.loc[has_price]

In [6]:
# Turn 'Price' into integers in three steps:
#   1. astype(str) makes every value a string so the commas can be handled,
#   2. the literal replace removes the commas (e.g. '1,000' -> '1000'),
#   3. astype(int) converts the cleaned strings to integers.
price_text = car['Price'].astype(str)
price_digits = price_text.str.replace(',', '', regex=False)
car['Price'] = price_digits.astype(int)

In [7]:
# Split 'kms_driven' on spaces, keep the first token, and remove its commas.
# regex=False is passed explicitly (as in the 'Price' cleanup) so the comma is
# always treated as a literal, whatever the installed pandas uses as the default.
kms_first_token = car['kms_driven'].str.split(' ').str.get(0)
car['kms_driven'] = kms_first_token.str.replace(',', '', regex=False)

In [8]:
# Keep only the rows whose cleaned 'kms_driven' value is a numeric string.
kms_is_numeric = car['kms_driven'].str.isnumeric()
car = car.loc[kms_is_numeric]

In [9]:
# Convert the 'kms_driven' strings to integers.
kms_as_int = car['kms_driven'].astype(int)
car['kms_driven'] = kms_as_int

In [10]:
# Shorten each name to its first three space-separated words:
# split on spaces, slice the first three pieces, join them back with spaces.
name_words = car['name'].str.split(' ')
first_three_words = name_words.str.slice(0, 3)
car['name'] = first_three_words.str.join(' ')

In [11]:
# Replace the index left over from the row filtering with a fresh default one;
# drop=True discards the old index instead of keeping it as a column.
car = car.reset_index(drop=True)

In [12]:
# Fill missing 'fuel_type' values with the column's most frequent value.
most_common_fuel = car['fuel_type'].mode()[0]
car['fuel_type'] = car['fuel_type'].fillna(most_common_fuel)

In [13]:
def cleaning_data(car, Price, threshold):
    """Drop, in place, every row of ``car`` whose ``Price`` value exceeds ``threshold``.

    Parameters
    ----------
    car : pandas.DataFrame
        Frame to filter. It is modified in place.
    Price : str
        Name of the column compared against ``threshold``.
    threshold : number
        Rows with ``car[Price] > threshold`` are removed; rows equal to the
        threshold (and rows where the comparison is False, e.g. NaN) are kept.

    Returns
    -------
    None

    Notes
    -----
    Rows are removed by index label in a single ``drop`` call, so the index
    labels should be unique: a label shared by several rows is dropped for all
    of them. The remaining rows keep their original labels (no re-indexing).
    """
    # One vectorised comparison instead of a Python loop with a drop per row.
    labels_to_drop = car[car[Price] > threshold].index
    car.drop(labels_to_drop, inplace=True)
    return
# Remove the rows of `car` whose 'Price' is above 600000 (modifies `car` in place).
cleaning_data(car, Price='Price', threshold=600000)

In [14]:
from sklearn.preprocessing import LabelEncoder

columns = ["name", "company", "fuel_type"]

# One fitted encoder per column, keyed by column name. Re-fitting a single
# shared encoder would overwrite the mapping of every column except the last,
# leaving no way to encode new raw values (or decode predictions' inputs) for
# "name" and "company" later on.
encoders = {}

for column in columns:
    le = LabelEncoder()  # fresh encoder so each column keeps its own classes_
    car[column] = le.fit_transform(car[column])
    encoders[column] = le

# After the loop `le` is the encoder fitted on the last column ("fuel_type").

In [15]:
# Separate the target column ('Price') from the remaining feature columns.
target_column = 'Price'
y = car[target_column]
x = car.drop(columns=[target_column])

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [17]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Only the numeric feature is scaled. The target ('Price') is deliberately left
# in its original units: x, y and the train/test split were already taken from
# `car`, so rewriting `car` here would not reach them, and re-running the cell
# that builds x / y afterwards would silently switch the target to standardized
# values.
columns_to_scale = ['kms_driven']

# Fit on the training rows only and reuse those statistics for the test rows,
# so no information from the held-out data leaks into the scaler.
# The scaled values go into copies; x_train / x_test and `car` are not modified.
x_train_scaled = x_train.copy()
x_test_scaled = x_test.copy()
x_train_scaled[columns_to_scale] = ss.fit_transform(x_train[columns_to_scale])
x_test_scaled[columns_to_scale] = ss.transform(x_test[columns_to_scale])


In [53]:
from sklearn.ensemble import RandomForestRegressor

# Only the model is chosen here: no tuning and no training happen in this cell.
# random_state=42 fixes the estimator's seed.
rf_model = RandomForestRegressor(random_state=42)


In [55]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# Shuffled 10-fold splitter with a fixed seed
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# Random forest with default hyperparameters and a fixed seed
model = RandomForestRegressor(random_state=42)

# One R² score per fold, computed on the full x / y
scores = cross_val_score(
    model,
    x,
    y,
    cv=kf,
    scoring='r2',
)

# Report the per-fold scores and their mean
print("R2 scores for each fold:", scores)
print("Average R2 score:", scores.mean())

R2 scores for each fold: [0.74262299 0.79917026 0.35780887 0.57484875 0.68270483 0.68282277
 0.67475028 0.74686546 0.76559996 0.66938852]
Average R2 score: 0.6696582676971878


In [67]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Base estimator whose hyperparameters are tuned below
rf_model = RandomForestRegressor(random_state=42)

# Hyperparameter grid (3 values per parameter -> 81 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],         # More trees for better generalization
    'max_depth': [6, 8, 10],                 # Limited depth to avoid overfitting
    'min_samples_split': [4, 6, 8],          # Higher value = more stable splits
    'min_samples_leaf': [2, 4, 6]            # Bigger leaf = less overfitting
}


# Exhaustive search over the grid with 5-fold cross-validation on the training
# split, ranking candidates by R²
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(x_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been refitted on all of x_train / y_train with the best parameters.
# Fitting it again on the same data would only repeat that work.
best_rf_model = grid_search.best_estimator_

# Predict on both splits
y_train_pred = best_rf_model.predict(x_train)
y_test_pred = best_rf_model.predict(x_test)

# Evaluation: R² is multiplied by 100 for display as a percentage; the error
# metrics are computed on the test split only
train_r2 = r2_score(y_train, y_train_pred) * 100
test_r2 = r2_score(y_test, y_test_pred) * 100
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_test_pred)

# Results
print(f"Train R² Score: {train_r2:.2f}%")
print(f"Test R² Score: {test_r2:.2f}%")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")


Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 100}
Train R² Score: 86.34%
Test R² Score: 75.22%
MSE: 5208620574.11
RMSE: 72170.77
MAE: 53366.10


# Final model training 

In [74]:
# Fixed hyperparameters for the final random forest, plus the usual seed.
final_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
    'random_state': 42,
}
final_model = RandomForestRegressor(**final_params)

# Train on the full data (x, y) rather than on the training split only
final_model.fit(x, y)


In [76]:
import joblib

# Serialize the fitted final model to disk.
joblib.dump(value=final_model, filename='final_rf_model.pkl')


['final_rf_model.pkl']