# Necessary Libraries

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn warnings
# that may point at real problems — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Loading Dataset

In [67]:
# Read the raw car listings CSV into a DataFrame and preview the first rows.
DATA_PATH = 'quikr_car.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


# Data Cleaning

In [68]:
# Dataset structure: dimensions, per-column dtypes / non-null counts,
# and the number of missing values in each column.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.isnull().sum())
df.head()

(892, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB
None
name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64


Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [69]:
# Drop rows whose Price is the placeholder text 'Ask For Price'.
# .copy() makes the filtered result an independent DataFrame, so later column
# assignments on `df` modify it directly instead of assigning into a slice of
# the original frame (pandas' SettingWithCopy situation).
df = df[df['Price'] != 'Ask For Price'].copy()

In [70]:
# Remove the comma separators from Price, then store the column as integers.
price_text = df['Price'].str.replace(',', '')
df['Price'] = price_text.astype(int)

In [71]:
# Clean kms_driven: strip commas and the ' kms' suffix, coerce to numbers
# (anything unparseable becomes NaN), drop those rows, and store as int.
kms_text = df['kms_driven'].str.replace(',', '').str.replace(' kms', '')
df = (
    df.assign(kms_driven=pd.to_numeric(kms_text, errors='coerce'))
      .dropna(subset=['kms_driven'])
      .astype({'kms_driven': int})
)

In [72]:
# Remove every row that still has a null in any column (e.g. missing fuel_type).
df = df.dropna()

In [73]:
# Derive car_model from the first three whitespace-separated words of the name.
name_tokens = df['name'].str.split()
df['car_model'] = name_tokens.str[:3].str.join(' ')

In [74]:
# Calculate car age
# Year that car age is measured against (kept as a named constant rather than
# a bare literal inside the arithmetic below).
REFERENCE_YEAR = 2025

# Keep only rows whose 'year' consists purely of digits. Casting to str first
# means the check also works if 'year' has already been converted to int.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the previous frame.
df = df[df['year'].astype(str).str.isnumeric()].copy()

# Convert 'year' to int
df['year'] = df['year'].astype(int)

# Car age in years relative to REFERENCE_YEAR
df['car_age'] = REFERENCE_YEAR - df['year']

In [75]:
# 'name' and 'year' have been replaced by car_model and car_age; remove them.
df = df.drop(columns=['name', 'year'])

In [77]:
# Keep only listings priced strictly between 100,000 and 2,000,000;
# rows at or outside either bound are treated as outliers and dropped.
price = df['Price']
above_floor = price > 100000
below_ceiling = price < 2000000
df = df[above_floor & below_ceiling]

In [78]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated indicator columns.
categorical_columns = ['company', 'fuel_type', 'car_model']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Model Building

In [79]:
# Target is the Price column; features are every remaining column.
target_column = 'Price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [80]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [81]:
# Fit an ordinary linear regression on the training split, then predict
# prices for the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [82]:
# Train Random Forest
# random_state pins the forest's internal randomness so that the fitted model
# and the metrics derived from it are the same on every
# Restart-and-Run-All; 42 matches the seed used for the train/test split.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Model Evaluation

In [83]:
# Evaluation Function
def evaluate_model(name, y_true, y_pred):
    """Print R², MAE and RMSE for one model's predictions.

    Args:
        name: Label shown in the report header.
        y_true: Actual target values, passed to the scikit-learn metric functions.
        y_pred: Predicted target values, passed to the same metric functions.

    Returns:
        None. The report is written to stdout; MAE and RMSE are printed with
        a ₹ prefix and thousands separators.
    """
    print(f"\n📊 {name} Results:")

    r2 = r2_score(y_true, y_pred)
    print(f"R² Score  : {r2:.3f}")

    mae = mean_absolute_error(y_true, y_pred)
    print(f"MAE       : ₹{mae:,.2f}")

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE      : ₹{rmse:,.2f}")

In [84]:
# Report the same metrics for each fitted model against the held-out targets.
model_predictions = (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
)
for model_name, predictions in model_predictions:
    evaluate_model(model_name, y_test, predictions)


📊 Linear Regression Results:
R² Score  : 0.684
MAE       : ₹86,440.87
RMSE      : ₹131,624.73

📊 Random Forest Results:
R² Score  : 0.715
MAE       : ₹81,314.18
RMSE      : ₹125,054.98


# Predicting a Sample

In [85]:
# Predict the price of the first held-out row and compare it with the actual price.
# iloc[[0]] (list indexer) returns a one-row DataFrame rather than a Series, so
# the input keeps the same column names the forest was fitted with; wrapping a
# Series in a plain list would pass an unnamed 2-D array instead.
sample = X_test.iloc[[0]]
print("\n🔍 Sample Prediction:")
print("Predicted Price:", int(rf.predict(sample)[0]))
print("Actual Price   :", y_test.iloc[0])


🔍 Sample Prediction:
Predicted Price: 202720
Actual Price   : 210000
