In [1]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('car data.csv')

In [7]:
data.head(10)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0
5,vitara brezza,2018,9.25,9.83,2071,Diesel,Dealer,Manual,0
6,ciaz,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
7,s cross,2015,6.5,8.61,33429,Diesel,Dealer,Manual,0
8,ciaz,2016,8.75,8.89,20273,Diesel,Dealer,Manual,0
9,ciaz,2015,7.45,8.92,42367,Diesel,Dealer,Manual,0


## Preprocess the Data

In [10]:
print("Dataset Info:")
data.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [12]:
data.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

In [16]:
# Create a new feature: Car Age
current_year = 2025  # As per the current date
data['Car_Age'] = current_year - data['Year']
data.head(5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Car_Age
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,11
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,12
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,8
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,14
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,11


In [18]:
# Encode categorical variables
data = pd.get_dummies(data, columns=['Fuel_Type', 'Selling_type', 'Transmission'], drop_first=True)
data.head(5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Owner,Car_Age,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Individual,Transmission_Manual
0,ritz,2014,3.35,5.59,27000,0,11,False,True,False,True
1,sx4,2013,4.75,9.54,43000,0,12,True,False,False,True
2,ciaz,2017,7.25,9.85,6900,0,8,False,True,False,True
3,wagon r,2011,2.85,4.15,5200,0,14,False,True,False,True
4,swift,2014,4.6,6.87,42450,0,11,True,False,False,True


In [20]:
# Drop Car_Name as it's not directly useful for prediction (too many unique values)
data = data.drop(['Car_Name', 'Year'], axis=1)
data.head(5)

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Owner,Car_Age,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Individual,Transmission_Manual
0,3.35,5.59,27000,0,11,False,True,False,True
1,4.75,9.54,43000,0,12,True,False,False,True
2,7.25,9.85,6900,0,8,False,True,False,True
3,2.85,4.15,5200,0,14,False,True,False,True
4,4.6,6.87,42450,0,11,True,False,False,True


In [22]:
# Split features and target
X = data.drop('Selling_Price', axis=1)
y = data['Selling_Price']

## Train the Model

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
# Train Random Forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [39]:
import warnings
warnings.filterwarnings('ignore')
#Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Absolute Error: {mae:.2f} lakhs")
print(f"Mean Squared Error: {mse:.2f} lakhs")
print(f"R² Score: {r2:.2f}")


Model Evaluation:
Mean Absolute Error: 14.45 lakhs
Mean Squared Error: 284.95 lakhs
R² Score: -11.37


In [54]:
# Example prediction
# Sample input: [Present_Price, Driven_kms, Owner, Car_Age, Fuel_Type_Diesel, Fuel_Type_Petrol, Selling_type_Individual, Transmission_Manual]
sample_input = np.array([[5.59, 30000, 0, 11, 0, 1, 0, 0]])

input_scaled = scaler.transform(sample_input)

prediction = model.predict(input_scaled)

print(f"\nPredicted Selling Price for sample car: {prediction[0]:.2f} lakhs")



Predicted Selling Price for sample car: 3.98 lakhs
