In [2]:
import pandas as pd

# Load the raw car listings from a CSV in the working directory. `df` is kept
# as the unmodified source frame; the encoding steps later operate on a copy.
df = pd.read_csv("cardata.csv")

# Preview the first rows to sanity-check that the file parsed as expected.
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
# (rows, columns) of the raw frame.
df.shape

(301, 9)

In [4]:
# Column dtypes and non-null counts; the object-dtype columns are the ones
# that need numeric encoding (or dropping) before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
# Number of missing values per column.
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [8]:
# Frequency of each fuel type in the raw data.
df['Fuel_Type'].value_counts()

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64

In [9]:
# Frequency of each seller type in the raw data.
df['Seller_Type'].value_counts()


Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64

In [10]:
# Frequency of each transmission type in the raw data.
df['Transmission'].value_counts()


Transmission
Manual       261
Automatic     40
Name: count, dtype: int64

In [11]:
# Frequency of each Owner value in the raw data.
df['Owner'].value_counts()


Owner
0    290
1     10
3      1
Name: count, dtype: int64

In [3]:
# Work on a copy so the raw `df` is not modified by the encoding steps that follow.
car_dataset= df.copy()

In [4]:
# Relabel CNG rows as Petrol so Fuel_Type is left with two levels.
# NOTE(review): this treats CNG cars as petrol cars -- presumably acceptable
# because only 2 rows are CNG (see the raw value_counts); confirm.
car_dataset['Fuel_Type'] = car_dataset['Fuel_Type'].replace('CNG', 'Petrol')

In [5]:
# Check the fuel-type frequencies after relabelling CNG as Petrol.
car_dataset['Fuel_Type'].value_counts()

Fuel_Type
Petrol    241
Diesel     60
Name: count, dtype: int64

In [6]:
# Encode the two-level categorical columns as 0/1 integers.
#
# Series.map turns every value that is missing from the mapping into NaN
# without raising. Running this cell a second time (the columns then hold 0/1
# instead of labels), or running it before CNG has been merged into Petrol,
# would therefore silently corrupt the features. Every column is validated
# first, and nothing is overwritten unless all of them pass.
binary_codes = {
    'Fuel_Type': {'Petrol': 0, 'Diesel': 1},
    'Seller_Type': {'Dealer': 0, 'Individual': 1},
    'Transmission': {'Manual': 0, 'Automatic': 1},
}

for column, codes in binary_codes.items():
    unexpected = set(car_dataset[column].unique()) - set(codes)
    if unexpected:
        raise ValueError(
            f"{column} has values with no encoding: "
            f"{sorted(unexpected, key=str)}. "
            "Re-create car_dataset from df before re-running this cell."
        )

for column, codes in binary_codes.items():
    car_dataset[column] = car_dataset[column].map(codes)

In [7]:
# Check Fuel_Type after encoding (Petrol -> 0, Diesel -> 1).
car_dataset['Fuel_Type'].value_counts()



Fuel_Type
0    241
1     60
Name: count, dtype: int64

In [24]:
# Check Seller_Type after encoding (Dealer -> 0, Individual -> 1).
car_dataset['Seller_Type'].value_counts()

Seller_Type
0    195
1    106
Name: count, dtype: int64

In [25]:
# Check Transmission after encoding (Manual -> 0, Automatic -> 1).
car_dataset['Transmission'].value_counts()

Transmission
0    261
1     40
Name: count, dtype: int64

In [9]:
# Owner merge: 0 stays 0, every non-zero value collapses to 1.
# Vectorised comparison instead of a per-row Python lambda; the result is the
# same int64 0/1 column.
car_dataset['Owner'] = car_dataset['Owner'].ne(0).astype('int64')

In [10]:
# Check Owner after collapsing it to 0 / 1.
car_dataset['Owner'].value_counts()

Owner
0    290
1     11
Name: count, dtype: int64

In [20]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# IMPORTANT: Car_Name must be dropped here as well -- it is a text column that
# was never numerically encoded, so it cannot be fed to LinearRegression.
X = car_dataset.drop(['Selling_Price', 'Car_Name'], axis=1)
y = car_dataset['Selling_Price']

lr = LinearRegression()

# scikit-learn scorers follow "greater is better", so the error metrics are
# exposed negated; the sign is flipped back when reporting below.
# The built-in RMSE scorer replaces a hand-written make_scorer lambda.
scoring = {
    'r2': 'r2',
    'mae': 'neg_mean_absolute_error',
    'rmse': 'neg_root_mean_squared_error',
}

# Shuffled 5-fold split with a fixed seed so the reported scores are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_validate(lr, X, y, cv=cv, scoring=scoring)

# Each metric is the mean of the five per-fold scores.
print("CV R2:", cv_results['test_r2'].mean())
print("CV MAE:", -cv_results['test_mae'].mean())
print("CV RMSE:", -cv_results['test_rmse'].mean())

CV R2: 0.835895297109845
CV MAE: 1.2606631480977477
CV RMSE: 2.0184984800173456


In [23]:
# ===============================
# LASSO REGRESSION (WITH SCALING)
# ===============================

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import mean_squared_error, make_scorer

# Features & Target (Car_Name is an unencoded text column, so it is excluded)
X = car_dataset.drop(['Selling_Price', 'Car_Name'], axis=1)
y = car_dataset['Selling_Price']

# Pipeline: Scaling + Lasso.
# Keeping the scaler inside the pipeline means cross_validate re-fits it on
# each training fold only, so no statistics leak from the held-out fold.
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso(alpha=0.1, random_state=42)),
])

# Scoring (same metrics as Linear Regression).
# scikit-learn scorers follow "greater is better", so the error metrics are
# negated; the built-in RMSE scorer replaces a hand-written make_scorer lambda.
scoring = {
    'r2': 'r2',
    'mae': 'neg_mean_absolute_error',
    'rmse': 'neg_root_mean_squared_error',
}

# Cross Validation: shuffled 5-fold split with a fixed seed for reproducibility.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_validate(
    lasso_pipeline,
    X,
    y,
    cv=cv,
    scoring=scoring,
)

# CV Results (mean over folds; error metrics flipped back to positive)
print("Lasso CV R2:", cv_results['test_r2'].mean())
print("Lasso CV MAE:", -cv_results['test_mae'].mean())
print("Lasso CV RMSE:", -cv_results['test_rmse'].mean())

# ===============================
# COEFFICIENT CHECK (FEATURE SELECTION)
# ===============================

# Re-fit on the full data to inspect the coefficients. They are expressed on
# the standardised feature scale (the scaler runs before the Lasso step), so
# magnitudes are comparable across features; a coefficient of exactly 0 means
# Lasso dropped that feature at this alpha.
lasso_pipeline.fit(X, y)
lasso_model = lasso_pipeline.named_steps['lasso']

print("\nLasso Coefficients:")
for feature, coef in zip(X.columns, lasso_model.coef_):
    print(f"{feature}: {coef}")

Lasso CV R2: 0.8418301230749379
Lasso CV MAE: 1.2372818254038527
Lasso CV RMSE: 1.9742482281232732

Lasso Coefficients:
Year: 1.136940251320996
Present_Price: 3.7401991653727347
Kms_Driven: -0.14607678692817816
Fuel_Type: 0.6756361762243276
Seller_Type: -0.48777415710052474
Transmission: 0.385368751771777
Owner: -0.0
