In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
cars = pd.read_csv(r"C:\Logs\car_age_price.csv")

In [3]:
cars.head()

Unnamed: 0,Year,Price
0,2018,465000
1,2019,755000
2,2019,700000
3,2018,465000
4,2018,465000


In [4]:
cars.isna().sum()

Year     0
Price    0
dtype: int64

In [5]:
cars.describe()

Unnamed: 0,Year,Price
count,112.0,112.0
mean,2016.669643,483866.044643
std,1.629616,91217.450533
min,2013.0,300000.0
25%,2015.0,423750.0
50%,2017.0,500000.0
75%,2017.0,550000.0
max,2020.0,755000.0


In [6]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Year    112 non-null    int64
 1   Price   112 non-null    int64
dtypes: int64(2)
memory usage: 1.9 KB


## modeling without scaling

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
x = cars.drop(['Price'], axis=1)
y = cars['Price']

In [9]:
x_train, x_test,y_train,y_test = train_test_split(x,y, random_state=45, test_size=0.25 )
le = LinearRegression()
le.fit(x_train,y_train)
y_pred = le.predict(x_test)

In [10]:
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred)) 

3728680814.123775
0.5942628070198248


In [11]:
# lets do the same process but this time with scalling to check whether it gives us a better model

## modeling with scaling

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()
cars = scaler.fit_transform(cars)

In [14]:
cars = pd.DataFrame(cars, columns=['Year','Price'])

In [15]:
cars.describe()

Unnamed: 0,Year,Price
count,112.0,112.0
mean,-1.979964e-14,-3.122502e-16
std,1.004494,1.004494
min,-2.261967,-2.024749
25%,-1.029167,-0.6620031
50%,0.203632,0.1776685
75%,0.203632,0.7282728
max,2.052831,2.98575


In [16]:
x = cars.drop(['Price'], axis=1)
y = cars['Price']

In [17]:
x_train, x_test,y_train,y_test = train_test_split(x,y, random_state=45, test_size=0.25 )
le = LinearRegression()
le.fit(x_train,y_train)
y_pred = le.predict(x_test)

In [18]:
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred)) 

0.45216234478924744
0.5942628070198248


In [19]:
# with standard scalling we have got a better fitiing model using linear regression

## Lasso regression

In [20]:
# now lets check if we get a better model using lasso regression

In [21]:
from sklearn.linear_model import Lasso

In [22]:
las = Lasso()
las.fit(x_train, y_train)
y_predict = las.predict(x_test)
print(mean_squared_error(y_test, y_predict))
print(r2_score(y_test, y_predict))

1.166874438526964
-0.047067197665299876


In [23]:
# here I have used the same xtrain and ytrain from the model and it is clear that linear regression gives us a better model
# for this dataset

## Prediction for 2022 grand i10 car

In [26]:
car_predict = le.predict(np.array([2022]).reshape(1, -1))



In [27]:
car_predict

array([1585.85377039])

In [28]:
# this is scaled price of 2022 grand i10 car to get the original price we need to reverse the scaling we did on the database