##Import the Data

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [17]:
df = pd.read_csv('/content/CarPrice_Assignment.csv')
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [18]:
df.shape

(205, 26)

In [19]:
df.isnull().sum()

Unnamed: 0,0
car_ID,0
symboling,0
CarName,0
fueltype,0
aspiration,0
doornumber,0
carbody,0
drivewheel,0
enginelocation,0
wheelbase,0


In [20]:
df.info() #Another way to see the null value

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [21]:
df.drop(['car_ID', 'CarName'], axis=1, inplace=True)


In [22]:
df.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [23]:
for col in df.columns:
  if df[col].dtype == 'object':
    print(col, df[col].value_counts())
    print()

fueltype fueltype
gas       185
diesel     20
Name: count, dtype: int64

aspiration aspiration
std      168
turbo     37
Name: count, dtype: int64

doornumber doornumber
four    115
two      90
Name: count, dtype: int64

carbody carbody
sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: count, dtype: int64

drivewheel drivewheel
fwd    120
rwd     76
4wd      9
Name: count, dtype: int64

enginelocation enginelocation
front    202
rear       3
Name: count, dtype: int64

enginetype enginetype
ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: count, dtype: int64

cylindernumber cylindernumber
four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: count, dtype: int64

fuelsystem fuelsystem
mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: count, dtype: int64



##We dont want to replace one by one ... so we will use LabelEncoder that will automatically place values in it.

In [26]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in df.columns:
  if df[col].dtype == 'object':
    df[col] = le.fit_transform(df[col])

In [27]:
df.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,1,0,1,0,2,0,88.6,168.8,64.1,...,130,5,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,1,0,1,0,2,0,88.6,168.8,64.1,...,130,5,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,1,0,1,2,2,0,94.5,171.2,65.5,...,152,5,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,1,0,0,3,1,0,99.8,176.6,66.2,...,109,5,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,1,0,0,3,0,0,99.4,176.6,66.4,...,136,5,3.19,3.4,8.0,115,5500,18,22,17450.0


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   symboling         205 non-null    int64  
 1   fueltype          205 non-null    int64  
 2   aspiration        205 non-null    int64  
 3   doornumber        205 non-null    int64  
 4   carbody           205 non-null    int64  
 5   drivewheel        205 non-null    int64  
 6   enginelocation    205 non-null    int64  
 7   wheelbase         205 non-null    float64
 8   carlength         205 non-null    float64
 9   carwidth          205 non-null    float64
 10  carheight         205 non-null    float64
 11  curbweight        205 non-null    int64  
 12  enginetype        205 non-null    int64  
 13  cylindernumber    205 non-null    int64  
 14  enginesize        205 non-null    int64  
 15  fuelsystem        205 non-null    int64  
 16  boreratio         205 non-null    float64
 1

### So all data has been converted in Number / float ...

## Data training


In [29]:
X = df.drop('price', axis=1)
Y = df['price']

## Train test and split


In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2)

In [35]:
scaller = StandardScaler()
X_train_scaled = scaller.fit_transform(X_train)
X_test_scaled = scaller.transform(X_test)

##Model


In [40]:
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, Y_train)

## Train and test score


In [43]:
print("R2 score on Training (Linear Regression): ")
Y_train_pred = model_lr.predict(X_train_scaled)
print(r2_score(Y_train, Y_train_pred))

print ("R2 score on Testing (Linear Regression): ")
Y_test_pred = model_lr.predict(X_test_scaled)
print(r2_score(Y_test, Y_test_pred))

R2 score on Training (Linear Regression): 
0.8863502144708981
R2 score on Testing (Linear Regression): 
0.8357004420274707


### As we have all data ready and we have already scaled data that can be used to train linear regression on random forest.

In [57]:
model_rf = RandomForestRegressor()
model_rf.fit(X_train_scaled, Y_train)

In [58]:
print("R2 score on Training (Random Forest): ")
Y_train_pred = model_rf.predict(X_train_scaled)
print(r2_score(Y_train, Y_train_pred))

print ("R2 score on Testing (Random Forest): ")
Y_test_pred = model_rf.predict(X_test_scaled)
print(r2_score(Y_test, Y_test_pred))

R2 score on Training (Random Forest): 
0.989379331308698
R2 score on Testing (Random Forest): 
0.889319087136299


## Plynomial Regression
It is not a model on sklearn if we search it so, it is act as preprocessing

In [59]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [61]:
model_poly_lr = LinearRegression()
model_poly_lr.fit(X_train_poly, Y_train)

In [63]:
print("Train R2 score (Polynomial Regression): ")
Y_train_pred = model_poly_lr.predict(X_train_poly)
print(r2_score(Y_train, Y_train_pred))

print ("Test R2 score (Polynomial Regression): ")
Y_train_pred = model_poly_lr.predict(X_train_poly)
print(r2_score(Y_train, Y_train_pred))

Train R2 score (Polynomial Regression): 
0.9990125610573479
Test R2 score (Polynomial Regression): 
0.9990125610573479
