In [4]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [5]:
data = pd.read_csv("CarPrice_Assignment.csv")

In [6]:
data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [7]:
data.shape

(205, 26)

In [8]:
data.isnull().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [9]:
data.dtypes

car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

In [10]:
cat_cols = [col for col in data.columns if data[col].dtype == object]
num_cols = [col for col in data.columns if col not in cat_cols]

In [11]:
cat_cols

['CarName',
 'fueltype',
 'aspiration',
 'doornumber',
 'carbody',
 'drivewheel',
 'enginelocation',
 'enginetype',
 'cylindernumber',
 'fuelsystem']

In [12]:
[data[col].value_counts() for col in cat_cols]

[toyota corona           6
 toyota corolla          6
 peugeot 504             6
 subaru dl               4
 mitsubishi mirage g4    3
                        ..
 mazda glc 4             1
 mazda rx2 coupe         1
 maxda glc deluxe        1
 maxda rx3               1
 volvo 246               1
 Name: CarName, Length: 147, dtype: int64,
 gas       185
 diesel     20
 Name: fueltype, dtype: int64,
 std      168
 turbo     37
 Name: aspiration, dtype: int64,
 four    115
 two      90
 Name: doornumber, dtype: int64,
 sedan          96
 hatchback      70
 wagon          25
 hardtop         8
 convertible     6
 Name: carbody, dtype: int64,
 fwd    120
 rwd     76
 4wd      9
 Name: drivewheel, dtype: int64,
 front    202
 rear       3
 Name: enginelocation, dtype: int64,
 ohc      148
 ohcf      15
 ohcv      13
 dohc      12
 l         12
 rotor      4
 dohcv      1
 Name: enginetype, dtype: int64,
 four      159
 six        24
 five       11
 eight       5
 two         4
 three       1

In [13]:
cat_cols = [col for col in cat_cols if col not in ["CarName"]]

In [14]:
num_cols.remove("car_ID")

In [15]:
num_cols

['symboling',
 'wheelbase',
 'carlength',
 'carwidth',
 'carheight',
 'curbweight',
 'enginesize',
 'boreratio',
 'stroke',
 'compressionratio',
 'horsepower',
 'peakrpm',
 'citympg',
 'highwaympg',
 'price']

In [16]:
data = pd.get_dummies(data, columns = cat_cols, drop_first = True)
data.head()

Unnamed: 0,car_ID,symboling,CarName,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,...,cylindernumber_three,cylindernumber_twelve,cylindernumber_two,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,1,3,alfa-romero giulia,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
1,2,3,alfa-romero stelvio,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
2,3,1,alfa-romero Quadrifoglio,94.5,171.2,65.5,52.4,2823,152,2.68,...,0,0,0,0,0,0,0,1,0,0
3,4,2,audi 100 ls,99.8,176.6,66.2,54.3,2337,109,3.19,...,0,0,0,0,0,0,0,1,0,0
4,5,2,audi 100ls,99.4,176.6,66.4,54.3,2824,136,3.19,...,0,0,0,0,0,0,0,1,0,0


In [17]:
data.drop(["car_ID", "CarName"], axis = 1 ,inplace=True)

In [18]:
y = data["price"]
x = data.drop("price", axis = 1)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y , test_size=0.20)

In [20]:
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

In [21]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [22]:
score = model.score(x_test, y_test)
score

0.9234516311524967