In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load Data

In [2]:
df = pd.read_csv('../data/CarPrice_Assignment.csv')

In [3]:
df.shape

(205, 26)

In [4]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


# Data Preprocessing

In [5]:
df.dtypes

car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

In [6]:
# Drop the car_ID column (it appears to be a running row id in the preview).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe
# to re-run: the original raised KeyError once 'car_ID' had been removed.
df = df.drop(columns=['car_ID'], errors='ignore')

In [7]:
# Map every object-dtype column to its number of distinct values.
object_cols = {
    name: df[name].nunique()
    for name, dtype in df.dtypes.items()
    if dtype == object
}

In [8]:
object_cols

{'CarName': 147,
 'fueltype': 2,
 'aspiration': 2,
 'doornumber': 2,
 'carbody': 5,
 'drivewheel': 3,
 'enginelocation': 2,
 'enginetype': 7,
 'cylindernumber': 7,
 'fuelsystem': 8}

In [9]:
# CarName is handled separately further down (reduced to a company name), so
# exclude it from the generic categorical columns. Passing a default makes the
# cell re-runnable: the original raised KeyError once the key was gone.
object_cols.pop('CarName', None)

147

In [10]:
# Cast the low-cardinality text columns to pandas' categorical dtype.
df = df.astype(
    dict.fromkeys(
        [
            'fueltype',
            'aspiration',
            'doornumber',
            'carbody',
            'drivewheel',
            'enginelocation',
            'enginetype',
            'cylindernumber',
            'fuelsystem',
        ],
        'category',
    )
)

In [11]:
df.dtypes

symboling              int64
CarName               object
fueltype            category
aspiration          category
doornumber          category
carbody             category
drivewheel          category
enginelocation      category
wheelbase            float64
carlength            float64
carwidth             float64
carheight            float64
curbweight             int64
enginetype          category
cylindernumber      category
enginesize             int64
fuelsystem          category
boreratio            float64
stroke               float64
compressionratio     float64
horsepower             int64
peakrpm                int64
citympg                int64
highwaympg             int64
price                float64
dtype: object

In [12]:
df.head()

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## Check Missing Values

In [13]:
df.isna().sum()

symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [14]:
def car_name_split(car_name):
    """Return the company part (first whitespace-separated token) of a car name.

    Parameters
    ----------
    car_name : str
        Full car name, e.g. ``'alfa-romero giulia'``.

    Returns
    -------
    str
        The first token of ``car_name``. An empty string is returned when
        ``car_name`` is empty or contains only whitespace.
    """
    # Only the first token is needed, so stop splitting after it.
    tokens = car_name.split(maxsplit=1)
    # A blank name produces no tokens; return '' instead of raising IndexError.
    return tokens[0] if tokens else ''

In [15]:
df['CarName'] = df['CarName'].apply(car_name_split)

In [16]:
df.rename(columns={'CarName': 'company_name'}, inplace=True)

In [17]:
df['company_name'] = df['company_name'].astype('category')

In [18]:
df['company_name'] = df['company_name'].cat.codes

In [19]:
df.dtypes

symboling              int64
company_name            int8
fueltype            category
aspiration          category
doornumber          category
carbody             category
drivewheel          category
enginelocation      category
wheelbase            float64
carlength            float64
carwidth             float64
carheight            float64
curbweight             int64
enginetype          category
cylindernumber      category
enginesize             int64
fuelsystem          category
boreratio            float64
stroke               float64
compressionratio     float64
horsepower             int64
peakrpm                int64
citympg                int64
highwaympg             int64
price                float64
dtype: object

In [20]:
# Replace each remaining categorical column with its integer category codes.
# The dtype guard makes the cell re-runnable: once a column holds plain integer
# codes it no longer has a `.cat` accessor, so the original loop raised
# AttributeError on a second execution.
# NOTE(review): the codes follow category order and carry no numeric meaning;
# consider one-hot encoding for the linear model — confirm with the author.
for col in object_cols:
    if isinstance(df[col].dtype, pd.CategoricalDtype):
        df[col] = df[col].cat.codes

In [21]:
df.head(5)

Unnamed: 0,symboling,company_name,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,1,1,0,1,0,2,0,88.6,168.8,...,130,5,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,1,1,0,1,0,2,0,88.6,168.8,...,130,5,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,1,1,0,1,2,2,0,94.5,171.2,...,152,5,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,2,1,0,0,3,1,0,99.8,176.6,...,109,5,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,2,1,0,0,3,0,0,99.4,176.6,...,136,5,3.19,3.4,8.0,115,5500,18,22,17450.0


In [22]:
df.describe()

Unnamed: 0,symboling,company_name,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,...,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,14.468293,0.902439,0.180488,0.439024,2.614634,1.326829,0.014634,98.756585,174.049268,...,126.907317,3.253659,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,1.245307,7.486931,0.297446,0.385535,0.497483,0.859081,0.556171,0.120377,6.021776,12.337289,...,41.642693,2.013204,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.6,141.1,...,61.0,0.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,0.0,7.0,1.0,0.0,0.0,2.0,1.0,0.0,94.5,166.3,...,97.0,1.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,1.0,14.0,1.0,0.0,0.0,3.0,1.0,0.0,97.0,173.2,...,120.0,5.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,2.0,22.0,1.0,0.0,1.0,3.0,2.0,0.0,102.4,183.1,...,141.0,5.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,3.0,27.0,1.0,1.0,1.0,4.0,2.0,1.0,120.9,208.1,...,326.0,7.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


# Feature Selection

In [23]:
df.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,symboling,company_name,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
symboling,1.0,-0.091524,0.194311,-0.059866,0.664073,-0.596135,-0.041671,0.212471,-0.531954,-0.357612,-0.232919,-0.541038,-0.227691,0.050372,0.197762,-0.10579,0.091163,-0.130051,-0.008735,-0.178515,0.070873,0.273606,-0.035823,0.034606,-0.079978
company_name,-0.091524,1.0,-0.063029,0.021522,-0.164705,0.096025,-0.041645,0.052188,0.006009,0.053227,-0.074974,0.198579,-0.053644,-0.074119,0.049168,-0.15954,0.138105,0.199395,-0.202581,0.095275,-0.089892,-0.138929,0.083559,0.09535,-0.231594
fueltype,0.194311,-0.063029,1.0,-0.401397,0.191491,-0.147853,-0.132257,0.04007,-0.308346,-0.212679,-0.23388,-0.284631,-0.217275,0.082695,0.110617,-0.069594,0.041529,-0.054451,-0.241829,-0.984356,0.163926,0.476883,-0.255963,-0.191392,-0.105679
aspiration,-0.059866,0.021522,-0.401397,1.0,-0.031792,0.063028,0.066465,-0.057191,0.257611,0.234539,0.300567,0.087311,0.324902,-0.102963,-0.133119,0.108217,0.288086,0.212614,0.222982,0.295541,0.241685,-0.183383,-0.202362,-0.254416,0.177926
doornumber,0.664073,-0.164705,0.191491,-0.031792,1.0,-0.680358,0.098954,0.137757,-0.447357,-0.398568,-0.207168,-0.552208,-0.197379,0.062431,0.154322,-0.020742,0.015519,-0.119258,0.011082,-0.177888,0.126947,0.247668,0.012417,0.03633,-0.031835
carbody,-0.596135,0.096025,-0.147853,0.063028,-0.680358,1.0,-0.155745,-0.277009,0.401362,0.334433,0.13171,0.568534,0.128467,-0.037024,-0.048408,-0.073352,-0.065079,0.010549,-0.015325,0.136243,-0.153928,-0.109643,0.031697,-0.00717,-0.083976
drivewheel,-0.041671,-0.041645,-0.132257,0.066465,0.098954,-0.155745,1.0,0.147865,0.459745,0.485649,0.470751,-0.019719,0.575111,-0.116823,0.223238,0.524307,0.424686,0.481827,0.071591,0.127479,0.518686,-0.039417,-0.449581,-0.45222,0.577992
enginelocation,0.212471,0.052188,0.04007,-0.057191,0.137757,-0.277009,0.147865,1.0,-0.18779,-0.050989,-0.051698,-0.106234,0.050468,0.114127,0.135541,0.196826,0.105971,0.185042,-0.138455,-0.019762,0.317839,0.198461,-0.153487,-0.102026,0.324973
wheelbase,-0.531954,0.006009,-0.308346,0.257611,-0.447357,0.401362,0.459745,-0.18779,1.0,0.874587,0.795144,0.589435,0.776386,-0.135577,-0.184596,0.569329,0.384601,0.48875,0.160959,0.249786,0.353294,-0.360469,-0.470414,-0.544082,0.577816
carlength,-0.357612,0.053227,-0.212679,0.234539,-0.398568,0.334433,0.485649,-0.050989,0.874587,1.0,0.841118,0.491029,0.877728,-0.113291,-0.109585,0.68336,0.55781,0.606454,0.129533,0.158414,0.552623,-0.287242,-0.670909,-0.704662,0.68292


In [24]:
# Absolute correlation of every column with price, shaded so that stronger
# relationships stand out.
(
    df.corr()[['price']]
    .abs()
    .style.background_gradient(cmap='Greens')
)

Unnamed: 0,price
symboling,0.079978
company_name,0.231594
fueltype,0.105679
aspiration,0.177926
doornumber,0.031835
carbody,0.083976
drivewheel,0.577992
enginelocation,0.324973
wheelbase,0.577816
carlength,0.68292


In [25]:
# Columns kept for modelling: the chosen predictors, then the target (price).
cols = [
    'company_name', 'drivewheel', 'wheelbase', 'carlength',
    'carwidth', 'curbweight', 'enginesize', 'fuelsystem',
    'boreratio', 'horsepower', 'citympg', 'highwaympg',
    'price',
]
df_selected = df.loc[:, cols]

In [26]:
df_selected.head(5)

Unnamed: 0,company_name,drivewheel,wheelbase,carlength,carwidth,curbweight,enginesize,fuelsystem,boreratio,horsepower,citympg,highwaympg,price
0,1,2,88.6,168.8,64.1,2548,130,5,3.47,111,21,27,13495.0
1,1,2,88.6,168.8,64.1,2548,130,5,3.47,111,21,27,16500.0
2,1,2,94.5,171.2,65.5,2823,152,5,2.68,154,19,26,16500.0
3,2,1,99.8,176.6,66.2,2337,109,5,3.19,102,24,30,13950.0
4,2,0,99.4,176.6,66.4,2824,136,5,3.19,115,18,22,17450.0


In [27]:
df_selected.shape

(205, 13)

# Train and Evaluate Model

In [28]:
# Separate the regression target from the predictor columns.
y = df_selected['price']
X = df_selected.drop(columns='price')

In [29]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [30]:
X_train.shape

(153, 12)

In [31]:
X_test.shape

(52, 12)

In [32]:
model = LinearRegression()

In [33]:
model.fit(X_train, y_train)

In [34]:
model.score(X_train, y_train)

0.8384061158339297

In [35]:
y_pred = model.predict(X_test)

In [36]:
# Report test-set metrics; MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print('R^2:', metrics.r2_score(y_test, y_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

R^2: 0.8391290314460698
MAE: 2365.5569267567744
MSE: 10884455.230029019
RMSE: 3299.159776371708
