In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the CSV below is reachable
# (uses google.colab, so this cell only runs as-is inside Google Colab).
drive.mount("/content/drive/")
# Read the car-price dataset from the mounted drive into a DataFrame.
cars=pd.read_csv('/content/drive/My Drive/CarPrice_Assignment.csv')

Mounted at /content/drive/


In [None]:
# Preview the first rows of the loaded dataset.
cars.head()

In [None]:
# symboling: ranges from -2 (least risky) to +3 (most risky)
# Most cars are rated 0, 1 or 2
# Count the number of cars at each symboling level.
cars['symboling'].astype('category').value_counts()

In [None]:
# aspiration: an (internal combustion) engine property showing
# whether the oxygen intake is standard (at atmospheric pressure)
# or turbocharged (pressurised oxygen intake)

# Count the number of cars for each aspiration type.
cars['aspiration'].astype('category').value_counts()

In [None]:
# drivewheel: front-wheel, rear-wheel or four-wheel drive
# Count the number of cars for each drive-wheel configuration.
cars['drivewheel'].astype('category').value_counts()

In [None]:
# target variable: price of car
# Histogram of the target with a KDE overlay. sns.distplot is deprecated in
# seaborn, so use histplot instead; stat="density" keeps the density-normalised
# y-axis that distplot produced.
sns.histplot(cars['price'], kde=True, stat="density")
plt.show()

In [None]:
# all numeric (float and int) variables in the dataset
# 'number' matches every numeric dtype, so a numeric column is not silently
# left out when it is stored with a width other than float64 / default int
# (e.g. float32 or an unsigned integer).
cars_numeric = cars.select_dtypes(include='number')
cars_numeric.head()

In [None]:
# remove the symboling and car_ID columns from the numeric subset
cars_numeric = cars_numeric.drop(columns=['symboling', 'car_ID'])
cars_numeric.head()

In [None]:
# correlation matrix: pairwise correlations between all remaining numeric columns
cor = cars_numeric.corr()
# display the matrix as the cell output
cor

In [None]:
# Correlation heatmap drawn on an explicit 16 x 8 inch figure,
# with the coefficient written inside every cell.
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(cor, annot=True, cmap="YlGnBu", ax=ax)
plt.show()

Data Cleaning

In [None]:
# converting symboling to categorical: it was selected as a numeric column above,
# but casting it to object dtype lets the later object-dtype selection pick it up
# for dummy encoding instead of treating it as a quantity.
cars['symboling'] = cars['symboling'].astype('object')
# print column dtypes and non-null counts to confirm the change
cars.info()

In [None]:
import re

# regex: the first word of the name, optionally followed by a hyphen and a
# second word (a hyphenated make such as "aaa-bbb" is kept whole).
# The hyphenated part is an optional non-capturing group so that a
# one-character first word still matches; the earlier pattern r'\w+-?\w+'
# needed at least two word characters and would skip past such a word and
# return a later token instead.
p = re.compile(r'\w+(?:-\w+)?')
carnames = cars['CarName'].apply(lambda x: re.findall(p, x)[0])
print(carnames)

In [15]:
# Add car_company: the first regex match (pattern p) found in each CarName
cars['car_company'] = cars['CarName'].map(lambda name: p.findall(name)[0])

In [None]:
# look at all extracted company names and how often each occurs
# (useful for spotting misspelled or inconsistently cased variants)
cars['car_company'].astype('category').value_counts()

In [17]:
# Normalise misspelled / inconsistently cased car_company values.
# Each key is a variant spelling; its value is the canonical company name.
company_fixes = {
    "vw": 'volkswagen',
    "vokswagen": 'volkswagen',
    "porcshce": 'porsche',
    "toyouta": 'toyota',
    "Nissan": 'nissan',
    "maxda": 'mazda',
}

# Exact-match replacement; values not listed above are left unchanged.
cars['car_company'] = cars['car_company'].replace(company_fixes)

In [18]:
# CarName is no longer needed once car_company has been derived from it
cars = cars.drop(columns='CarName')


In [None]:
# Re-check columns, dtypes and non-null counts after the cleaning steps.
cars.info()

In [None]:
# outliers: summary statistics of the numeric columns, used to eyeball
# extreme minimum / maximum values
cars.describe()

Data Preparation

In [21]:
# Predictors: the explicit list of feature columns fed to the model
feature_columns = [
    'symboling', 'fueltype', 'aspiration', 'doornumber',
    'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
    'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginetype', 'cylindernumber', 'enginesize', 'fuelsystem',
    'boreratio', 'stroke', 'compressionratio', 'horsepower',
    'peakrpm', 'citympg', 'highwaympg', 'car_company',
]
X = cars[feature_columns]

# Target: the car price
y = cars['price']

In [None]:
# creating dummy variables for categorical variables

# subset all categorical variables: every object-dtype column of X
# (this includes symboling, which was cast to object earlier)
cars_categorical = X.select_dtypes(include=['object'])
cars_categorical.head()

In [None]:
# convert into dummies: one indicator column per category level;
# drop_first=True omits the first level of each variable so the
# indicators of one variable are not perfectly collinear
cars_dummies = pd.get_dummies(cars_categorical, drop_first=True)
cars_dummies.head()

In [24]:
# remove the original categorical columns; their dummy encodings replace them
X = X.drop(columns=cars_categorical.columns)

In [25]:
# append the dummy columns to the right of the remaining numeric features
X = pd.concat((X, cars_dummies), axis='columns')

In [None]:
# scaling the features
from sklearn.preprocessing import scale

# scale() returns a plain numpy array, so both the column names and the row
# index are lost. Pass them back to the DataFrame constructor so X keeps the
# same row labels as y (the rows stay aligned even if the index is not the
# default 0..n-1 range).
# NOTE(review): the scaling statistics are computed on all rows before the
# train/test split below, so test rows influence the transform; consider
# fitting the scaler on the training split only.
cols = X.columns
X = pd.DataFrame(scale(X), columns=cols, index=X.index)
X.columns

In [27]:
# split into train and test
from sklearn.model_selection import train_test_split
# 70 % of the rows for training, 30 % held out; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

Model Building and Evaluation

In [None]:
# First model: ordinary linear regression on every feature.
# fit() returns the estimator itself, so creation and fitting are chained.
lm = LinearRegression().fit(X_train, y_train)

# show the fitted estimator as the cell output
lm

In [None]:
# fitted coefficients (one per feature column) followed by the intercept,
# each on its own line
print(lm.coef_, lm.intercept_, sep="\n")

Output

In [30]:
# metrics
from sklearn.metrics import r2_score

# predictions for the held-out rows
y_pred = lm.predict(X_test)

# coefficient of determination on the test split
test_r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("r2 value :\n", test_r2)
print("\n")
print("Test data price prediction :\n", y_pred)


r2 value :
 0.838262139339907


Test data price prediction :
 [ 8581.68389891  9512.49895049 10772.03109221  8884.52027714
  5238.40555789 10613.85176559 16322.50986552 12988.08765548
 17424.77477384 16723.40495462 19726.36536991 15619.17741099
 15000.23646241  8188.41554271 47047.04701824  7589.957387
  7758.67740424 15032.2145857  12857.8795648  13777.37062632
 17246.15860969 17243.31766033 34028.          5293.72918867
 11941.87002288 14487.7170506  14983.28576094 25966.90558453
 20200.64539873 10854.06473978  5973.38458214 33406.8456167
 16757.62194314 21665.7932257  11325.14449017 12053.22847161
 34905.10848369 12092.95030173  6459.0853351   9873.89833226
 43596.62426288 12305.60541374  7449.76616094  6740.73552466
  3827.09409615 11000.4378225   7015.177847    8229.98173797
  9239.33803817 10444.51148012  6698.79869848 11923.23136008
  7453.47571736  9146.59937032 20306.03664936  7060.97505508
  6367.30200206  6905.46960415 14164.38681051  8755.07698474
  6835.73775641 38205.4933

In [None]:
# Residuals (actual minus predicted) drawn as a line over the test-row position
c = list(range(len(y_pred)))
residuals = y_test - y_pred

fig = plt.figure()
plt.plot(c, residuals, color="blue", linewidth=2.5, linestyle="-")
fig.suptitle('Error Terms', fontsize=20)   # figure heading
plt.xlabel('Index', fontsize=18)           # position of the test row
plt.ylabel('ytest-ypred', fontsize=16)     # residual value
plt.show()

In [None]:
# Plotting the error terms to understand the distribution.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the same density-normalised histogram with a KDE overlay.
fig = plt.figure()
sns.histplot((y_test-y_pred), bins=50, kde=True, stat="density")
fig.suptitle('Error Terms', fontsize=20)                  # Plot heading
plt.xlabel('y_test-y_pred', fontsize=18)                  # X-label: residual value
# The y-axis shows the density of residuals, not a row index.
plt.ylabel('Density', fontsize=16)                        # Y-label
plt.show()

In [None]:
# Distribution of the target (price) with 50 bins, for comparison with the
# residual distribution. sns.distplot is deprecated in seaborn; histplot with
# kde=True and stat="density" is the equivalent call.
sns.histplot(cars['price'], bins=50, kde=True, stat="density")
plt.show()