In [1]:
#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

In [2]:
# Load the Auto MPG data from a CSV file located relative to the current
# working directory, then preview the first five rows.
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
# Summarise each column's dtype and non-null count. A numeric-looking column
# reported as `object` holds strings and will need cleaning before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null int64
acceleration    398 non-null float64
model year      398 non-null int64
origin          398 non-null int64
car name        398 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [4]:
# Dimensions of the DataFrame as (number of rows, number of columns).
df.shape

(398, 9)

In [5]:
# Summary statistics for the numeric columns. With default arguments,
# object-dtype columns are left out of this table.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [6]:
# Swap the numeric origin codes for readable region labels.
# `replace` leaves any value not listed in the mapping untouched, so running
# this cell a second time is harmless.
origin_names = {1: 'america', 2: 'europe', 3: 'asia'}
df['origin'] = df['origin'].replace(origin_names)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,america,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,america,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,america,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,america,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,america,ford torino


# Exploratory Data Analysis

## 1. Create Dummy Variables

In [7]:
# One-hot encode `origin`: the column is replaced by one indicator column per
# region label (origin_<label>).
df = pd.get_dummies(data=df, columns=['origin'])
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,car name,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,chevrolet chevelle malibu,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,buick skylark 320,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,plymouth satellite,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,amc rebel sst,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,ford torino,1,0,0


## 2. Dealing with Missing Values

In [8]:
# Percentage of missing (NaN) entries per column. Taking the mean of the
# boolean mask avoids hard-coding the row count, so the figure stays correct
# if the number of rows ever changes.
# NOTE: placeholder strings such as '?' are not NaN and are not counted here.
df.isna().mean() * 100

mpg               0.0
cylinders         0.0
displacement      0.0
horsepower        0.0
weight            0.0
acceleration      0.0
model year        0.0
car name          0.0
origin_america    0.0
origin_asia       0.0
origin_europe     0.0
dtype: float64

## 3. Cleaning the Data
Replace the `?` placeholders in `horsepower` with the column median, then convert the column to an integer dtype.

In [9]:
# Missing horsepower readings are stored as the string '?'.
# Turn them into NaN and parse the rest of the column as floats.
hp = df['horsepower'].replace('?', np.nan).astype(float)

# Impute the gaps with the median of the observed values, then store the
# column as integers (astype(int) truncates any fractional part).
# NOTE(review): the median is computed on the full data set, before the
# train/test split further down - confirm this is acceptable.
df['horsepower'] = hp.fillna(hp.median()).astype(int)

# Preparing X and y

In [10]:
# The free-text car name is not used as a model feature, so remove it.
df = df.drop(columns=['car name'])

In [11]:
# Set up the inputs for the linear model.

# Dependent variable (target): miles per gallon.
y = df['mpg']

# Independent variables (features): every other column except one origin
# indicator, `origin_europe` - presumably left out as the baseline level so
# the three origin dummies are not redundant with the intercept.
X = df.drop(columns=['mpg', 'origin_europe'])

# Splitting Data into Training and Testing Sets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1
)

In [13]:
# Sanity-check the split sizes: train features/target, then test features/target.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(278, 8)
(278,)
(120, 8)
(120,)


# Fit Linear Model

In [14]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline. `fit` returns the fitted estimator, so the
# model can be created and trained in one expression.
lr = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Coefficients Calculation

In [15]:
# Print the fitted coefficients, one per feature, in the column order of X_train.
print(lr.coef_)

[-0.39474991  0.02894837 -0.02177415 -0.00735148  0.0618036   0.83691196
 -3.00171567 -0.60573691]


In [21]:
# Print the intercept (the constant term of the fitted linear model).
print(lr.intercept_)

-18.280167206942483


# Create a regularized RIDGE model and note the coefficients

In [19]:
from sklearn.linear_model import Ridge

# L2-regularised linear model; alpha sets the strength of the penalty.
# NOTE(review): the features are not standardised before fitting, so the
# penalty weighs columns on very different scales unequally - confirm intended.
ridge = Ridge(alpha=.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [-0.39172117  0.02873297 -0.02168098 -0.00734619  0.06154672  0.83597572
 -2.96046703 -0.57890908]


# Create a regularized LASSO model and note the coefficients

In [20]:
from sklearn.linear_model import Lasso

# L1-regularised linear model; alpha sets the strength of the penalty.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# The L1 penalty can shrink coefficients to exactly 0; any coefficient printed
# as 0 means that feature has effectively been dropped from the model.

Lasso model: [-0.          0.01667624 -0.02100086 -0.00709245  0.01768383  0.8045868
 -1.75545136 -0.        ]


# Let us compare their scores