In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
df = pd.read_csv('Cars93.csv')

In [8]:
# select variables

In [9]:
df1 = df[['MPG.city','Cylinders','EngineSize','Horsepower','RPM','Passengers','Weight']]
df1[:5]

Unnamed: 0,MPG.city,Cylinders,EngineSize,Horsepower,RPM,Passengers,Weight
0,25,4,1.8,140,6300,5,2705
1,18,6,3.2,200,5500,5,3560
2,20,6,2.8,172,5500,5,3375
3,19,6,2.8,172,5500,6,3405
4,22,4,3.5,208,5700,4,3640


In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 7 columns):
MPG.city      93 non-null int64
Cylinders     93 non-null object
EngineSize    93 non-null float64
Horsepower    93 non-null int64
RPM           93 non-null int64
Passengers    93 non-null int64
Weight        93 non-null int64
dtypes: float64(1), int64(5), object(1)
memory usage: 5.2+ KB


In [11]:
# Cylinders is not numeric

In [12]:
df1.Cylinders.unique()

array(['4', '6', '8', '3', 'rotary', '5'], dtype=object)

In [13]:
pd.value_counts(df1.Cylinders)

4         49
6         31
8          7
3          3
5          2
rotary     1
Name: Cylinders, dtype: int64

In [14]:
df[df.Cylinders == 'rotary']

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
56,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,2895,non-USA,Mazda RX-7


In [15]:
# remove Mazda RX-7

In [16]:
df2 = df1.copy()

In [17]:
df3 = df2.drop(df2.index[56])

In [18]:
df3.iloc[52:58,]

Unnamed: 0,MPG.city,Cylinders,EngineSize,Horsepower,RPM,Passengers,Weight
52,29,4,1.6,82,5000,4,2325
53,28,4,1.8,103,5500,5,2440
54,26,4,2.5,164,5600,5,2970
55,18,6,3.0,155,5000,7,3735
57,20,4,2.3,130,5100,5,2920
58,19,6,3.2,217,5500,5,3525


In [19]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92 entries, 0 to 92
Data columns (total 7 columns):
MPG.city      92 non-null int64
Cylinders     92 non-null object
EngineSize    92 non-null float64
Horsepower    92 non-null int64
RPM           92 non-null int64
Passengers    92 non-null int64
Weight        92 non-null int64
dtypes: float64(1), int64(5), object(1)
memory usage: 5.8+ KB


In [20]:
# change Cylinders to numeric

In [21]:
df3.Cylinders = df3.Cylinders.astype('int64')

In [22]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92 entries, 0 to 92
Data columns (total 7 columns):
MPG.city      92 non-null int64
Cylinders     92 non-null int64
EngineSize    92 non-null float64
Horsepower    92 non-null int64
RPM           92 non-null int64
Passengers    92 non-null int64
Weight        92 non-null int64
dtypes: float64(1), int64(6)
memory usage: 5.8 KB


In [23]:
# correlations

In [24]:
df3.corr()

Unnamed: 0,MPG.city,Cylinders,EngineSize,Horsepower,RPM,Passengers,Weight
MPG.city,1.0,-0.687222,-0.734536,-0.670457,0.395603,-0.473828,-0.850998
Cylinders,-0.687222,1.0,0.890732,0.794648,-0.396617,0.308475,0.794336
EngineSize,-0.734536,0.890732,1.0,0.790067,-0.535751,0.350297,0.849277
Horsepower,-0.670457,0.794648,0.790067,1.0,-0.011519,0.084771,0.765391
RPM,0.395603,-0.396617,-0.535751,-0.011519,1.0,-0.431406,-0.431383
Passengers,-0.473828,0.308475,0.350297,0.084771,-0.431406,1.0,0.57214
Weight,-0.850998,0.794336,0.849277,0.765391,-0.431383,0.57214,1.0


In [25]:
# best predictor based on correlation matrix is Weight (second is EngineSize)

In [26]:
# what predictors are correlated? cylinders and engine size

In [28]:
## select predicto columns and the response

In [30]:
x0 = df3.iloc[:,1:7]

In [32]:
# selecting 6 predictors

In [33]:
# df.loc = selecting by row/col name; df.iloc = selecting by row/col number; df.at & df.iat

In [35]:
y0 = df3.iloc[:,0]

In [36]:
## model

In [37]:
import statsmodels.api as sm

In [44]:
x1 = sm.add_constant(x0)

  return ptp(axis=axis, out=out, **kwargs)


In [45]:
x = x0.to_numpy()

In [47]:
x1 = sm.add_constant(x)

In [48]:
m1 = sm.OLS

In [49]:
## fitted equation

In [50]:
# yhat = 36.92 + 0.1014 Cylinders + 0.874 EngineSize - 0.03 Horsepower + 0.0016 RPM - 0.238 Passengers - 0.0066 Weight

In [51]:
# interpret equation

In [52]:
# city mileage (miles per gallon in creases by 0.1014 for each additional cylinder)

In [53]:
# average city mileage decreases by 0.0066 for each additional pound

In [54]:
## model adequacy values

In [55]:
m1.rsquared

AttributeError: type object 'OLS' has no attribute 'rsquared'

In [56]:
# MSE

In [57]:
m1.rsquared_adj

AttributeError: type object 'OLS' has no attribute 'rsquared_adj'

In [58]:
adjr2 = m1.rsquared_adj

AttributeError: type object 'OLS' has no attribute 'rsquared_adj'

In [62]:
MST = np.var(y0)
MST

31.265949905482014

In [63]:
MST = np.var(y0, ddof = 1)
MST

31.609531772575224

In [64]:
# the adjustment is the denominator

In [65]:
MSE = MST * (1 - adjr2)
MSE

NameError: name 'adjr2' is not defined

In [67]:
# MSE is the estimted variance of city mileage
# it is 9.08 squared-miles

In [68]:
# S

In [69]:
S = np.sqrt(MSE)

NameError: name 'MSE' is not defined

In [70]:
# average distance to the fitted plane is 3.0138 miles

In [71]:
## predictions

In [72]:
# create dataframe with predictor values using a dictionary

In [76]:
values = {'constant':[1],'Cylinders':[4],'EngineSize':[2.3],'Horsepower':[200],'RPM':[5500],'Passengers':[4],'Weight':[2950]}

In [77]:
x = pd.DataFrame(values)
x

Unnamed: 0,constant,Cylinders,EngineSize,Horsepower,RPM,Passengers,Weight
0,1,4,2.3,200,5500,4,2950


In [79]:
d2 = m1.get_prediction(x)

AttributeError: type object 'OLS' has no attribute 'get_prediction'

In [81]:
d2.summary_frame(alpha=0.10)
# 90% CI

NameError: name 'd2' is not defined

In [None]:
# avg = 21.72 CI = [19,24]