# Multivariate Regression
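#### Multivariate regression is a method used to measure the degree to which more than one independent variable (predictors) and more than one dependent variable (responses) are linearly related. This notebook fits the simpler case of a single response (`Price`) against several predictors, which is usually called multiple regression.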

In [8]:
import pandas as pd
import statsmodels.api as sm

In [2]:
# Read the car listings from the CSV file that sits next to this notebook.
# NOTE(review): describe() reports a count of 0 for the trailing
# "Unnamed: N" columns, so the file appears to carry empty trailing
# columns -- confirm against the raw CSV.
df = pd.read_csv(filepath_or_buffer='cars.csv')

# Bare last expression: display the loaded frame.
df

Unnamed: 0,Price,Mileage,Make,Model,Trim,Type,Cylinder,Liter,Doors,Cruise,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,17314.10313,8221,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
1,17542.03608,9135,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
2,16218.84786,13196,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
3,16336.91314,16342,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
4,16339.17032,19832,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799,16507.07027,16229,Saturn,L Series,L300 Sedan 4D,Sedan,6,3.0,4,1,...,,,,,,,,,,
800,16175.95760,19095,Saturn,L Series,L300 Sedan 4D,Sedan,6,3.0,4,1,...,,,,,,,,,,
801,15731.13290,20484,Saturn,L Series,L300 Sedan 4D,Sedan,6,3.0,4,1,...,,,,,,,,,,
802,15118.89323,25979,Saturn,L Series,L300 Sedan 4D,Sedan,6,3.0,4,1,...,,,,,,,,,,


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# the result shown covers the numeric columns, not text ones such as Make or Model.
df.describe()

Unnamed: 0,Price,Mileage,Cylinder,Liter,Doors,Cruise,Sound,Leather,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
count,804.0,804.0,804.0,804.0,804.0,804.0,804.0,804.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,21343.143767,19831.93408,5.268657,3.037313,3.527363,0.752488,0.679104,0.723881,,,,,,,,,,,,
std,9884.852801,8196.319707,1.387531,1.105562,0.850169,0.431836,0.467111,0.447355,,,,,,,,,,,,
min,8638.930895,266.0,4.0,1.6,2.0,0.0,0.0,0.0,,,,,,,,,,,,
25%,14273.073875,14623.5,4.0,2.2,4.0,1.0,0.0,0.0,,,,,,,,,,,,
50%,18024.99502,20913.5,6.0,2.8,4.0,1.0,1.0,1.0,,,,,,,,,,,,
75%,26717.316635,25213.0,6.0,3.8,4.0,1.0,1.0,1.0,,,,,,,,,,,,
max,70755.46672,50387.0,8.0,6.0,4.0,1.0,1.0,1.0,,,,,,,,,,,,


In [4]:
# Peek at the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,Price,Mileage,Make,Model,Trim,Type,Cylinder,Liter,Doors,Cruise,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,17314.10313,8221,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
1,17542.03608,9135,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
2,16218.84786,13196,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
3,16336.91314,16342,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,
4,16339.17032,19832,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,...,,,,,,,,,,


In [5]:
# Distinct model names, in order of first appearance in the data.
df['Model'].unique()

array(['Century', 'Lacrosse', 'Lesabre', 'Park Avenue', 'CST-V', 'CTS',
       'Deville', 'STS-V6', 'STS-V8', 'XLR-V8', 'AVEO', 'Cavalier',
       'Classic', 'Cobalt', 'Corvette', 'Impala', 'Malibu', 'Monte Carlo',
       'Bonneville', 'G6', 'Grand Am', 'Grand Prix', 'GTO', 'Sunfire',
       'Vibe', '9_3', '9_3 HO', '9_5', '9_5 HO', '9-2X AWD', 'Ion',
       'L Series'], dtype=object)

In [6]:
# Number of listings per model, most frequent first.
df['Model'].value_counts()

Malibu         60
Cavalier       60
AVEO           60
Ion            50
Cobalt         50
9_3 HO         40
9_5            30
Lacrosse       30
Grand Prix     30
Bonneville     30
Deville        30
Impala         30
Vibe           30
Monte Carlo    30
Park Avenue    20
9_3            20
Lesabre        20
Grand Am       20
G6             20
9_5 HO         20
Corvette       20
Century        10
CTS            10
GTO            10
Classic        10
Sunfire        10
STS-V6         10
L Series       10
STS-V8         10
CST-V          10
XLR-V8         10
9-2X AWD        4
Name: Model, dtype: int64

### Note:
##### Here is how we use `pandas.Categorical` to convert textual category data (the model name) into an ordinal number that we can work with.

In [7]:
# Integer code assigned to each row's model name; show the first 20 only.
pd.Categorical(df['Model']).codes[:20]

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 22, 22,
       22, 22, 22], dtype=int8)

In [10]:
# Encode the model name as an integer code so it can enter the regression.
# NOTE(review): these codes only label the categories (they appear to follow
# the sorted order of the names), so their numeric size carries no meaning
# even though the models below treat Model_ord as a numeric predictor.
df['Model_ord'] = pd.Categorical(df['Model']).codes

# Predictors (independent variables) as a three-column frame.
X = df.loc[:, ['Mileage', 'Model_ord', 'Doors']]
# Response (dependent variable), kept as a one-column frame.
y = df.loc[:, ['Price']]

In [13]:
# API reference: https://www.statsmodels.org/stable/api.html#statsmodels-api
#
# Ordinary Least Squares fit of Price on the three predictors.
# No constant column is added here (sm.add_constant(X), which adds a column
# of ones, is not applied), so the model is fitted without an intercept and
# the summary reports "R-squared (uncentered)".
check = sm.OLS(endog=y, exog=X).fit()

In [14]:
# Render the fitted model's summary: fit statistics, the coefficient table,
# and residual diagnostics.
check.summary()

0,1,2,3
Dep. Variable:,Price,R-squared (uncentered):,0.768
Model:,OLS,Adj. R-squared (uncentered):,0.767
Method:,Least Squares,F-statistic:,883.1
Date:,"Sat, 31 Jul 2021",Prob (F-statistic):,1.95e-253
Time:,21:09:02,Log-Likelihood:,-8646.5
No. Observations:,804,AIC:,17300.0
Df Residuals:,801,BIC:,17310.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Mileage,0.1700,0.043,3.963,0.000,0.086,0.254
Model_ord,203.2116,43.022,4.723,0.000,118.763,287.660
Doors,3923.7534,281.702,13.929,0.000,3370.792,4476.715

0,1,2,3
Omnibus:,221.757,Durbin-Watson:,0.176
Prob(Omnibus):,0.0,Jarque-Bera (JB):,524.713
Skew:,1.465,Prob(JB):,1.15e-114
Kurtosis:,5.661,Cond. No.,15100.0


In [4]:
# Average price for each door count (2-door vs 4-door cars).
y.groupby(df['Doors']).mean()

Unnamed: 0_level_0,Price
Doors,Unnamed: 1_level_1
2,23807.13552
4,20580.670749


Surprisingly, more doors do not mean a higher price: on average the 2-door cars here cost more than the 4-door cars. So it's not surprising that the door count is pretty useless as a predictor here. This is a very small data set, however, so we can't really read much meaning into it.

In [15]:
# Generalized Least Squares on the same response and predictors.
# No error-covariance structure is passed in, and the coefficient table
# shown below matches the OLS fit.
check = sm.GLS(endog=y, exog=X).fit()

check.summary()

0,1,2,3
Dep. Variable:,Price,R-squared (uncentered):,0.768
Model:,GLS,Adj. R-squared (uncentered):,0.767
Method:,Least Squares,F-statistic:,883.1
Date:,"Sat, 31 Jul 2021",Prob (F-statistic):,1.95e-253
Time:,21:09:52,Log-Likelihood:,-8646.5
No. Observations:,804,AIC:,17300.0
Df Residuals:,801,BIC:,17310.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Mileage,0.1700,0.043,3.963,0.000,0.086,0.254
Model_ord,203.2116,43.022,4.723,0.000,118.763,287.660
Doors,3923.7534,281.702,13.929,0.000,3370.792,4476.715

0,1,2,3
Omnibus:,221.757,Durbin-Watson:,0.176
Prob(Omnibus):,0.0,Jarque-Bera (JB):,524.713
Skew:,1.465,Prob(JB):,1.15e-114
Kurtosis:,5.661,Cond. No.,15100.0


In [16]:
# Weighted Least Squares on the same response and predictors.
# No weights are passed in, and the coefficient table shown below
# matches the OLS fit.
check = sm.WLS(endog=y, exog=X).fit()

check.summary()

0,1,2,3
Dep. Variable:,Price,R-squared (uncentered):,0.768
Model:,WLS,Adj. R-squared (uncentered):,0.767
Method:,Least Squares,F-statistic:,883.1
Date:,"Sat, 31 Jul 2021",Prob (F-statistic):,1.95e-253
Time:,21:10:26,Log-Likelihood:,-8646.5
No. Observations:,804,AIC:,17300.0
Df Residuals:,801,BIC:,17310.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Mileage,0.1700,0.043,3.963,0.000,0.086,0.254
Model_ord,203.2116,43.022,4.723,0.000,118.763,287.660
Doors,3923.7534,281.702,13.929,0.000,3370.792,4476.715

0,1,2,3
Omnibus:,221.757,Durbin-Watson:,0.176
Prob(Omnibus):,0.0,Jarque-Bera (JB):,524.713
Skew:,1.465,Prob(JB):,1.15e-114
Kurtosis:,5.661,Cond. No.,15100.0


In [17]:
# Generalized Linear Model on the same response and predictors.
# No family is passed in; the summary below reports a Gaussian family with
# an identity link, and the coefficients match the least-squares fits
# (the inference columns use z statistics instead of t).
check = sm.GLM(endog=y, exog=X).fit()

check.summary()

0,1,2,3
Dep. Variable:,Price,No. Observations:,804.0
Model:,GLM,Df Residuals:,801.0
Model Family:,Gaussian,Df Model:,2.0
Link Function:,identity,Scale:,128890000.0
Method:,IRLS,Log-Likelihood:,-8646.5
Date:,"Sat, 31 Jul 2021",Deviance:,103240000000.0
Time:,21:13:29,Pearson chi2:,103000000000.0
No. Iterations:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Mileage,0.1700,0.043,3.963,0.000,0.086,0.254
Model_ord,203.2116,43.022,4.723,0.000,118.890,287.533
Doors,3923.7534,281.702,13.929,0.000,3371.628,4475.879
