# Multivariate Regression

### Libraries

In [1]:
import numpy as np
from numpy.core.defchararray import add
import pandas as pd
import os
import plotly
import plotly.express as px
from functools import reduce
import sklearn

np.set_printoptions(threshold=np.inf)

pd.options.display.float_format = '{:.4f}'.format

path = 'C:/Users/delightb/Desktop/Final_Project/Senior-Project/Pickle'
####   = pd.read_pickle(os.path.join(path,'abvHouse.pkl'))

In [2]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Checking the Variance Inflation Factor (VIF)
$$ V.I.F. = 1 / (1 - R^2). $$
https://etav.github.io/python/vif_factor_python.html

_Steps for Implementing VIF_
1. Run a multiple regression
2. Calculate the VIF factors
3. Inspect the factors for each predictor variable, if the VIF is between 5-10, multicolinearity is likely present and you should consider dropping the variable.

### Step 1: Run a multiple regression

In [3]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [4]:
from sklearn.linear_model import LinearRegression

df = pd.read_pickle(os.path.join(path,'oneHotEncode.pkl'))
df = df.drop(columns = ['Year'])
#df = clean_dataset(df)

lm_df = df.copy()

feature_cols = df.columns.get_values()
feature_cols = feature_cols.tolist()
feature_cols.remove('TAV_Ch')


y = lm_df.TAV_Ch
X = lm_df[feature_cols]

lm = LinearRegression()
lm.fit(X, y)


# # print intercept and coefficients
print(lm.intercept_)
print(lm.coef_)

# pair the feature names with the coefficients
#print('This pairs names with coef: ',zip(feature_cols, lm.coef_))

# predict for a new observation
#lm.predict([100, 25, 25])

# calculate the R-squared
print('R-squared score: ',lm.score(X, y))

0.019773711422831117
[ 1.70082054e-04  9.81572249e-04 -1.97234575e-03 -1.29849044e-08
  3.94135441e-02  1.77097184e-01  1.35842018e-01 -1.07286238e-01
 -4.23380777e-02  3.23513460e-02  2.18835616e-02 -2.83692346e-01
 -1.07745209e-02 -2.02993831e-02  2.31187756e-01  9.96609280e-02
  1.34812809e-01 -4.33944704e-02 -1.10975182e-01  3.34730264e-01
  1.28713449e-01 -6.23216166e-03  2.84405209e-01 -2.65330919e-02
  1.35866663e-01  1.35871388e-01 -1.44180628e-01 -4.21160878e-02
  3.48740785e-02 -4.41057681e-02  3.74156133e-03  1.75009598e-02
  8.56752215e-02  4.96108388e-02  8.53003283e-02  4.61625915e-03
 -6.66919637e-03 -4.73497238e-03  8.05822185e-03 -6.34451154e-03
  7.92452490e-03  4.97301370e-04  1.86298008e-03 -1.66274640e-03
 -1.09845395e-03  4.10599368e-04 -2.86000608e-03]
R-squared score:  0.6876177760166224


### Step 2: Calculate VIF Factors

In [5]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns


divide by zero encountered in double_scalars



### Step 3: Inspect VIF Factors

In [None]:
vif.round(1)

In [None]:
zip(feature_cols, lm.coef_)