<a href="https://colab.research.google.com/github/agkloth/python_projects/blob/main/multiple_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing packages

import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm

from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize , poly)

# Simple Linear Regression

Boston = load_data("Boston")
print(Boston.columns)

X = pd.DataFrame({'intercept': np.ones(Boston.shape[0]),'lstat': Boston['lstat']})
print(X[:4])
# creating new DataFrame using pandas library
# intercept: creating column called 'intercept'; assigns an array of ones
# (using np.ones()) with the same number of rows as Boston DataFrame
# purpose of adding this column of ones is to act as the intercept in regression model.
# Boston.shape[0] gives number of rows in the Boston dataset

y=Boston['medv']
model = sm.OLS(y,X)
results=model.fit()
#extracting response variable medv
#sm.OLS specifies model
#model.fit() does actual fitting

summarize(results)

# Prediction and Confidence Intervals

new_df = pd.DataFrame({'lstat':[5, 10, 15]})
design = MS(['lstat'])
X = design.fit_transform(Boston)
newX = design.transform(new_df)
print(newX)
# create a new data frame for variable lstat with values for this
# variable we want to make predictions
# MS stands for ModelSpec, function from ISLP model packages
# argument ['lstat'] means that specifying 'lstat' as predictor variable in design matrix
# design is an object of ModelSpec class; contains info on which columns to be
# used
# fit_transform() method looks at data to see which columns are present and does transformations
# now design object fitted with boston data
# and transform () method creates correspoding corresponding matrix model

new_predictions = results.get_prediction(newX)
new_predictions.predicted_mean
# compute predictions at newX and view them by using predicted_mean attribute

new_predictions.conf_int(alpha=0.05)
# producing confidence intervals for predicted values

new_predictions.conf_int(obs=True, alpha=0.05)
# producing prediction intervals for predicted values

# Multiple Linear Regression

X = MS(['lstat', 'age']).fit_transform(Boston)
model1 = sm.OLS(y, X)
results1 = model1.fit()
summarize(results1)
#use ModelSpec() transform to construct required model matrix and response

terms = Boston.columns.drop('medv')
terms
# short-cut way to perform regression using all predictors; instead, taking out
# response variable

X=MS(terms).fit_transform(Boston)
model2 = sm.OLS(y,X)
results2 = model2.fit()
summarize(results2)

# Qualitative Predictors

Carseats = load_data('Carseats')
print(Carseats.columns)
# predictor ShelveLoc takes on three possible values, Bad, Medium, and Good
# given qualitative variable ModelSpec() generates dummy variables automatically
# their columns sum to one, so to avoid collinearity with an intercept,
# the first column is dropped

allvars = list(Carseats.columns.drop('Sales'))
y = Carseats['Sales']
final = allvars + [('Income', 'Advertising'),
('Price', 'Age')]
X = MS(final).fit_transform(Carseats)
model = sm.OLS(y, X)
summarize(model.fit())
# removing Sales as it is our response variable we trying to predict
# turning column into python list
# assigning Sales column as response variable
# concatenating allvars and two tuples
# two tuples correspond to two interaction variables
# .fit() method estimates coefficients for each predictor variable
# by minimizing the error between the predicted and actual values of response
# variable
# in the output, you will see column ShelveLoc[Bad] dropped, since it is the
# first level of ShelveLoc

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'lstat', 'medv'],
      dtype='object')
   intercept  lstat
0        1.0   4.98
1        1.0   9.14
2        1.0   4.03
3        1.0   2.94
   intercept  lstat
0        1.0      5
1        1.0     10
2        1.0     15
Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
       'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],
      dtype='object')


Unnamed: 0,coef,std err,t,P>|t|
intercept,6.5756,1.009,6.519,0.0
CompPrice,0.0929,0.004,22.567,0.0
Income,0.0109,0.003,4.183,0.0
Advertising,0.0702,0.023,3.107,0.002
Population,0.0002,0.0,0.433,0.665
Price,-0.1008,0.007,-13.549,0.0
ShelveLoc[Good],4.8487,0.153,31.724,0.0
ShelveLoc[Medium],1.9533,0.126,15.531,0.0
Age,-0.0579,0.016,-3.633,0.0
Education,-0.0209,0.02,-1.063,0.288


In [5]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm

from ISLP import load_data

# Fit a multiple regression model to predict Sales using Price, Urban, and US.
Carseats = load_data('Carseats')
print(Carseats.columns)

Carseats['Urban'] = Carseats['Urban'].map({'Yes': 1, 'No': 0})
Carseats['US'] = Carseats['US'].map({'Yes': 1, 'No':0})
X = Carseats[['Price', 'Urban', 'US']]
y = Carseats['Sales']

X = sm.add_constant(X)
model1 = sm.OLS(y, X)
results1 = model1.fit()

print(results1.summary())


Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
       'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],
      dtype='object')
                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.239
Model:                            OLS   Adj. R-squared:                  0.234
Method:                 Least Squares   F-statistic:                     41.52
Date:                Wed, 11 Sep 2024   Prob (F-statistic):           2.39e-23
Time:                        12:52:02   Log-Likelihood:                -927.66
No. Observations:                 400   AIC:                             1863.
Df Residuals:                     396   BIC:                             1879.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t| 

In [4]:
pip install islp

Collecting islp
  Downloading ISLP-0.4.0-py3-none-any.whl.metadata (7.0 kB)
Collecting lifelines (from islp)
  Downloading lifelines-0.29.0-py3-none-any.whl.metadata (3.2 kB)
Collecting pygam (from islp)
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pytorch-lightning (from islp)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics (from islp)
  Downloading torchmetrics-1.4.1-py3-none-any.whl.metadata (20 kB)
Collecting autograd-gamma>=0.3 (from lifelines->islp)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines->islp)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting scipy>=0.9 (from islp)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.3 MB/s[0m eta [36m