In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from rdatasets import data as rdata

In [3]:
# Fetch the 'mtcars' table from R's 'datasets' package and keep only the
# DataFrame stored on the returned object's .data attribute.
cars_data = sm.datasets.get_rdataset(dataname="mtcars", package="datasets").data

In [4]:
# Preview the first few rows to confirm the expected columns were loaded.
cars_data.head()

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [5]:
# Summarise the index, column dtypes and non-null counts.
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, Mazda RX4 to Volvo 142E
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mpg     32 non-null     float64
 1   cyl     32 non-null     int64  
 2   disp    32 non-null     float64
 3   hp      32 non-null     int64  
 4   drat    32 non-null     float64
 5   wt      32 non-null     float64
 6   qsec    32 non-null     float64
 7   vs      32 non-null     int64  
 8   am      32 non-null     int64  
 9   gear    32 non-null     int64  
 10  carb    32 non-null     int64  
dtypes: float64(5), int64(6)
memory usage: 3.0+ KB


In [6]:
# Per-column descriptive statistics: count, mean, std, min, quartiles and max.
cars_data.describe()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


### Step 2: Fit the Multivariate Model

In [7]:
# Multivariate (multi-response) regression: one predictor, car weight, is used
# to model two response variables, mpg and hp, in a single fit.
X = cars_data.loc[:, ['wt']]          # predictor, kept 2-D as a one-column frame
Y = cars_data.loc[:, ['mpg', 'hp']]   # responses; column order is mpg, hp

# Build the estimator first, then train it; fit() updates the estimator itself.
multi_reg = LinearRegression()
multi_reg.fit(X, Y)

# Report the fitted parameters: one intercept and one coefficient row per response.
print("Intercepts:", multi_reg.intercept_)
print("Coefficients:", multi_reg.coef_)

Intercepts: [37.28512617 -1.82092177]
Coefficients: [[-5.34447157]
 [46.16005028]]


### Step 3: Model Evaluation

In [8]:
# In-sample predictions from the weight-only model; column 0 is mpg, column 1 is hp.
Y_pred = multi_reg.predict(X)

# Score each response on its own by pairing every response column with the
# matching column of the prediction array.
r2_mpg, r2_hp = (
    r2_score(Y[name], Y_pred[:, col])
    for col, name in enumerate(['mpg', 'hp'])
)

print(f"R-squared for mpg: {r2_mpg:.3f}")
print(f"R-squared for hp: {r2_hp:.3f}")

R-squared for mpg: 0.753
R-squared for hp: 0.434


### Step 4: Add Quadratic Terms

In [10]:
# Derive the squared-weight feature.
# NOTE: this writes a new column into cars_data itself, so the frame now has one
# more column than the head()/info()/describe() outputs shown earlier.
cars_data['wt_squared'] = cars_data['wt'].pow(2)

# Predictor matrix holding both the linear and the quadratic weight term.
X_quad = cars_data.loc[:, ['wt', 'wt_squared']]

# Refit against the same two responses (Y is reused unchanged).
multi_reg_quad = LinearRegression()
multi_reg_quad.fit(X_quad, Y)

# Report the fitted parameters of the quadratic model.
print("Updated Intercepts:", multi_reg_quad.intercept_)
print("Updated Coefficients:", multi_reg_quad.coef_)


Updated Intercepts: [ 49.93081095 -76.73441384]
Updated Coefficients: [[-13.38033708   1.17108689]
 [ 93.76480697  -6.93756093]]


### Step 5: Compare Models Using R²

In [11]:
# In-sample predictions from the quadratic model; column 0 is mpg, column 1 is hp.
Y_pred_quad = multi_reg_quad.predict(X_quad)

# Score each response separately, exactly as was done for the weight-only model.
r2_mpg_quad, r2_hp_quad = (
    r2_score(Y[name], Y_pred_quad[:, col])
    for col, name in enumerate(['mpg', 'hp'])
)

print(f"Updated R-squared for mpg: {r2_mpg_quad:.3f}")
print(f"Updated R-squared for hp: {r2_hp_quad:.3f}")

Updated R-squared for mpg: 0.819
Updated R-squared for hp: 0.452


1) R-squared for Base Model: Indicates the performance of the model using only wt as the predictor.
2) R-squared for Quadratic Model: Measures the performance when the quadratic term (wt²) is included.


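Note: If the R-squared values improve for the quadratic model, it suggests that the nonlinear term adds explanatory power. However, plain R-squared does not account for model complexity: it never decreases when a predictor is added to a least-squares model, so a small gain (such as the one for hp) is not by itself evidence of a better model. Use adjusted R-squared, or a significance test on the quadratic term, to judge whether the extra term is worthwhile.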

### Step 6: Final Model Comparison

In [12]:
# Collect both models' R-squared scores column by column; rows are ordered mpg, hp.
# NOTE(review): 'Quadractic' is a misspelling of 'Quadratic'. It is left untouched
# here because the string is printed as a table header.
data = {}
data['Metric'] = ['R-squared for mpg', 'R-squared for hp']
data['GLM Model'] = [r2_mpg, r2_hp]                     # weight-only fit
data['Quadractic Model'] = [r2_mpg_quad, r2_hp_quad]    # weight + weight-squared fit

# Turn the collected scores into a table.
df = pd.DataFrame(data)

# Print the table in Markdown form without the row index
# (to_markdown relies on the optional 'tabulate' package).
print(df.to_markdown(index=False))

| Metric            |   GLM Model |   Quadractic Model |
|:------------------|------------:|-------------------:|
| R-squared for mpg |    0.752833 |           0.819061 |
| R-squared for hp  |    0.433949 |           0.451908 |
