In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from  sklearn.preprocessing import LabelEncoder


In [None]:
# Absolute path of the Toyota Corolla CSV; change it here if the file lives elsewhere.
DATA_PATH = '/content/ToyotaCorolla - MLR.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,cc,Doors,Cylinders,Gears,Weight
0,13500,23,46986,Diesel,90,0,2000,3,4,5,1165
1,13750,23,72937,Diesel,90,0,2000,3,4,5,1165
2,13950,24,41711,Diesel,90,0,2000,3,4,5,1165
3,14950,26,48000,Diesel,90,0,2000,3,4,5,1165
4,13750,30,38500,Diesel,90,0,2000,3,4,5,1170


In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Unnamed: 0,0
Price,0
Age_08_04,0
KM,0
Fuel_Type,0
HP,0
Automatic,0
cc,0
Doors,0
Cylinders,0
Gears,0


In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Price      1436 non-null   int64 
 1   Age_08_04  1436 non-null   int64 
 2   KM         1436 non-null   int64 
 3   Fuel_Type  1436 non-null   object
 4   HP         1436 non-null   int64 
 5   Automatic  1436 non-null   int64 
 6   cc         1436 non-null   int64 
 7   Doors      1436 non-null   int64 
 8   Cylinders  1436 non-null   int64 
 9   Gears      1436 non-null   int64 
 10  Weight     1436 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 123.5+ KB


In [None]:
# Replace the text Fuel_Type column with integer codes so it can enter the
# numeric design matrix used by the regression below.
# NOTE(review): LabelEncoder numbers the classes in sorted label order, so the
# linear model will treat the codes as an ordered numeric scale. Fuel type is
# nominal -- consider one-hot encoding instead; confirm before relying on the
# Fuel_Type coefficient.
lb = LabelEncoder()
fuel_type_codes = lb.fit_transform(df['Fuel_Type'])
df['Fuel_Type'] = fuel_type_codes

In [None]:
# Frequency of each fuel type, reported with the original text labels.
# The column already holds LabelEncoder integer codes at this point, so the
# codes are mapped back through the fitted encoder; counting the raw column
# would list 0/1/2 instead of the fuel names.
pd.Series(lb.inverse_transform(df['Fuel_Type']), name='Fuel_Type').value_counts()

Unnamed: 0_level_0,count
Fuel_Type,Unnamed: 1_level_1
Petrol,1264
Diesel,155
CNG,17


In [None]:
# Look at the first five rows again to confirm Fuel_Type now holds integer codes.
df.head(n=5)

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,cc,Doors,Cylinders,Gears,Weight
0,13500,23,46986,1,90,0,2000,3,4,5,1165
1,13750,23,72937,1,90,0,2000,3,4,5,1165
2,13950,24,41711,1,90,0,2000,3,4,5,1165
3,14950,26,48000,1,90,0,2000,3,4,5,1165
4,13750,30,38500,1,90,0,2000,3,4,5,1170


In [None]:
# Separate the regression target (Price) from the predictor columns.
y = df["Price"]
x = df.drop(columns=["Price"])

In [None]:
# Design matrix for the VIF calculation, with an intercept column prepended.
# has_constant='skip' (the statsmodels default, spelled out here) returns the
# input unchanged when it already contains a constant column.
# NOTE(review): the VIF table printed further down has no `const` row and a
# very large VIF for Cylinders, which suggests Cylinders is constant and the
# intercept was skipped -- confirm.
x_const = sm.add_constant(x, prepend=True, has_constant='skip')

In [None]:
def calculate_vif(x):
    """Return a table with the variance inflation factor of every column of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Design matrix; each column is scored against all the others.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``feature`` (the column names of ``x``) and ``VIF``
        (the value returned by ``variance_inflation_factor`` for that column).
    """
    design = x.values
    column_count = x.shape[1]
    vif_scores = []
    for position in range(column_count):
        vif_scores.append(variance_inflation_factor(design, position))
    return pd.DataFrame({"feature": x.columns, "VIF": vif_scores})

In [None]:
def build_evaluate_model(x, y, model_name="Model", test_size=0.2, random_state=42):
    """Fit an OLS regression on a train split and report hold-out metrics.

    Parameters
    ----------
    x : pandas.DataFrame or array-like
        Predictor columns, without an intercept column.
    y : pandas.Series or array-like
        Target values aligned with ``x``.
    model_name : str
        Label printed in the report header.
    test_size : float
        Fraction of the rows held out for evaluation.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(results, summary)``: the statsmodels results object fitted on the
        training split, and the summary table produced from it.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # A predictor that never varies in the training rows is indistinguishable
    # from the intercept. Left in place it also makes add_constant's default
    # has_constant='skip' silently omit the intercept. Drop such columns
    # (reporting which ones) so the fitted model always has an explicit `const`.
    if isinstance(x_train, pd.DataFrame):
        constant_cols = [
            col for col in x_train.columns
            if x_train[col].nunique(dropna=False) <= 1
        ]
        if constant_cols:
            print(f"Dropping constant predictors: {constant_cols}")
            x_train = x_train.drop(columns=constant_cols)
            x_test = x_test.drop(columns=constant_cols)

    # has_constant='add' guarantees both matrices get the intercept column.
    # With the default 'skip', a column that happens to be constant in only one
    # of the two splits would give train and test different column sets.
    x_train_const = sm.add_constant(x_train, has_constant='add')
    x_test_const = sm.add_constant(x_test, has_constant='add')

    model = sm.OLS(y_train, x_train_const).fit()
    y_pred = model.predict(x_test_const)

    # Hold-out metrics; the R-squared inside the summary is in-sample.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Build the summary once and reuse it for printing and for the return value.
    summary = model.summary()

    print(f"\n📘 {model_name} Results")
    print("----------------------------")
    print(f"R2 Score: {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")

    print("\nModel Summary:")
    print(summary)

    return model, summary

In [None]:
# Baseline: fit on every predictor. The helper prints the hold-out metrics and
# the OLS summary, and hands back the fitted results plus that summary.
model, summary = build_evaluate_model(
    x,
    y,
    model_name="Linear Regression",
)



📘 Linear Regression Results
----------------------------
R2 Score: 0.843
RMSE: 1448.051

Model Summary:
                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.870
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     842.5
Date:                Fri, 17 Oct 2025   Prob (F-statistic):               0.00
Time:                        04:47:32   Log-Likelihood:                -9866.5
No. Observations:                1148   AIC:                         1.975e+04
Df Residuals:                    1138   BIC:                         1.980e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------

In [None]:
# Score every column of the design matrix for multicollinearity, then print
# the resulting table under a heading.
vif_table = calculate_vif(x_const)
print("\nVIF Values:")
print(vif_table)


VIF Values:
     feature          VIF
0  Age_08_04     1.918394
1         KM     1.946100
2  Fuel_Type     2.379914
3         HP     1.488481
4  Automatic     1.062652
5         cc     1.168921
6      Doors     1.186610
7  Cylinders  1756.548275
8      Gears     1.113522
9     Weight     2.327003


In [None]:
# Identify weak features (example rules):
# - High VIF (> 10), which indicates a multicollinearity issue
# - Very small coefficients (close to zero impact on the prediction)

In [None]:
# Second fit, on exactly the same x and y as the baseline call, so its metrics
# and summary match the baseline; only the printed label differs.
model2, results2 = build_evaluate_model(
    x,
    y,
    model_name="Model 2 (With weak features highlighted)",
)


📘 Model 2 (With weak features highlighted) Results
----------------------------
R2 Score: 0.843
RMSE: 1448.051

Model Summary:
                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.870
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     842.5
Date:                Fri, 17 Oct 2025   Prob (F-statistic):               0.00
Time:                        04:48:08   Log-Likelihood:                -9866.5
No. Observations:                1148   AIC:                         1.975e+04
Df Residuals:                    1138   BIC:                         1.980e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---

In [None]:
# Model 3: refit after removing the weak features

In [None]:
# Predictors judged weak, removed before the third fit.
weak_features = ["cc", "Doors","Cylinders","Gears"]
x_reduced = x.drop(columns=weak_features)

# Refit on the reduced predictor set for comparison with the earlier models.
model3, results3 = build_evaluate_model(
    x_reduced,
    y,
    model_name="Model 3 (Weak features removed)",
)


📘 Model 3 (Weak features removed) Results
----------------------------
R2 Score: 0.841
RMSE: 1454.865

Model Summary:
                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.868
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     1255.
Date:                Fri, 17 Oct 2025   Prob (F-statistic):               0.00
Time:                        04:40:13   Log-Likelihood:                -9871.2
No. Observations:                1148   AIC:                         1.976e+04
Df Residuals:                    1141   BIC:                         1.979e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------

In [None]:
# Pair every fitted model with the row label it gets in the comparison table.
fitted_models = {
    'Model 1 (Linear Regression)': model,
    'Model 2 (Weak features highlighted)': model2,
    'Model 3 (Weak features removed)': model3,
}

# In-sample goodness of fit, read from each fitted results object.
model_comparison_data = {
    'R-squared': [fit.rsquared for fit in fitted_models.values()],
    'Adj. R-squared': [fit.rsquared_adj for fit in fitted_models.values()],
}

comparison = pd.DataFrame(model_comparison_data, index=list(fitted_models))

print("\nModel Comparison:")
print(comparison)


Model Comparison:
                                     R-squared  Adj. R-squared
Model 1 (Linear Regression)           0.869503        0.868471
Model 2 (Weak features highlighted)   0.869503        0.868471
Model 3 (Weak features removed)       0.868449        0.867757


In [None]:
# Compare the three fitted models on their in-sample fit statistics.
# A DataFrame built directly from results/summary objects only stores their
# text reprs in a single column, so the numeric attributes are read from each
# fitted results object instead. AIC and BIC penalise the extra parameters,
# which makes the full and reduced models comparable.
comparison = pd.DataFrame(
    {
        'R-squared': [model.rsquared, model2.rsquared, model3.rsquared],
        'Adj. R-squared': [model.rsquared_adj, model2.rsquared_adj, model3.rsquared_adj],
        'AIC': [model.aic, model2.aic, model3.aic],
        'BIC': [model.bic, model2.bic, model3.bic],
    },
    index=[
        'Model 1 (Linear Regression)',
        'Model 2 (Weak features highlighted)',
        'Model 3 (Weak features removed)',
    ],
)
print("\nModel Comparison:")
print(comparison)


Model Comparison:
                                                   0
0  <statsmodels.regression.linear_model.Regressio...
1                              OLS Regression Res...
2                              OLS Regression Res...
