In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA


In [4]:
dsn = pd.read_csv('Regression_Dataset.csv')
dsn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     200 non-null    int64  
 1   Income                  200 non-null    int64  
 2   Education_Years         200 non-null    int64  
 3   Savings                 200 non-null    float64
 4   Existing_Loan_Amount    200 non-null    float64
 5   Monthly_Expenses        200 non-null    float64
 6   House_Loan_Eligibility  200 non-null    float64
dtypes: float64(4), int64(3)
memory usage: 11.1 KB


In [12]:

# Splitting the data into training and testing sets
X = dsn.drop(columns=['House_Loan_Eligibility'])
y = dsn['House_Loan_Eligibility']



In [9]:
# Checking multicollinearity using Variance Inflation Factor (VIF)
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    return vif_data

vif_df = calculate_vif(X)
print("Variance Inflation Factor (VIF) Analysis:")
print(vif_df)

# Flagging variables with high VIF (threshold > 5)
high_vif_features = vif_df[vif_df["VIF"] > 5]["Feature"].tolist()
print(f"Features with high multicollinearity: {high_vif_features}")

Variance Inflation Factor (VIF) Analysis:
                Feature         VIF
0                   Age    8.879676
1                Income  106.049032
2       Education_Years   27.216054
3               Savings    7.902598
4  Existing_Loan_Amount   61.984772
5      Monthly_Expenses   96.216447
Features with high multicollinearity: ['Age', 'Income', 'Education_Years', 'Savings', 'Existing_Loan_Amount', 'Monthly_Expenses']


In [11]:
# Applying PCA since all features have high VIF
pca = PCA(n_components=0.95)  # Preserve 95% variance
X_pca = pca.fit_transform(X)

In [20]:
# Print explained variance ratio
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
print("Explained Variance Ratio by PCA Components:")
print(explained_variance)
print("Cumulative Explained Variance:")
print(cumulative_variance)

Explained Variance Ratio by PCA Components:
[0.84970318 0.10909126]
Cumulative Explained Variance:
[0.84970318 0.95879444]


In [13]:
# Splitting the PCA-transformed data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [19]:
# Convert X_train to DataFrame and print
X_train_df = pd.DataFrame(X_train)
print("Transformed X_train after PCA:")
print(X_train_df.head())

Transformed X_train after PCA:
              0             1
0 -29303.860130  -1964.188299
1  37499.209274  -3425.750566
2   7985.927325   1285.588102
3  -4136.928023 -11158.101178
4  38971.963599   1877.761045


In [14]:
# Training a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [16]:
# Making predictions
y_pred = model.predict(X_test)

In [17]:
# Evaluating the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display results
print("Regression Model Performance with PCA:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-Squared: {r2:.4f}")


Regression Model Performance with PCA:
Mean Absolute Error (MAE): 18460.09
Mean Squared Error (MSE): 530994814.83
R-Squared: 0.8796
