# I. Problem 

## Initialization

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd

## Load dataset

In [5]:
# Load the house-pricing table; the path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('house_pricing.csv')

## 1. Splitting the Data into Train and Test Sets

In [7]:
# Target column first, then the predictors: everything except the target and
# the HOUSE_ID column.
y = df['house_price']
X = df.drop(columns=['house_price', 'HOUSE_ID'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Scaling the Training Data

In [9]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 3. Performing PCA on the Raw and Scaled Data

In [11]:
# PCA on the min-max-scaled features: fitted on the training split only, then
# the test split is projected with the same fitted components.
# Per the scikit-learn docs, a float n_components between 0 and 1 keeps as many
# components as are needed to reach that fraction of explained variance (95% here).
pca_model = PCA(n_components=0.95)
X_train_pca = pca_model.fit_transform(X_train_scaled)
X_test_pca = pca_model.transform(X_test_scaled)

## 4. Creating a Python Method for Model Training and Evaluation

In [13]:
def train_and_evaluate(X_train, y_train, X_test, y_test):
    """Fit a ``LinearRegression`` on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Features and targets passed to ``LinearRegression.fit``.
    X_test, y_test
        Features to predict on, and the true targets the predictions are
        scored against.

    Returns
    -------
    tuple
        ``(model, y_pred, mse, r2, coef, intercept)`` where ``model`` is the
        fitted estimator, ``y_pred`` its predictions for ``X_test``, ``mse``
        and ``r2`` the test-set mean squared error and R² score, and
        ``coef`` / ``intercept`` the estimator's ``coef_`` / ``intercept_``.
    """
    regressor = LinearRegression().fit(X_train, y_train)
    predictions = regressor.predict(X_test)

    return (
        regressor,
        predictions,
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
        regressor.coef_,
        regressor.intercept_,
    )

## 5. Running Linear Regression on Different Datasets and Evaluating Performance

In [15]:
def summarize_model(model, y_pred, mse, r2, coef, intercept):
    """Print a plain-text report of a fitted model's metrics and predictions.

    Parameters
    ----------
    model
        Accepted for symmetry with ``train_and_evaluate``'s return tuple;
        not used in the report.
    y_pred
        Predictions to print in full.
    mse, r2
        Mean squared error and R² score to report.
    coef, intercept
        Coefficients and intercept to report.

    Returns
    -------
    None
        The report is written to standard output, one item per line.
    """
    report_lines = (
        "Model Summary:",
        f"Mean Squared Error: {mse}",
        f"R² Score: {r2}",
        f"Intercept: {intercept}",
        f"Coefficients: {coef}",
        # ``!s`` mirrors how ``print`` renders a positional argument.
        f"Predictions: {y_pred!s}",
    )
    for line in report_lines:
        print(line)

## 6. Comparison of Performance

In [17]:
# 1. Baseline: linear regression on the raw (unscaled) features
(
    model_raw,
    y_pred_raw,
    mse_raw,
    r2_raw,
    coef_raw,
    intercept_raw,
) = train_and_evaluate(X_train, y_train, X_test, y_test)
summarize_model(
    model_raw,
    y_pred_raw,
    mse_raw,
    r2_raw,
    coef_raw,
    intercept_raw,
)

Model Summary:
Mean Squared Error: 2075.226230461456
R² Score: 0.9460571801503285
Intercept: -1664.1037665329898
Coefficients: [ 1.96900471e+01 -1.73561238e+02 -1.22365598e-01  4.93599569e+02
 -2.15454532e+01]
Predictions: [667.68207744 690.30413749 814.4505887  496.63227608 362.59871914
 424.09446521 751.93867749 412.66614762 375.86612939 457.06954261
 733.46434236 535.79967844 423.80766458 540.01515685 686.70833058
 458.1650717  235.79039868 597.11701801 487.35394171 326.84761875
 878.80024261 213.61038803 789.21715001 342.000966   317.54066165
 457.59415502 558.98412232 804.35193291 268.94407091 887.15282648
 451.75258548 394.20486154 278.03300097 324.74026513 274.93954647
 902.56982692 716.39408967 457.17483274 682.95114156 508.13320534
 742.91882617 757.04961826 641.72297801 318.82552816 574.22705554
 900.88207086 576.05400667 427.06671223 660.43186358 463.85939332
 475.78613366 542.17139577 367.36438069 186.48893823 829.14251143
 435.38625741 290.24985266 704.57773243 406.8579228

In [18]:
# Show the estimator fitted on the raw features as the cell output.
model_raw

In [19]:
# 2. Linear regression on PCA components of the raw (unscaled) features.
# PCA is fitted on the training split only; the test split is projected
# with the same fitted components.
pca_raw = PCA(n_components=0.95)
X_train_pca_raw = pca_raw.fit_transform(X_train)
X_test_pca_raw = pca_raw.transform(X_test)
(
    model_pca_raw,
    y_pred_pca_raw,
    mse_pca_raw,
    r2_pca_raw,
    coef_pca_raw,
    intercept_pca_raw,
) = train_and_evaluate(X_train_pca_raw, y_train, X_test_pca_raw, y_test)
summarize_model(
    model_pca_raw,
    y_pred_pca_raw,
    mse_pca_raw,
    r2_pca_raw,
    coef_pca_raw,
    intercept_pca_raw,
)

Model Summary:
Mean Squared Error: 2352.8953326441733
R² Score: 0.9388395312323444
Intercept: 554.3561940298507
Coefficients: [-3.43649124  0.05577442]
Predictions: [699.78608301 678.24360012 843.44732185 522.84126252 375.79848362
 430.98773225 722.75583977 421.22805388 374.18220445 459.82071242
 705.62827842 529.78890359 430.94013127 507.43729582 716.40892674
 460.26461391 208.76431853 626.04782392 513.06579966 319.1373351
 874.82055771 243.20014629 822.18931432 370.91171994 308.93871897
 446.13434891 575.38707636 835.4334678  238.24500076 871.62086867
 439.80317639 376.81023284 248.14993454 341.95030725 285.74410338
 884.32011975 729.62780207 486.93133088 699.18707774 490.88954585
 780.06601541 753.03608516 625.08317145 334.38077713 538.66780916
 868.87028752 592.76448321 406.99207825 656.22695667 427.44482437
 476.18067044 574.19863508 352.98634981 216.96288443 845.41514417
 440.00247175 323.27442477 720.00758014 403.53868026 854.23946121
 480.29919695 635.05251672 848.39338379 893.

In [20]:
# Show the estimator fitted on the PCA-reduced raw features as the cell output.
model_pca_raw

In [21]:
# 3. Linear regression on the min-max-scaled features
(
    model_scaled,
    y_pred_scaled,
    mse_scaled,
    r2_scaled,
    coef_scaled,
    intercept_scaled,
) = train_and_evaluate(X_train_scaled, y_train, X_test_scaled, y_test)
summarize_model(
    model_scaled,
    y_pred_scaled,
    mse_scaled,
    r2_scaled,
    coef_scaled,
    intercept_scaled,
)

Model Summary:
Mean Squared Error: 2075.226230461454
R² Score: 0.9460571801503286
Intercept: 98.35985521189889
Coefficients: [ 3934.26831544 -5015.91978608    -5.87354869  1974.398277
   -64.63635948]
Predictions: [667.68207744 690.30413749 814.4505887  496.63227608 362.59871914
 424.09446521 751.93867749 412.66614762 375.86612939 457.06954261
 733.46434236 535.79967844 423.80766458 540.01515685 686.70833058
 458.1650717  235.79039868 597.11701801 487.35394171 326.84761875
 878.80024261 213.61038803 789.21715001 342.000966   317.54066165
 457.59415502 558.98412232 804.35193291 268.94407091 887.15282648
 451.75258548 394.20486154 278.03300097 324.74026513 274.93954647
 902.56982692 716.39408967 457.17483274 682.95114156 508.13320534
 742.91882617 757.04961826 641.72297801 318.82552816 574.22705554
 900.88207086 576.05400667 427.06671223 660.43186358 463.85939332
 475.78613366 542.17139577 367.36438069 186.48893823 829.14251143
 435.38625741 290.24985266 704.57773243 406.85792285 826.290

In [22]:
# Show the estimator fitted on the scaled features as the cell output.
model_scaled

In [23]:
# 4. Linear regression on PCA components of the scaled features
(
    model_pca_scaled,
    y_pred_pca_scaled,
    mse_pca_scaled,
    r2_pca_scaled,
    coef_pca_scaled,
    intercept_pca_scaled,
) = train_and_evaluate(X_train_pca, y_train, X_test_pca, y_test)
summarize_model(
    model_pca_scaled,
    y_pred_pca_scaled,
    mse_pca_scaled,
    r2_pca_scaled,
    coef_pca_scaled,
    intercept_pca_scaled,
)

Model Summary:
Mean Squared Error: 2142.8877975335804
R² Score: 0.9442984052901502
Intercept: 554.3561940298507
Coefficients: [ 415.94770736 -159.7888672   -12.21122076]
Predictions: [678.60018702 713.80460821 812.5440328  478.54627901 358.99703874
 394.74840216 752.81640292 388.20285252 373.65006797 471.86815231
 742.3607535  516.64655083 395.28155787 533.64647877 690.93175969
 472.01576655 260.03312719 586.36193911 476.15436654 296.79579761
 862.21354946 218.51376698 797.15644723 340.98599765 291.1907454
 478.66574148 574.47419316 804.83442377 277.80853775 871.43903157
 479.04929833 393.54999312 282.77639526 339.93938348 259.98865353
 882.63597378 713.38557682 456.68582835 696.12438978 508.81457587
 772.47098061 743.03902674 633.13991154 337.89052205 550.36202874
 886.47067196 581.63395942 409.28702146 635.68792528 437.500015
 482.57136747 556.98963801 376.33404632 205.41627597 825.52318623
 459.83863772 314.14160011 707.86260354 390.77848549 818.70860789
 469.86305999 592.56616181 8

In [24]:
# Show the estimator fitted on the PCA-reduced scaled features as the cell output.
model_pca_scaled

In [25]:
# Side-by-side test-set metrics for all four feature sets, one per line.
print(
    f"Raw Data - MSE: {mse_raw}, R²: {r2_raw}",
    f"Raw Data with PCA - MSE: {mse_pca_raw}, R²: {r2_pca_raw}",
    f"Scaled Data - MSE: {mse_scaled}, R²: {r2_scaled}",
    f"Scaled + PCA Data - MSE: {mse_pca_scaled}, R²: {r2_pca_scaled}",
    sep="\n",
)

Raw Data - MSE: 2075.226230461456, R²: 0.9460571801503285
Raw Data with PCA - MSE: 2352.8953326441733, R²: 0.9388395312323444
Scaled Data - MSE: 2075.226230461454, R²: 0.9460571801503286
Scaled + PCA Data - MSE: 2142.8877975335804, R²: 0.9442984052901502


# II. Insights

## 1. Compare the performance of the three models

### (a) Assess the performance of the linear regression models trained on three different sets of features:

• raw input features (without any preprocessing).

• scaled input features (standardized but without PCA).

• scaled input features with PCA applied (dimensionality reduced).

The raw input data performed the best, with an MSE and R2 score of approximately 2075.226 and 0.946, respectively.

Then, there was a slight drop in performance with the scaled and PCA-reduced data. The approximate MSE and R2 score are 2142.888 and 0.944 respectively.

Lastly, the PCA-reduced data without scaling performed the worst, with an MSE and R2 score of approximately 2352.895 and 0.939 respectively.

### (b) Compare the models based on the MSEs and coefficients of determination on the test set.

In addition to our analysis above, the scaled data (without PCA) was able to produce approximately identical results with the raw data. Its MSE is 2075.226 and its R2 score is 0.946.

Based on part 1a of our insights, raw and scaled data without PCA produce the best performance in terms of MSE: they generate the lowest MSE of all the models. Conversely, since the PCA-reduced data without scaling has the highest MSE, that model performs the worst in terms of MSE.

Now, in terms of R2, raw and scaled data without PCA again perform best, with both scores being equal and closest to 1 out of all the models. The PCA-reduced data without scaling also performed worst, producing the lowest R2 score out of all the models. Note as well that the PCA-reduced datasets (both scaled and unscaled) produced lower R2 scores than the non-reduced datasets.


In [28]:
# Test-set metrics for the three feature sets compared in this section.
print(
    f"Raw Data - MSE: {mse_raw}, R²: {r2_raw}",
    f"Scaled Data - MSE: {mse_scaled}, R²: {r2_scaled}",
    f"Scaled + PCA Data - MSE: {mse_pca_scaled}, R²: {r2_pca_scaled}",
    sep="\n",
)

Raw Data - MSE: 2075.226230461456, R²: 0.9460571801503285
Scaled Data - MSE: 2075.226230461454, R²: 0.9460571801503286
Scaled + PCA Data - MSE: 2142.8877975335804, R²: 0.9442984052901502


## 2. Analyze the impact of scaling and dimensionality reduction

### Examine the differences in MSE and regression score across the three models

• Scaling for PCA: Explain why scaling may be necessary before applying PCA.

PCA finds the directions of greatest variance in the data, so it is sensitive to the scale of each feature. If the features have different scales (for instance, because they are expressed in different units), the algorithm will give disproportionate weight to the features with larger values. Scaling is necessary to ensure that the features are brought to the same scale. This, in turn, lets the dimensionality reduction reflect the structure of the data rather than the choice of units.

• Scaling for linear regression: Discuss why scaling is important before applying linear regression.

Although scaling is not strictly required for linear regression, it ensures that each feature has a comparable contribution to the model. That is, it prevents features with larger magnitudes from dominating the model and from decreasing the algorithm's stability.


(a) Scaling for PCA:
Scaling before applying PCA is necessary because PCA is a variance-based method. If the data has features with different scales (e.g., floor area vs. number of rooms), features with larger values dominate the PCA results. Putting the features on a common scale (here via `MinMaxScaler`) ensures that no feature dominates the variance calculation merely because of its units.

- Without scaling: Features with larger ranges dominate the variance, leading to biased principal components.
- With scaling: All features are on the same scale, ensuring that PCA captures meaningful relationships in the data.

(b) Scaling for Linear Regression:
Scaling is important before applying linear regression because the model's coefficients are sensitive to the magnitudes of the input features. If features have vastly different scales (e.g., floor area in hundreds vs. number of rooms as single digits), the coefficients might not be appropriately weighted.

- Without scaling: Large values (like floor area) might result in larger coefficients, making it hard to interpret the effect of smaller-scale features (like number of rooms).
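- With scaling: Linear regression coefficients become directly comparable and therefore more interpretable. For the ordinary least-squares fit used here, the predictions are unchanged (the raw and scaled models above have the same MSE and R2 score); the gains in convergence and accuracy apply to models trained by gradient descent or with regularization.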

## 3. Identify Key Features in Principal Components


In [40]:
# Header line followed by the components_ array of the PCA fitted on the
# scaled training features.
print("PCA Components (Loadings):", pca_model.components_, sep="\n")

PCA Components (Loadings):
[[ 0.57620976  0.49050036 -0.00408638  0.32711504  0.56601303]
 [-0.31370227  0.1366389   0.10538495  0.88153259 -0.30775802]
 [-0.03492677  0.01285235 -0.99442234  0.0919297  -0.03588982]]


(b) Real-World Significance:
Let’s analyze how these features align with expectations:

- Floor area: Houses with larger floor areas generally have higher prices, so it makes sense that this feature would heavily influence the first principal component.
- Distance to city center: This feature typically has a negative correlation with house prices (further distance = lower price), so it is also expected to play a significant role in principal components.