In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
# Load the Boston housing table; the path is relative to the notebook's working directory.
df = pd.read_csv('data_boston_housing.csv')

## Step 1: Train Model With Outliers

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Target is the 'medv' column; every other column is used as a predictor.
y = df['medv']
X = df.drop(columns=['medv'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: linear regression fitted on the unfiltered training rows.
model = LinearRegression().fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred = model.predict(X_test)
mse_before, mae_before, r2_before = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Before Removing Outliers:")
for label, value in (
    ("MSE:", mse_before),
    ("MAE:", mae_before),
    ("R2 Score:", r2_before),
):
    print(label, value)


Before Removing Outliers:
MSE: 24.291119474973478
MAE: 3.1890919658878416
R2 Score: 0.6687594935356326


## Step 2: Scale Features and Remove Outliers Using IQR (Interquartile Range)

In [14]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Feature matrix (everything except the target 'medv') and the target itself.
y = df['medv']
X = df.drop(columns=['medv'])

# Two alternative rescalings, each fitted on the same feature matrix.
scaler_std = StandardScaler()    # z-score standardization
scaler_minmax = MinMaxScaler()   # rescales each column to the 0-1 range
X_std = scaler_std.fit_transform(X)
X_minmax = scaler_minmax.fit_transform(X)

# Show the first five rows before and after each transformation.
for heading, preview in (
    ("Before Scaling (Original Data):\n", X.head()),
    ("\nAfter Standardization:\n", X_std[:5]),
    ("\nAfter Min-Max Scaling:\n", X_minmax[:5]),
):
    print(heading, preview)


Before Scaling (Original Data):
       crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  
0  396.90   4.98  
1  396.90   9.14  
2  392.83   4.03  
3  394.63   2.94  
4  396.90   5.33  

After Standardization:
 [[-0.41978194  0.28482986 -1.2879095  -0.27259857 -0.14421743  0.41367189
  -0.12001342  0.1402136  -0.98284286 -0.66660821 -1.45900038  0.44105193
  -1.0755623 ]
 [-0.41733926 -0.48772236 -0.59338101 -0.27259857 -0.74026221  0.19427445
   0.36716642  0.55715988 -0.8678825  -0.98732948 -0.30309415  0.44105193
  -0.49243937]
 [-0.41734159 -0.4877

In [19]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Min-Max scale the features and keep the result as a labelled DataFrame.
# `df` is the loaded dataset; 'medv' is the target and is left unscaled.
X = df.drop(columns=['medv'])  # Features (dropping target column)
y = df['medv']  # Target variable

# Fit the scaler on the features and rescale every column to the 0-1 range
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# fit_transform returns a bare array: restore the column names AND the row
# index so the scaled frame stays aligned with `y` row for row.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Print before & after scaling to compare
print("Before Scaling (Original Data):\n", X.head())
print("\nAfter Min-Max Scaling:\n", X_scaled_df.head())

# The scaled frame is intentionally NOT stored as `df_cleaned`: it has no
# 'medv' column, and `df_cleaned` is the name of the outlier-filtered frame
# that the IQR step assigns and the Step 3 training cell reads.


Before Scaling (Original Data):
       crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  
0  396.90   4.98  
1  396.90   9.14  
2  392.83   4.03  
3  394.63   2.94  
4  396.90   5.33  

After Min-Max Scaling:
        crim    zn     indus  chas       nox        rm       age       dis  \
0  0.000000  0.18  0.067815   0.0  0.314815  0.577505  0.641607  0.269203   
1  0.000236  0.00  0.242302   0.0  0.172840  0.547998  0.782698  0.348962   
2  0.000236  0.00  0.242302   0.0  0.172840  0.694386  0.599382  0.348962   
3  0.000293  0.00  0.063050   0.0  0.15

In [20]:
import numpy as np

# Function to remove outliers using IQR
def remove_outliers_iqr(data, columns=None, factor=1.5):
    """Drop every row that holds an IQR outlier in any of the checked columns.

    For each checked column, ``Q1`` and ``Q3`` are its 25th and 75th
    percentiles and ``IQR = Q3 - Q1``. A value is an outlier when it is below
    ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : str or list-like of column labels, optional
        Columns to screen for outliers. Defaults to every numeric column, so
        text/categorical columns no longer make the quantile computation
        fail; they are simply carried through to the result.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the lower/upper fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` without any outlier in the checked columns, with
        all original columns and the original index labels.

    Notes
    -----
    * Missing values compare as False against both fences, so they are never
      flagged as outliers.
    * A column whose IQR is 0 (e.g. a mostly-constant 0/1 flag) has both
      fences equal to its quartile value, so every other value in that column
      is flagged. Pass ``columns`` explicitly to leave such columns out.
    """
    if columns is None:
        columns = data.select_dtypes(include="number").columns
    elif isinstance(columns, str):
        # A bare label would otherwise be split into characters by list().
        columns = [columns]
    columns = list(columns)

    # Nothing to screen: return all rows (as a new frame, like the filtered path).
    if not columns:
        return data.copy()

    checked = data[columns]
    q1 = checked.quantile(0.25)  # 25th percentile per column
    q3 = checked.quantile(0.75)  # 75th percentile per column
    iqr = q3 - q1                # interquartile range per column

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # A row is dropped as soon as one checked column falls outside its fences.
    is_outlier_row = ((checked < lower_bound) | (checked > upper_bound)).any(axis=1)
    return data[~is_outlier_row]

# Filter the full frame and report how many rows survive.
# NOTE(review): the whole of `df` is passed in, so the target column 'medv'
# is screened for outliers along with the features - confirm this is intended.
df_cleaned = remove_outliers_iqr(df)

for label, frame in (
    ("Before removing outliers:", df),
    ("After removing outliers:", df_cleaned),
):
    print(label, frame.shape)


Before removing outliers: (506, 14)
After removing outliers: (268, 14)


## Step 3: Train Model Without Outliers

In [21]:
# Split the outlier-filtered frame into features and target
X_cleaned = df_cleaned.drop(columns=['medv'])
y_cleaned = df_cleaned['medv']

# Same split settings as the baseline (20% held out, seed 42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_cleaned, y_cleaned, test_size=0.2, random_state=42
)

# Train a separate estimator instead of refitting `model` in place, so the
# Step 1 baseline stays available and this cell does not depend on (or
# silently overwrite) state created by an earlier cell.
model_cleaned = LinearRegression()
model_cleaned.fit(X_train_c, y_train_c)
y_pred_c = model_cleaned.predict(X_test_c)

# New performance metrics, computed on the held-out part of the cleaned data
mse_after = mean_squared_error(y_test_c, y_pred_c)
mae_after = mean_absolute_error(y_test_c, y_pred_c)
r2_after = r2_score(y_test_c, y_pred_c)

print("After Removing Outliers:")
print("MSE:", mse_after)
print("MAE:", mae_after)
print("R2 Score:", r2_after)


After Removing Outliers:
MSE: 3.7281318839818365
MAE: 1.5151583031541935
R2 Score: 0.7830668323108957


## Step 4: Compare Before vs. After

In [22]:
# Side-by-side summary of the Step 1 and Step 3 metrics, rounded to two decimals.
comparison_lines = (
    "\n📊 Accuracy Comparison:",
    f"MSE Before: {mse_before:.2f}  |  MSE After: {mse_after:.2f}",
    f"MAE Before: {mae_before:.2f}  |  MAE After: {mae_after:.2f}",
    f"R2 Before: {r2_before:.2f}  |  R2 After: {r2_after:.2f}",
)
for line in comparison_lines:
    print(line)



📊 Accuracy Comparison:
MSE Before: 24.29  |  MSE After: 3.73
MAE Before: 3.19  |  MAE After: 1.52
R2 Before: 0.67  |  R2 After: 0.78


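### Conclusion

- If MSE and MAE decrease and R² increases, removing outliers improved the model.
- If MSE and MAE increase and R² decreases, the outliers were helping the model and shouldn’t be removed.

Note: the two models are scored on different test sets (the "after" test set has had its outliers removed as well), so part of the change in MSE and MAE reflects easier test data rather than a better model.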