# Why is Sklearn used?
1. Model building: classification models (Logistic Regression, SVC, KNN, Random Forest), regression models (Linear Regression, Ridge, SVR), clustering (KMeans) and dimensionality reduction (PCA).
2. Model evaluation: metrics (accuracy, precision, recall, F1 score, ROC AUC, mean squared error, and R² score) and cross-validation tools (cross_val_score, GridSearchCV).
3. Data preprocessing: scaling (StandardScaler, MinMaxScaler), encoding (OneHotEncoder, LabelEncoder) and imputation (SimpleImputer, KNNImputer).
4. Feature engineering.

In [48]:
# Imports
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import  Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [49]:
# Load the training table and preview its first rows.
train_csv_path = '/kaggle/input/eda-final2/train_with_EDA_final.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,3,3,4,0,...,0,0,0,0,0,2,2008,8,4,208500.0
1,2,20,3,80.0,9600,1,3,3,2,0,...,0,0,0,0,0,5,2007,8,4,181500.0
2,3,60,3,68.0,11250,1,0,3,4,0,...,0,0,0,0,0,9,2008,8,4,223500.0
3,4,70,3,60.0,9550,1,0,3,0,0,...,272,0,0,0,0,2,2006,8,0,140000.0
4,5,60,3,84.0,14260,1,0,3,2,0,...,0,0,0,0,0,12,2008,8,4,250000.0


In [50]:
# Show the dtype of every column, then keep only the int64/float64 columns.
numeric_dtypes = ['int64', 'float64']
print(df.dtypes)
df = df.select_dtypes(include=numeric_dtypes)

Id                 int64
MSSubClass         int64
MSZoning           int64
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType           int64
SaleCondition      int64
SalePrice        float64
Length: 75, dtype: object


In [51]:
# Drop every column whose fraction of missing values exceeds `threshold`.
nan_mean = df.isna().mean()  # per-column fraction of NaN entries

# 0.5 is the cut-off this cell has always applied: the filter below used a
# hard-coded 0.5 while an unused `threshold = 0.1` sat next to it. The variable
# now drives the filter so the two can no longer disagree.
threshold = 0.5
columns_to_drop = nan_mean[nan_mean > threshold].index
df = df.drop(columns=columns_to_drop)

# NOTE(review): 'Id' is still kept as a column here; it looks like a row
# identifier rather than a predictor -- confirm whether it should be dropped.
print("\nCleaned DataFrame:")
print(df)


Cleaned DataFrame:
        Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  \
0        1          60         3         65.0     8450       1         3   
1        2          20         3         80.0     9600       1         3   
2        3          60         3         68.0    11250       1         0   
3        4          70         3         60.0     9550       1         0   
4        5          60         3         84.0    14260       1         0   
...    ...         ...       ...          ...      ...     ...       ...   
1450  1451          90         3         60.0     9000       1         3   
1451  1452          20         3         78.0     9262       1         3   
1452  1453         180         4         35.0     3675       1         3   
1453  1454          20         3         90.0    17217       1         3   
1454  1455          20         1         62.0     7500       1         3   

      LandContour  LotConfig  LandSlope  ...  EnclosedPorch  3SsnPo

In [52]:
# The ten columns with the most missing values.
null_counts = df.isnull().sum()
print(null_counts.sort_values(ascending=False).head(10))

SalePrice        0
Id               0
MSSubClass       0
GarageArea       0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
dtype: int64


In [53]:
# Replace every missing value with 0, then list the null counts again
# (ten largest) to confirm nothing is left.
df = df.fillna(value=0)
remaining_nulls = df.isnull().sum()
print(remaining_nulls.sort_values(ascending=False).head(10))

SalePrice        0
Id               0
MSSubClass       0
GarageArea       0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
dtype: int64


### Splitting the Dataset

In [62]:
# Separate the target column from the predictors.
target_column = 'SalePrice'
y = df[target_column]
X = df.drop(columns=target_column)  # Features

# Standardise the predictors and wrap the result back into a DataFrame that
# carries the original column labels.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# and every pipeline built later in this notebook has its own StandardScaler
# step -- confirm whether this global scaling is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(data=X_scaled, columns=X.columns)
X.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,-1.730861,0.071775,-0.04561,-0.222827,-0.206991,0.064349,0.752749,0.315261,0.606089,-0.226123,...,0.21554,-0.359015,-0.116542,-0.270706,-0.06881,-0.085099,-1.600208,0.142015,0.314459,0.208876
1,-1.72848,-0.873315,-0.04561,0.457417,-0.091927,0.064349,0.752749,0.315261,-0.625558,-0.226123,...,-0.704229,-0.359015,-0.116542,-0.270706,-0.06881,-0.085099,-0.490612,-0.612116,0.314459,0.208876
2,-1.726099,0.071775,-0.04561,-0.086778,0.073166,0.064349,-1.375326,0.315261,0.606089,-0.226123,...,-0.070945,-0.359015,-0.116542,-0.270706,-0.06881,-0.085099,0.98885,0.142015,0.314459,0.208876
3,-1.723718,0.308047,-0.04561,-0.449575,-0.096929,0.064349,-1.375326,0.315261,-1.857205,-0.226123,...,-0.176493,4.089302,-0.116542,-0.270706,-0.06881,-0.085099,-1.600208,-1.366248,0.314459,-3.419952
4,-1.721337,0.071775,-0.04561,0.638816,0.374335,0.064349,-1.375326,0.315261,-0.625558,-0.226123,...,0.562339,-0.359015,-0.116542,-0.270706,-0.06881,-0.085099,2.098446,0.142015,0.314459,0.208876


In [65]:
# Convert features and target to NumPy arrays so the cross-validation code can
# select rows with positional index arrays (X_train[train_idx]).
# np.asarray is used instead of .to_numpy() so this cell can be re-run safely:
# on a second run X and y are already ndarrays, which have no .to_numpy().
X = np.asarray(X)
y = np.asarray(y)

In [66]:
# Sanity check: dimensions of the feature matrix and the target vector.
features_shape, target_shape = X.shape, y.shape
print(features_shape, target_shape)

(1455, 74) (1455,)


## Splitting of Data
* Use `train_test_split` from `sklearn.model_selection` to shuffle and split the features and prices data into training and testing sets.
    * Split the data into 80% training and 20% testing.
    * Set the `random_state` for `train_test_split` to a value of your choice. This ensures results are consistent.

In [67]:
# Hold out 20% of the rows as a test set.
# A fixed random_state gives the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [68]:
# Base regressors, one per algorithm compared in this notebook.
# Their hyper-parameters are tuned later through the parameter grids.
reg1 = KNeighborsRegressor(leaf_size=50, algorithm='ball_tree')  # k-nearest neighbours
reg2 = DecisionTreeRegressor(random_state=1)                     # decision tree
reg3 = SVR()                                                     # support vector regression
reg4 = RandomForestRegressor(random_state=1)                     # random forest
reg5 = Lasso(max_iter=5000, fit_intercept=True)                  # lasso
reg6 = Ridge()                                                   # ridge

In [69]:
def _scaled_pipeline(step_name, regressor):
    """Return a two-step Pipeline: a 'std' StandardScaler followed by `regressor`.

    `step_name` becomes the name of the regressor step; the parameter grids
    address the regressor's hyper-parameters through it ('<step_name>__<param>').
    """
    return Pipeline([('std', StandardScaler()),
                     (step_name, regressor)])


# One scaler + regressor pipeline per algorithm.
pipe1 = _scaled_pipeline('reg1', reg1)
pipe2 = _scaled_pipeline('reg2', reg2)
pipe3 = _scaled_pipeline('reg3', reg3)
pipe4 = _scaled_pipeline('reg4', reg4)
pipe5 = _scaled_pipeline('reg5', reg5)
pipe6 = _scaled_pipeline('reg6', reg6)

In [70]:
# Hyper-parameter grids, one per pipeline. Keys follow the
# '<pipeline step name>__<parameter>' convention, so 'reg1__p' targets
# parameter `p` of the step named 'reg1'.

# KNN: number of neighbours and the distance power parameter p.
param_grid1 = [{'reg1__n_neighbors': list(range(1, 10)),
                'reg1__p': [1, 2]}]

# Decision tree: depth limit (None = unlimited) and split criterion.
param_grid2 = [{'reg2__max_depth': list(range(1, 10)) + [None],
                'reg2__criterion': ['squared_error', 'absolute_error']}]

# SVR: one sub-grid per kernel, so each kernel is only combined with the
# parameters it actually uses (degree: poly; coef0: poly and sigmoid;
# gamma: rbf, poly and sigmoid). A single Cartesian grid over all five keys
# would fit 144 candidates, many of them duplicates of the same model;
# the per-kernel grids below cover the same distinct models with 63.
_svr_C = [1, 5, 10]
_svr_gamma = ['auto', 'scale']
_svr_coef0 = [0.01, 10, 0.5]
param_grid3 = [
    {
        'reg3__kernel': ['linear'],
        'reg3__C': _svr_C,
    },
    {
        'reg3__kernel': ['rbf'],
        'reg3__C': _svr_C,
        'reg3__gamma': _svr_gamma,
    },
    {
        'reg3__kernel': ['poly'],
        'reg3__C': _svr_C,
        'reg3__degree': [3, 8],
        'reg3__coef0': _svr_coef0,
        'reg3__gamma': _svr_gamma,
    },
    {
        'reg3__kernel': ['sigmoid'],
        'reg3__C': _svr_C,
        'reg3__coef0': _svr_coef0,
        'reg3__gamma': _svr_gamma,
    },
]

# Random forest: number of trees.
param_grid4 = [{'reg4__n_estimators': [10, 100, 500, 1000, 10000]}]

# Lasso / Ridge: regularisation strength.
param_grid5 = [{'reg5__alpha':[0.001, 0.01, 0.1, 1, 10]}]

param_grid6 = [{'reg6__alpha':[0.001, 0.01, 0.1, 1, 10]}]

### Define a Performance Metric
 For this project, you will be calculating the coefficient of determination, R2, to quantify your model's performance. The coefficient of determination for a model is a useful statistic in regression analysis, as it often describes how "good" that model is at making predictions.

The values for R² typically range from 0 to 1 and capture the squared correlation between the predicted and actual values of the target variable. A model with an R² of 0 is no better than a model that always predicts the mean of the target variable, whereas a model with an R² of 1 perfectly predicts the target variable. Any value between 0 and 1 indicates what fraction of the variance in the target variable can be explained by the features using this model. A model can also have a negative R², which indicates that it is worse than one that always predicts the mean of the target variable.

In [71]:
# Parallel lists: the i-th grid, pipeline and display name belong together.
param_grids = [param_grid1, param_grid2, param_grid3, param_grid4, param_grid5, param_grid6]
pipelines = [pipe1, pipe2, pipe3, pipe4, pipe5, pipe6]
names = ['KNN', 'DTree', 'SVR', 'RForest', 'Lasso', 'Ridge']

# One GridSearchCV object per algorithm, keyed by its display name.
gridcvs = {}
# Splitter used inside every grid search to pick hyper-parameters
# (2 shuffled folds, fixed seed).
inner_cv = KFold(n_splits = 2, shuffle = True, random_state=1)

for pgrid, est, name in zip(param_grids,pipelines,names):
    gcv = GridSearchCV(
        estimator = est,
        param_grid= pgrid,
        # R2 is stated explicitly so the "Best R2" values printed later are
        # unambiguous; grid search maximises this score.
        scoring = 'r2',
        n_jobs = -1,      # use all available cores
        cv = inner_cv,
        verbose = 0,
        refit = True      # refit the best candidate on the full data passed to fit()
    )
    gridcvs[name] = gcv

In [77]:
def nested_cv(X_train,y_train):
    """Run nested cross-validation for every search in the global `gridcvs`.

    For each algorithm (in alphabetical order of its name) the data is split
    into 5 shuffled outer folds. On each outer training part the grid search is
    fitted (which runs its own inner CV), and the refitted best estimator is
    scored on the held-out outer fold. Per-fold results and the mean +/- std of
    the outer scores are printed; nothing is returned.

    Parameters
    ----------
    X_train, y_train : arrays indexable by integer index arrays
        Features and target used for the outer splits.
    """
    for name in sorted(gridcvs):
        search = gridcvs[name]

        print(50*'-','\n')
        print('Algorithm:',name)
        print('       Inner Loop:')

        outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
        fold_scores = []

        for fit_rows, holdout_rows in outer_cv.split(X_train, y_train):
            # Inner loop: hyper-parameter search on the outer training part.
            search.fit(X_train[fit_rows], y_train[fit_rows])
            print('\n        Best R2 (avg. of inner test folds): %.2f' % (search.best_score_))
            print('        Best parameters:', search.best_params_)

            # Score the refitted best model on the held-out outer fold.
            holdout_score = search.best_estimator_.score(X_train[holdout_rows],
                                                         y_train[holdout_rows])
            fold_scores.append(holdout_score)
            print('               R2 (on outer test fold) %.2f' % (holdout_score))

        print('\n       Outer Loop :')
        print('           Mean R2: %.2f +/- %.2f' % (np.mean(fold_scores), np.std(fold_scores)))

In [None]:
# Run the nested cross-validation on the raw training features
# (no Principal Component Analysis step).
nested_cv(X_train=X_train, y_train=y_train)

## Nested CV with Principal Component Analysis

## Conclusion
- Random Forest emerged as the most reliable and accurate model for this regression task.
- **Reasons:** The R² value of 0.83 indicates that 83% of the variance in the target variable is captured by the model.
## HyperParameter Tuning for Random Forest

In [75]:
# Grid-search the number of trees for the random forest on the full training
# set, reusing the inner_cv splitter defined for the model comparison.
gcv_hyperparameter_tuning = GridSearchCV(estimator=RandomForestRegressor(random_state=1),
                                        param_grid= [{'n_estimators': [10, 100, 500, 1000, 10000]}],
                                        n_jobs=-1,
                                        cv = inner_cv,
                                        verbose=1,
                                        refit=True)
gcv_hyperparameter_tuning.fit(X_train,y_train)
# R2 is a plain score (e.g. 0.83), not a percentage, so it is printed without
# a trailing '%' sign.
print('Best CV R2: %.2f' % (gcv_hyperparameter_tuning.best_score_))
print('Best Parameters:',gcv_hyperparameter_tuning.best_params_)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
Best CV R2: 0.83%
Best Parameters: {'n_estimators': 10000}


### Question - Best HyperParameter for RandomForestRegressor
- Which number of trees (`n_estimators`) do you think results in a model that best generalizes to unseen data?
### Answer -
- Random Forest with **n_estimators = 1000** gives the best R² score on the validation dataset in the comparison output. The training score is around 0.80 and close to the validation score, which suggests that the model generalizes well. (Note: the grid-search cell above reports `n_estimators = 10000` as the best parameter; 1000 is the value used for the final model below.)

# Evaluation and Performance Results

In [76]:
def _regression_metrics(y_true, y_pred):
    """Return (MSE, RMSE, MAE, R2) for the given true and predicted targets."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, r2


def _print_metrics(title, mse, rmse, mae, r2):
    """Print one titled block of regression metrics, two decimals each."""
    print(title)
    print('Mean Squared Error (MSE): %.2f' % mse)
    print('Root Mean Squared Error (RMSE): %.2f' % rmse)
    print('Mean Absolute Error (MAE): %.2f' % mae)
    print('R² Score: %.2f' % r2)


# Fit the final random forest on the whole training split.
final_model = RandomForestRegressor(random_state=1,n_estimators=1000)
final_model.fit(X_train,y_train)

# Predictions on the data the model was fitted on and on the held-out test set.
y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

# Same metrics for both splits, so the train/test gap can be compared directly.
mse_train, rmse_train, mae_train, r2_train = _regression_metrics(y_train, y_train_pred)
mse_test, rmse_test, mae_test, r2_test = _regression_metrics(y_test, y_test_pred)

_print_metrics('Training Metrics:', mse_train, rmse_train, mae_train, r2_train)
_print_metrics('\nTest Metrics:', mse_test, rmse_test, mae_test, r2_test)

Training Metrics:
Mean Squared Error (MSE): 126401019.01
Root Mean Squared Error (RMSE): 11242.82
Mean Absolute Error (MAE): 6563.37
R² Score: 0.98

Test Metrics:
Mean Squared Error (MSE): 640832799.01
Root Mean Squared Error (RMSE): 25314.68
Mean Absolute Error (MAE): 17077.21
R² Score: 0.89
