# Installation

In [1]:
! pip install pandas numpy scikit-learn xgboost




[notice] A new release of pip available: 22.2.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Import the libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Read the dataset

In [3]:
# Load the feature-engineered dataset; the cell's last expression displays its shape.
preprocessed_data = pd.read_csv(
    filepath_or_buffer='../data/processed_data/feature_engineered_data.csv',
)
preprocessed_data.shape

(1460, 190)

In [4]:
# Display the dtype of each column of the loaded frame.
preprocessed_data.dtypes

Unnamed: 0         int64
OverallQual        int64
GrLivArea          int64
GarageCars         int64
GarageArea         int64
                   ...  
BsmtCond_TA         bool
LandContour_Bnk     bool
LandContour_HLS     bool
LandContour_Low     bool
LandContour_Lvl     bool
Length: 190, dtype: object

In [5]:
# Standardize the features
def scalar(X_train, X_test):
    """Standardize the train and test features with one shared ``StandardScaler``.

    The scaler is fitted on ``X_train`` only; ``X_test`` is transformed with
    the statistics learned from the training data.

    Args:
        X_train: Training features, used both to fit the scaler and to be scaled.
        X_test: Held-out features, scaled with the already-fitted scaler.

    Returns:
        tuple: ``(scaled_train, scaled_test)`` as produced by the scaler.
    """
    standardizer = StandardScaler()
    # Tuple elements are evaluated left to right, so the fit happens before
    # the test split is transformed.
    return standardizer.fit_transform(X_train), standardizer.transform(X_test)

In [6]:
# Perform PCA for dimensionality reduction
def apply_pca(xt, n, x_test=None):
    """Reduce ``xt`` to ``n`` principal components.

    A fresh ``PCA`` is fitted on ``xt`` on every call. Pass ``x_test`` to
    project held-out data with the *same* fitted components; calling this
    function a second time on the held-out data would fit a different PCA.

    Args:
        xt: Data used to fit the PCA and to be projected.
        n: Value forwarded to ``PCA(n_components=...)``.
        x_test: Optional second data set to project with the PCA fitted on ``xt``.

    Returns:
        The projection of ``xt`` when ``x_test`` is None (original behaviour);
        otherwise the tuple ``(projected_xt, projected_x_test)``.
    """
    pca = PCA(n_components=n)
    X_pca = pca.fit_transform(xt)
    if x_test is None:
        return X_pca
    # Only transform here: the components must come from the fit on `xt`.
    return X_pca, pca.transform(x_test)

In [7]:
# Cross validation
def cross_validate (model , model_name, X_train, y_train) :
    """Print the 5-fold cross-validated RMSE of an estimator.

    Note the parameter naming: ``model`` is only interpolated into the printed
    message, while ``model_name`` is the estimator object that gets evaluated.

    Args:
        model: Label shown in the printed message.
        model_name: Estimator placed after a ``StandardScaler`` in a pipeline.
        X_train: Features handed to ``cross_val_score``.
        y_train: Targets handed to ``cross_val_score``.

    Returns:
        None. The per-fold RMSE values are printed, not returned.
    """
    scaled_estimator = make_pipeline(StandardScaler(), model_name)
    neg_mse_per_fold = cross_val_score(
        scaled_estimator,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    # The scorer yields negated MSE values, so flip the sign before the square root.
    rmse_list = np.sqrt(-neg_mse_per_fold)
    print (f"Cross Val Score with 5 Folds for {model} are {rmse_list}")

## Model Training and Predictions

### Grid Search for Parameter Tuning

In [8]:
def model_training_predictions (X_train, X_test, y_train, y_test, Grid_Search = False, Cross_Validate = False) :
    """Grid-search or cross-validate each regression model in turn.

    The models are processed in a fixed order: Decision Tree, Linear
    Regression, XgBoost, Random Forest, Ridge, Lasso.

    Args:
        X_train: Training features.
        X_test: Held-out features; only used when ``Grid_Search`` is True.
        y_train: Training targets.
        y_test: Held-out targets; copied into the 'Actual' column of the result.
        Grid_Search: When True, wrap every model in ``GridSearchCV`` with its
            parameter grid, fit it once on the training data and predict on
            ``X_test``. Takes precedence over ``Cross_Validate``.
        Cross_Validate: When True (and ``Grid_Search`` is False), call
            ``cross_validate`` for every model, which prints per-fold RMSE.

    Returns:
        pandas.DataFrame with columns 'Actual', 'Predicted' and 'Model' (one
        row per test sample per model) when ``Grid_Search`` is True;
        otherwise None.
    """
    # One row per model:
    # (display name, estimator factory, parameter grid,
    #  grid-search progress message, cross-validation progress message)
    model_specs = [
        (
            'Decision Tree',
            DecisionTreeRegressor,
            {'max_depth': [5, 10], 'min_samples_split': [2, 10], 'max_leaf_nodes': [50, None]},
            "Decision Tree Model Training with Grid Search...........",
            "Decision Tree Model Training...........",
        ),
        (
            'Linear Regression',
            LinearRegression,
            {
                'n_jobs': [-1, 2],  # number of parallel jobs (a compute setting, kept from the original grid)
                'fit_intercept': [True, False],  # with and without an intercept term
            },
            "Linear Regression Model Training with Grid Search...........",
            "Linear Regression Model Training...........",
        ),
        (
            'XgBoost',
            xgb.XGBRegressor,
            {
                'n_estimators': [100, 200],
                'max_depth': [3, 6],
                'learning_rate': [0.1, 0.01],
            },
            "XgBoost Training with Grid Search...........",
            "XgBoost Model Training...........",
        ),
        (
            'Random Forest',
            RandomForestRegressor,
            {
                'n_estimators': [100, 200],
                'max_depth': [None, 5],
                'min_samples_split': [2, 5],
            },
            "Random Forest Training with Grid Search...........",
            "Random Forest Model Training...........",
        ),
        (
            'Ridge',
            Ridge,
            {
                'alpha': [0.001, 1],
                'fit_intercept': [True, False],  # with and without an intercept term
                'max_iter': [1000, 5000],
            },
            "Ridge Model Training with Grid Search...........",
            "Ridge Training...........",
        ),
        (
            'Lasso',
            Lasso,
            {
                'alpha': [0.001, 1],
                'fit_intercept': [True, False],  # with and without an intercept term
                'max_iter': [1000, 5000],
            },
            "Lasso Model Training with Grid Search...........",
            "Lasso Model Training...........",
        ),
    ]

    # Per-model result frames are collected here and concatenated once at the end.
    result_frames = []

    for model, make_estimator, grid_values, grid_message, cv_message in model_specs:
        model_name = make_estimator()

        if Grid_Search:
            print (grid_message)
            model_name = GridSearchCV(model_name, param_grid = grid_values)
            # Fit exactly once per model; a second fit would repeat the whole search.
            model_name.fit(X_train, y_train)

            predictions = model_name.predict(X_test)
            data = {'Actual': list(y_test), 'Predicted': predictions, 'Model': model}
            result_frames.append(pd.DataFrame(data))

        elif Cross_Validate:
            print (cv_message)
            cross_validate(model, model_name, X_train, y_train)

    if Grid_Search:
        return pd.concat(result_frames, ignore_index = True)
    return None



  

## Call the functions

In [9]:
df = preprocessed_data

label_col = 'SalePrice'
# 'Unnamed: 0' looks like a row index saved with the CSV rather than a real
# feature, so it is excluded from the model inputs -- TODO confirm.
drop_cols = [label_col, 'Unnamed: 0']
traincols = df.columns
# Keep the frame's own column order; a list built from a set of strings can
# come out in a different order in each kernel session.
feature_cols = [col for col in traincols if col not in drop_cols]

X = df[feature_cols]
y = df[label_col]

# Fixed seed so the split, and every result derived from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = scalar(X_train, X_test )

# Fit PCA on the training split only and reuse the same projection for the
# test split. Fitting a second PCA on the test split would place it in a
# different basis from the one the models are trained on.
pca = PCA(n_components=100)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

predictions = model_training_predictions(X_train, X_test, y_train, y_test, Grid_Search =  True)
predictions.to_csv("../data/predictions/model_predictions.csv", index=False)

Decision Tree Model Training with Grid Search...........
Linear Regression Model Training with Grid Search...........
XgBoost Training with Grid Search...........
Random Forest Training with Grid Search...........
Ridge Model Training with Grid Search...........


ValueError: Invalid parameter 'normalize' for estimator Ridge(alpha=0.001, max_iter=1000). Valid parameters are: ['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'positive', 'random_state', 'solver', 'tol'].

## Cross Validation

In [None]:
# Cross-validation-only run: Grid_Search stays at its default of False, so the
# call returns None.
model_training_predictions(X_train, X_test, y_train, y_test, Cross_Validate=True)