# **Correlation Study**

## Objectives

* Identify variables with most significant correlation to the sale price 
to answer the first business requirement.

## Inputs

* outputs/datasets/collection/house_prices_records.csv

## Outputs

* Train set and test set (features and target)
* Modelling pipeline
* Feature importance plot

## Crisp-DM

* Data Preparation


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Record the directory the notebook is currently running from and display it.
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of the directory stored in current_dir the working
# directory, so the relative paths used later in the notebook resolve from there.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Read the working directory again after the change and display it.
current_dir = os.getcwd()
current_dir

# Load Data

The pipeline has to handle the data cleaning itself, so we load the original dataset.

In [None]:
import pandas as pd
# Load the house price records CSV, print its (rows, columns) shape and
# preview the first five rows.
df_sales = pd.read_csv("outputs/datasets/collection/house_prices_records.csv")
print(df_sales.shape)
df_sales.head(5)


## ML Pipeline with all data

### Data Cleaning and Feature Engineering

Here we apply the data cleaning and feature engineering steps outlined in the 
respective notebooks.

In [5]:

from sklearn.pipeline import Pipeline

### Data Cleaning
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import MeanMedianImputer

### Feature Engineering
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection


### Feature Scaling
from sklearn.preprocessing import StandardScaler

### Feature Selection 
from sklearn.feature_selection import SelectFromModel

### ML algorithms
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor



def PipelineOptimization(model):
    """Assemble the complete, unfitted pipeline around ``model``.

    The steps are applied in this order: three imputers, an ordinal encoder,
    three numerical transformers, ``SmartCorrelatedSelection``, a standard
    scaler, ``SelectFromModel`` and finally the estimator itself.

    Parameters
    ----------
    model : estimator
        Used twice: as the estimator inside ``SelectFromModel`` and as the
        final ``"model"`` step.

    Returns
    -------
    Pipeline
        The pipeline with eleven named steps.
    """
    # Data cleaning: fill missing values.
    data_cleaning = [
        ("ArbitraryNumberImputer",
         ArbitraryNumberImputer(
             arbitrary_number=0,
             variables=['MasVnrArea', 'EnclosedPorch', 'WoodDeckSF'])),
        # NOTE: this step is named "CategoricalEncoder" but holds a CategoricalImputer.
        ("CategoricalEncoder",
         CategoricalImputer(
             imputation_method='missing',
             fill_value='none',
             variables=['GarageFinish', 'BsmtFinType1', 'BsmtExposure'])),
        ("MeanMedianImputer",
         MeanMedianImputer(
             imputation_method='median',
             variables=['BedroomAbvGr', 'GarageYrBlt', 'LotFrontage', '2ndFlrSF'])),
    ]

    # Feature engineering: encode categoricals, transform numericals and
    # run the correlated-feature selection.
    feature_engineering = [
        ("Ordinalencoder",
         OrdinalEncoder(
             encoding_method='arbitrary',
             variables=['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual'])),
        ("LogTransformer",
         vt.LogTransformer(variables=['1stFlrSF', 'GrLivArea'])),
        ("PowerTransformer",
         vt.PowerTransformer(
             variables=['2ndFlrSF', 'BsmtFinSF1', 'BsmtUnfSF', 'OverallQual',
                        'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', 'GarageArea',
                        'GarageYrBlt', 'OverallCond'])),
        ("YeoJohnsonTransformer",
         vt.YeoJohnsonTransformer(variables=['LotArea', 'OpenPorchSF'])),
        # Disabled step, kept for reference:
        # ("Winsorizer", Winsorizer(capping_method='iqr', tail='both', fold=1.5,
        #                           variables=['GrLivArea'])),
        ("SmartCorrelatedSelection",
         SmartCorrelatedSelection(
             variables=None,
             method="spearman",
             threshold=0.8,
             selection_method="variance")),
    ]

    # Scaling, model-based feature selection and the estimator itself.
    modelling = [
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]

    return Pipeline(data_cleaning + feature_engineering + modelling)



## Create ML Pipeline for Modelling and Hyperparameter Optimization

In [6]:
from sklearn.model_selection import GridSearchCV
import numpy as np

class HyperparameterOptimizationSearch:
    """Run one ``GridSearchCV`` per candidate model and summarise the scores.

    Every model is wrapped with ``PipelineOptimization`` before it is
    searched, so parameter grids address pipeline steps by name
    (for example ``model__n_estimators``).

    Parameters
    ----------
    models : dict
        Maps a model name to an estimator instance.
    params : dict
        Maps the same model names to their parameter grids.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by model name; filled by fit().
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every model and store it in ``grid_searches``.

        Parameters
        ----------
        X, y
            Training features and target, passed to ``GridSearchCV.fit``.
        cv, n_jobs, verbose, scoring
            Forwarded unchanged to ``GridSearchCV``.
        refit : bool
            Accepted for backward compatibility but NOT forwarded, so
            ``GridSearchCV`` keeps its own default for ``refit``.
            NOTE(review): code elsewhere in this file reads
            ``best_estimator_`` from the fitted searches; confirm that still
            works before forwarding this flag.

        Raises
        ------
        KeyError
            If ``params`` has no entry for one of the model names.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            model = PipelineOptimization(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Build a table with one row per (model, parameter combination).

        Parameters
        ----------
        sort_by : str
            Column used to sort the table in descending order.

        Returns
        -------
        tuple
            ``(summary, grid_searches)``: the summary DataFrame, whose first
            columns are estimator, min_score, mean_score, max_score and
            std_score followed by the parameter columns, and the dictionary
            of fitted ``GridSearchCV`` objects.
        """
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            params = gs.cv_results_['params']
            # Use the fitted n_splits_ rather than the cv argument: cv may be
            # None, a splitter object or an iterable, none of which can be
            # passed to range().
            scores = [
                gs.cv_results_[f"split{i}_test_score"].reshape(len(params), 1)
                for i in range(gs.n_splits_)
            ]

            # One row per parameter combination, one column per CV split.
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches


Split data into Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target column from the features and hold out 20% of the
# records as the test set (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    df_sales.drop(columns=['SalePrice']),
    df_sales['SalePrice'],
    random_state=0,
    test_size=0.2,
)

# Report the shape of each of the four resulting objects.
print(*(subset.shape for subset in (X_train, y_train, X_test, y_test)))

## Grid Search CV - Sklearn

We will use the default hyperparameters to find the most suitable algorithm.

In [8]:
# Candidate regressors for the quick search; random_state is fixed where the
# estimator accepts one.
models_quick_search = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

# One empty parameter grid per candidate, so each model is evaluated as configured above.
params_quick_search = {name: {} for name in models_quick_search}

Quick GridSearch CV - Regressor

In [None]:
from sklearn.metrics import make_scorer, recall_score
# Run a 5-fold grid search, scored by R2, for every candidate model.
search = HyperparameterOptimizationSearch(
    params=params_quick_search,
    models=models_quick_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring='r2')

Check the results

In [None]:
# score_summary returns the results table sorted by mean score together with
# the fitted GridSearchCV objects keyed by model name; display the table.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Now that we have identified 'ExtraTreesRegressor' as the best model to use, we will run a more extensive search to find its best hyperparameter configuration.

In [11]:
# Only the model chosen in the quick search is tuned further.
models_search = {"ExtraTreesRegressor": ExtraTreesRegressor(random_state=0)}

# Parameter reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
# The "model__" prefix addresses the pipeline step named "model".
params_search = {
    "ExtraTreesRegressor": dict(
        model__n_estimators=[25, 75],
        model__max_features=[None],
        model__min_samples_split=[5, 8, 10],
        model__min_samples_leaf=[1, 5],
        model__bootstrap=[True],
    ),
}


In [None]:
# Run the 5-fold grid search, scored by R2, over the extended parameter grid.
search = HyperparameterOptimizationSearch(
    params=params_search,
    models=models_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring='r2')


Check the results

In [None]:
# Rebuild the results table (sorted by mean score) and the dictionary of
# fitted GridSearchCV objects for the extensive search; display the table.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Now to identify the parameters producing the best fit.

In [None]:
# The summary is sorted by mean score in descending order, so the first row
# names the best-scoring model.
best_model = grid_search_summary['estimator'].iloc[0]
best_model

In [None]:
# Read the best parameter combination from that model's GridSearchCV object
# and display it.
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

In [None]:
# Take the best pipeline found by that search (best_estimator_) and display it.
regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
regressor_pipeline

### Assess feature importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# The cleaning and feature engineering steps can change the set of columns,
# so the names seen by the feature selector are recovered by running the
# training data through every step that precedes the scaler.
# The number of such steps is derived from the position of 'feat_scaling'
# instead of being hard-coded, so it stays correct if steps are added to or
# removed from the pipeline.
step_names = [name for name, _ in regressor_pipeline.steps]
data_cleaning_feat_eng_steps = step_names.index('feat_scaling')
columns_after_data_cleaning_feat_eng = (Pipeline(regressor_pipeline.steps[:data_cleaning_feat_eng_steps])
                                        .transform(X_train)
                                        .columns)

# Boolean mask of the columns kept by the 'feat_selection' step.
selected_mask = regressor_pipeline['feat_selection'].get_support()
best_features = columns_after_data_cleaning_feat_eng[selected_mask].to_list()

# Create DataFrame to display feature importance, most important first.
df_feature_importance = (pd.DataFrame(data={
    'Feature': best_features,
    'Importance': regressor_pipeline['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

### Evaluate on Train and Test Sets

In [21]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    """Print the evaluation metrics of ``pipeline`` for the train and test sets.

    Each data set is announced with a heading and then passed, together with
    ``pipeline``, to ``regression_evaluation``.
    """
    print("Model Evaluation \n")
    for heading, features, target in (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    ):
        print(heading)
        regression_evaluation(features, target, pipeline)


def regression_evaluation(X, y, pipeline):
    """Print R2, MAE, MSE and RMSE of ``pipeline`` on ``X`` against ``y``.

    Parameters
    ----------
    X
        Features passed to ``pipeline.predict``.
    y
        True target values compared with the predictions.
    pipeline
        Fitted object exposing ``predict``.
    """
    prediction = pipeline.predict(X)
    mse = mean_squared_error(y, prediction)

    # Built-in round() is used instead of the .round() method: depending on
    # the scikit-learn version the metrics may come back as plain Python
    # floats, which have no .round() method.
    print('R2 Score:', round(r2_score(y, prediction), 3))
    print('Mean Absolute Error:', round(mean_absolute_error(y, prediction), 3))
    print('Mean Squared Error:', round(mse, 3))
    print('Root Mean Squared Error:', round(np.sqrt(mse), 3))
    print("\n")


def regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline, alpha_scatter=0.5):
    """Plot predicted against actual values for the train and test sets.

    Two panels are drawn side by side. Each shows a scatter of predictions
    versus actual values and a red line plotting the actual values against
    themselves as a reference.

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        Features and targets of the two data sets.
    pipeline
        Fitted object exposing ``predict``.
    alpha_scatter : float
        Transparency passed to the scatter plots.
    """
    # Predictions are computed up front, train set first.
    panels = [
        ("Train Set", y_train, pipeline.predict(X_train)),
        ("Test Set", y_test, pipeline.predict(X_test)),
    ]

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    for ax, (title, actual, predicted) in zip(axes, panels):
        sns.scatterplot(x=actual, y=predicted, alpha=alpha_scatter, ax=ax)
        sns.lineplot(x=actual, y=actual, color='red', ax=ax)
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predictions")
        ax.set_title(title)

    plt.show()

### Evaluate Performance

In [None]:
# Print the metrics, then draw the actual-versus-predicted plots, for both sets.
regression_performance(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    pipeline=regressor_pipeline,
)
regression_evaluation_plots(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    pipeline=regressor_pipeline,
)

The regressor pipeline has reached the expected performance threshold (an R2 score of 0.75) on both the train and test sets.

---

## Push files to the repo

We will generate the following files

* Train set
* Test set
* Modeling pipeline
* features importance plot

---

In [26]:
import joblib
import os

version = 'v1'
file_path = f'outputs/ml_pipeline/predict_saleprice/{version}'

# Create the versioned output folder. Only file-system errors (for example
# the folder already existing) are reported and tolerated; any other kind of
# error is no longer hidden.
try:
    os.makedirs(name=file_path)
except OSError as e:
    print(e)

### Train Set: features and target

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Write the training features to the versioned output folder, without the index column.
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
# Display the training target.
y_train

In [29]:
# Write the training target to the versioned output folder, without the index column.
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

### Test Set: Features and Target

In [None]:
# Preview the first rows of the test features.
X_test.head()

In [31]:
# Write the test features to the versioned output folder, without the index column.
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
# Display the test target.
y_test

In [33]:
# Write the test target to the versioned output folder, without the index column.
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

### Modelling pipeline

ML pipeline for predicting the Sale Price of the property

In [None]:
# Display the pipeline that is about to be saved.
regressor_pipeline

In [None]:
# Serialise the pipeline with joblib into the versioned output folder.
joblib.dump(value=regressor_pipeline, filename=f"{file_path}/regressor_pipeline.pkl")

### Feature importance plot

In [None]:
# Draw the feature importance bar chart and show it in the notebook.
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

In [None]:
# Draw the chart again and save it as a PNG in the versioned output folder.
df_feature_importance.plot(kind='bar',x='Feature',y='Importance')
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')