# Table of Contents

* [Introduction](#Introduction)
* [Importing Dataset](#Import)
* [Data set changes](#df_changes)
* [Importing model](#model_import)
* [Model Execution](#model_perform)

## Introduction

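This guide shows how to feed a raw dataset into the trained model: the raw data is loaded, cleaned, encoded and scaled, split into train and test sets, and finally scored with the pickled model.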

### Importing libraries

In [15]:
# Import needed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# package to import model
import pickle

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

## Importing dataset

In [16]:
import pandas as pd

# Read the raw, unprocessed records from the project's data folder; the
# resulting frame is the input to the cleaning step further down.
raw_data_path = '../data/data_raw.csv'
df = pd.read_csv(raw_data_path)

## Data set changes

In [17]:
def df_cleaning(df):
    """Clean, encode and standardise the raw housing dataframe.

    Steps, in order:
      1. convert ``median_income`` from units of ten thousand dollars to dollars;
      2. linearly interpolate missing values in the numeric columns;
      3. clip every column except ``longitude``, ``latitude`` and
         ``ocean_proximity`` to its own 10th-90th percentile range (this
         includes the ``median_house_value`` target);
      4. one-hot encode ``ocean_proximity``;
      5. standardise the numeric features with a freshly fitted
         ``StandardScaler`` (dummy columns and target are left unscaled).

    NOTE(review): the scaler is fitted on whatever frame is passed in; confirm
    this matches how the data used to train the pickled model was scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns ``median_income``,
        ``median_house_value``, ``longitude``, ``latitude`` and
        ``ocean_proximity``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The scaled features, followed by the ``ocean_proximity`` dummy columns,
        followed by ``median_house_value``; rows keep the index of ``df``.
    """
    geo_cols = ['longitude', 'latitude']
    category_col = 'ocean_proximity'
    target_col = 'median_house_value'

    # Work on a copy so the caller's frame is never altered; otherwise every
    # call would multiply the caller's median_income by 10000 again.
    df = df.copy()

    # converting median_income to a dollar unit instead of a ten thousand dollar unit
    ten_thousand_dollar_unit = 10000
    df['median_income'] = df['median_income'] * ten_thousand_dollar_unit

    # interpolate missing values (numeric columns only: interpolating a frame
    # that still holds the text column is deprecated in recent pandas)
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].interpolate()

    # dropping categorical ocean_proximity and geospatial features for outliers treatment
    clipped = df.drop(columns=geo_cols + [category_col])

    # Clipping each remaining column to its own 10th / 90th percentiles
    for col in clipped.columns:
        lower, upper = clipped[col].quantile([0.10, 0.90]).values
        clipped[col] = clipped[col].clip(lower, upper)

    # encoding: one dummy column per ocean_proximity value actually present
    dummy_ocn_prx = pd.get_dummies(df[category_col])

    # Scaling: clipped features (without the target) plus the geospatial columns
    features = pd.concat([clipped.drop(columns=[target_col]), df[geo_cols]], axis=1)
    X_scaled = pd.DataFrame(
        StandardScaler().fit_transform(features),
        columns=features.columns,
        # keep the original row labels so the pieces below line up row by row
        index=features.index,
    )

    # appending the encoded columns and reinserting the target feature
    return pd.concat([X_scaled, dummy_ocn_prx, clipped[target_col]], axis=1)

## Importing model

### Hold-out split

In [18]:
# define X, y
# Run the cleaning pipeline once and take both the features and the target
# from that single result, so the work is not repeated and X and y are
# guaranteed to come from the same frame.
cleaned = df_cleaning(df)
X = cleaned.drop("median_house_value", axis=1)
y = cleaned["median_house_value"]

# Splitting data set: 33% held out for testing, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Model Execution

### Load the model and compute its score

In [None]:
# Load the previously saved model. The file is opened in a `with` block so
# the handle is closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
with open('../pickle/model_pickle.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Score the model on the held-out split; as the last expression of the cell
# the value is displayed by the notebook.
model.score(X_test, y_test)

----------------