# Import libraries

In [1]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tpot import TPOTRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve



# Data Fetching

In [13]:
file_path= os.path.join("../data/data_house_market.csv") # Filepath of CSV file

In [14]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.8,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.6,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.1,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND


# Data Cleaning

In [15]:
#dropping 'Unnamed: 0' column
df = df.drop(['Unnamed: 0'], axis=1)

In [16]:
#converting median_income to a dollar unit instead of a ten thousand dollar unit
ten_thousand_dollar_unit = 10000
df['median_income'] = df.median_income * ten_thousand_dollar_unit

0    14817.0
1    69133.0
2    15536.0
3    15284.0
4    40815.0
Name: median_income, dtype: float64

In [17]:
df = df.interpolate()

In [21]:
# Using get_dummies() pandas method to return a dataframe with ocean_proximity instances as dummy variables.
dummy_ocn_prx = pd.get_dummies(df.ocean_proximity)

       <1H OCEAN  INLAND  ISLAND  NEAR BAY  NEAR OCEAN
0              0       1       0         0           0
1              1       0       0         0           0
2              0       1       0         0           0
3              1       0       0         0           0
4              0       1       0         0           0
...          ...     ...     ...       ...         ...
16507          0       1       0         0           0
16508          0       0       0         1           0
16509          0       1       0         0           0
16510          1       0       0         0           0
16511          0       0       0         0           1

[16512 rows x 5 columns]


In [22]:
# merging encoded feature instances
df = pd.merge(
    left=df,
    right=dummy_ocn_prx,
    left_index=True,
    right_index=True,
    )

In [24]:
df = df.drop('ocean_proximity', axis=1)

# Feature Selections

In [25]:
X = df.drop('median_house_value', axis=1)  # List of features which are required for model training
y = df['median_house_value']  # Target feature for prediction

# ML Modeling

## TPOT Regressor

### Data Splitting

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

In [27]:
pipeline_optimizer = TPOTRegressor(generations=5, population_size=20, cv=5, verbosity=2, scoring='r2')
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7869124058854415

Generation 2 - Current best internal CV score: 0.7888924158541023

Generation 3 - Current best internal CV score: 0.7888924158541023

Generation 4 - Current best internal CV score: 0.8068810690635541

Generation 5 - Current best internal CV score: 0.8092607835057766

Best pipeline: KNeighborsRegressor(RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=5, min_samples_split=7, n_estimators=100), n_neighbors=59, p=1, weights=uniform)


TPOTRegressor(generations=5, population_size=20, scoring='r2', verbosity=2)

### Model export

In [28]:
pipeline_optimizer.export('../models/tpot_house_market_prediction.py')

### Model instantiation

### Model performance

In [None]:
# Get train scores, train sizes, and validation scores using `learning_curve`, recall
def learning_curves(model, features, target):
    train_sizes, train_scores, test_scores = learning_curve(estimator = model,
                                                            X = features,
                                                            y = target,
                                                            train_sizes = [5,10,50,100,200,500,1000,2000,3000,5000],
                                                            cv = 5,
                                                            scoring='r2',
                                                            shuffle = True,
                                                            random_state=3)

    # Take the mean of cross-validated train scores and validation scores
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    # Plot the learning curves!
    plt.plot(train_sizes, train_scores_mean, label = 'Training score')
    plt.plot(train_sizes, test_scores_mean, label = 'Test score')
    plt.ylabel('Recall', fontsize = 14)
    plt.xlabel('Training set size', fontsize = 14)
    plt.title('Learning curves - log model', fontsize = 18, y = 1.03)
    plt.legend()
    plt.show()
