In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings

# Silence library warnings so cell output stays readable.
# NOTE: this also hides deprecation warnings raised by pandas / scikit-learn.
warnings.filterwarnings('ignore')
# Use the fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options in newer pandas releases,
# which makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

from prepare_2 import wrangle_data, split_data

# seed handed to the decision tree estimators below (random_state=...) so fits are repeatable
random_state = 42

In [2]:
df, sale_df, rent_df = wrangle_data()

In [3]:
# column label of the variable being predicted (read as train[target] by the modeling functions)
target = 'newconstructionyn'
# target value treated as the positive class (passed as pos_label to precision / recall / f1)
positive = 1

## Prep for Modeling

### Drop columns not used in modeling

In [4]:
# columns excluded from modeling, each with the reason it is removed
cols_to_drop = [
    'address_id',                 # unique identifier, not useful as a feature
    'listingcontractdate',        # the engineered date features are used instead
    'originallistprice',          # the scaled prices are used instead
    'originallistprice_persqft',  # the scaled prices are used instead
]

df = df.drop(columns=cols_to_drop)

### Encode categorical variables

In [5]:
def encode_data(df):
    '''
    One-hot encode the multi-valued categorical columns of the dataset.

    'propertytype' and 'propertysubtype' are each expanded with pandas
    get_dummies (first level dropped, no separate column for missing values).
    The dummy columns are placed to the right of the remaining columns, in the
    order the categorical columns are listed, and the two un-encoded source
    columns are removed. The dataframe passed in is not modified; a new
    dataframe is returned.
    '''
    # categorical variables (that aren't already binary True/False)
    categorical_cols = ['propertytype', 'propertysubtype']

    # one block of dummy columns per categorical feature, prefixed with its name
    dummy_blocks = [
        pd.get_dummies(df[name], prefix=name, drop_first=True, dummy_na=False)
        for name in categorical_cols
    ]

    # swap the raw categorical columns for their encoded counterparts
    remaining = df.drop(columns=categorical_cols)
    return pd.concat([remaining, *dummy_blocks], axis=1)

df = encode_data(df)

### Turn boolean columns into 1/0

In [6]:
def convert_bools(df):
    '''
    Recode every boolean column of the dataset as integer 1 / 0.

    Columns whose dtype is 'bool' are overwritten on the dataframe that was
    passed in (True -> 1, False -> 0); every other column is left untouched.
    That same dataframe object is returned.
    '''
    # names of the columns stored with the boolean dtype
    bool_cols = [name for name, dtype in df.dtypes.items() if dtype == 'bool']
    # True/False become 1/0
    for name in bool_cols:
        df[name] = df[name].astype('int64')
    return df

df = convert_bools(df)

### Split the Data: Train & Test

In [7]:
train, test = split_data(df)

### Scale quantitative variables


In [8]:
def scale_data(train, test, scaler_type=None, cols_to_scale=None):
    '''
    Scale the quantitative features of the train and test samples.

    The scaler is fitted on the train sample only and then applied to both
    samples. Each scaled column is named 'scaled_<original name>' and placed
    to the right of the untouched columns; the original un-scaled columns are
    dropped. The input dataframes are not modified.

    Parameters
    ----------
    train, test : DataFrame
        Samples to scale. Both must contain every column in cols_to_scale.
    scaler_type : scaler object, optional
        Object exposing fit(X) (returning the fitted scaler) and transform(X).
        When omitted, a new MinMaxScaler is created on every call, so no
        fitted state is shared between calls.
    cols_to_scale : list of str, optional
        Columns to scale. Defaults to the project's quantitative features.

    Returns
    -------
    (train, test) : tuple of DataFrame
        The scaled samples, in that order.
    '''
    # build the default scaler per call instead of once at definition time
    if scaler_type is None:
        scaler_type = MinMaxScaler()

    # identify quantitative features to scale (that aren't already scaled)
    if cols_to_scale is None:
        cols_to_scale = [
                         'lotsizearea',
                         'bedroomstotal',
                         'bathroomstotalinteger',
                         'bathroomsfull',
                         'bathroomshalf',
                         'livingarea',
                         'stories',
                         'yearbuilt',
                         'years_since_build',
                         'garage_size',
                         'central_cooling_units',
                         'windowwall_cooling_units',
                         'listing_month',
                         'listing_dayofmonth',
                         'listing_dayofweek'
                        ]
    else:
        cols_to_scale = list(cols_to_scale)

    # fit on train only, so nothing about the test sample leaks into the scaler
    scaler = scaler_type.fit(train[cols_to_scale])

    scaled_names = [f'scaled_{feature}' for feature in cols_to_scale]

    def _apply(sample):
        # np.asarray keeps the values positional even if the scaler returns a dataframe
        scaled = pd.DataFrame(np.asarray(scaler.transform(sample[cols_to_scale])),
                              index=sample.index,
                              columns=scaled_names)
        # replace the original columns with their scaled versions
        return pd.concat([sample.drop(columns=cols_to_scale), scaled], axis=1)

    return _apply(train), _apply(test)

train, test = scale_data(train, test)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3002 entries, 1931 to 1859
Data columns (total 66 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   garageyn                                3002 non-null   int64  
 1   newconstructionyn                       3002 non-null   int64  
 2   listed_on_weekend                       3002 non-null   int64  
 3   previously_listed                       3002 non-null   int64  
 4   built_last_two_years                    3002 non-null   int64  
 5   parkingfeatures_attached                3002 non-null   int64  
 6   parkingfeatures_detached                3002 non-null   int64  
 7   parkingfeatures_oversized               3002 non-null   int64  
 8   parkingfeatures_converted               3002 non-null   int64  
 9   parkingfeatures_sideentry               3002 non-null   int64  
 10  parkingfeatures_rearentry               3002 non-null   i

## Baseline Predictions

#### A simple baseline - predicting the most common class

Since the majority of properties are not new construction, our simplest baseline would be to predict 0 for each property. 

In [10]:
def run_baseline_1(train,
                   target,
                   positive,
                   model_number,
                   model_info,
                   model_results):
    '''
    Record a baseline that predicts the most common class for every row.

    Parameters
    ----------
    train : DataFrame
        Train sample; must contain the target column.
    target : str
        Label of the target column.
    positive : label
        Value of the target treated as the positive class (pos_label).
    model_number : any
        Accepted for a uniform call signature; the incoming value is ignored.
    model_info : DataFrame
        Table of model descriptions; one row is added for this baseline.
    model_results : DataFrame
        Tidy table of scores; four rows are added (accuracy, precision,
        recall, f1_score), all measured on the train sample.

    Returns
    -------
    (model_number, model_info, model_results)
        model_number is always returned as 0 so numbered models start from 1;
        the two dataframes are new objects with the rows added.
    '''
    y_train = train[target]

    # baselines are identified by name rather than by a running number
    baseline_label = 'baseline_1'

    # store info about the model
    # (pd.concat is used because DataFrame.append was removed in pandas 2.0)
    info_row = {'model_number': baseline_label,
                'model_type': baseline_label}
    model_info = pd.concat([model_info, pd.DataFrame([info_row])], ignore_index=True)

    # establish baseline predictions for train sample: the modal class, repeated
    y_pred = np.full(len(y_train), y_train.mode()[0])

    # one tidy row per metric, in the order they are reported
    metric_rows = [
        {'model_number': baseline_label,
         'metric_type': 'accuracy',
         'score': sk.metrics.accuracy_score(y_train, y_pred)},
        {'model_number': baseline_label,
         'metric_type': 'precision',
         'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)},
        {'model_number': baseline_label,
         'metric_type': 'recall',
         'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)},
        {'model_number': baseline_label,
         'metric_type': 'f1_score',
         'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)},
    ]
    model_results = pd.concat([model_results, pd.DataFrame(metric_rows)], ignore_index=True)

    # reset the counter: baselines do not consume a model number
    model_number = 0

    return model_number, model_info, model_results

In [11]:
# setting up the infrastructure to store information about our models
# running counter used to number the non-baseline models
model_number = 0
# one row per model: its number, its type and, for the tree models, features / max_depth
model_info = pd.DataFrame()
# tidy score table: one row per (model_number, metric_type) pair
model_results = pd.DataFrame()

In [12]:
model_number, model_info, model_results = run_baseline_1(train, target, positive,
                                                         model_number, model_info, model_results)

#### An alternative baseline - predicting the positive class

However, if our goal is to maximize recall, i.e. to identify as many new-construction properties as possible, a more reasonable baseline might be to treat every property as if it were new construction (i.e. always predict 1).

In [13]:
def run_baseline_2(train,
                   target,
                   positive,
                   model_number,
                   model_info,
                   model_results):
    '''
    Record an alternative baseline that maximizes recall by predicting the
    positive class for every row.

    Parameters
    ----------
    train : DataFrame
        Train sample; must contain the target column.
    target : str
        Label of the target column.
    positive : label
        Value of the target treated as the positive class. It is used both as
        the constant prediction and as pos_label for the metrics.
    model_number : any
        Accepted for a uniform call signature; the incoming value is ignored.
    model_info : DataFrame
        Table of model descriptions; one row is added for this baseline.
    model_results : DataFrame
        Tidy table of scores; four rows are added (accuracy, precision,
        recall, f1_score), all measured on the train sample.

    Returns
    -------
    (model_number, model_info, model_results)
        model_number is always returned as 0 so numbered models start from 1;
        the two dataframes are new objects with the rows added.
    '''
    y_train = train[target]

    # baselines are identified by name rather than by a running number
    baseline_label = 'baseline_2'

    # store info about the model
    # (pd.concat is used because DataFrame.append was removed in pandas 2.0)
    info_row = {'model_number': baseline_label,
                'model_type': baseline_label}
    model_info = pd.concat([model_info, pd.DataFrame([info_row])], ignore_index=True)

    # establish baseline predictions for train sample: always the positive label
    # (previously a hard-coded 1, which is only right when positive == 1)
    y_pred = np.full(len(y_train), positive)

    # one tidy row per metric, in the order they are reported
    metric_rows = [
        {'model_number': baseline_label,
         'metric_type': 'accuracy',
         'score': sk.metrics.accuracy_score(y_train, y_pred)},
        {'model_number': baseline_label,
         'metric_type': 'precision',
         'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)},
        {'model_number': baseline_label,
         'metric_type': 'recall',
         'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)},
        {'model_number': baseline_label,
         'metric_type': 'f1_score',
         'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)},
    ]
    model_results = pd.concat([model_results, pd.DataFrame(metric_rows)], ignore_index=True)

    # reset the counter: baselines do not consume a model number
    model_number = 0

    return model_number, model_info, model_results

In [14]:
model_number, model_info, model_results = run_baseline_2(train, target, positive,
                                                         model_number, model_info, model_results)

In [15]:
model_results

Unnamed: 0,model_number,metric_type,score
0,baseline_1,accuracy,0.792472
1,baseline_1,precision,0.0
2,baseline_1,recall,0.0
3,baseline_1,f1_score,0.0
4,baseline_2,accuracy,0.207528
5,baseline_2,precision,0.207528
6,baseline_2,recall,1.0
7,baseline_2,f1_score,0.343724


## An Intuitive Baseline Model

#### Using the most intuitive way of determining new construction as a more effective baseline

Exploration found that our most reliable feature is whether the build year of the property is within two calendar years of the listing date. This is also very intuitive - any reasonable person looking at a listing and trying to guess whether it was new construction would look for a recent build year. If our eventual model cannot predict more reliably than that, it will not be of much use. 

In [16]:
def run_model_1(train,
                target,
                positive,
                model_number,
                model_info,
                model_results):
    '''
    Record a rule-based model that predicts new construction from the
    'built_last_two_years' column alone.

    The prediction for each row is simply the value of
    train['built_last_two_years']; no estimator is fitted. This gives a more
    useful baseline against which later, more complex models are compared.

    Parameters
    ----------
    train : DataFrame
        Train sample; must contain the target column and 'built_last_two_years'.
    target : str
        Label of the target column.
    positive : label
        Value of the target treated as the positive class (pos_label).
    model_number : int
        Number of the most recently recorded model; this model gets the next one.
    model_info : DataFrame
        Table of model descriptions; one row is added.
    model_results : DataFrame
        Tidy table of scores; four rows are added (accuracy, precision,
        recall, f1_score), all measured on the train sample.

    Returns
    -------
    (model_number, model_info, model_results)
        The incremented model number and new dataframes with the rows added.
    '''
    y_train = train[target]

    # identify model number
    model_number += 1

    # store info about the model
    # (pd.concat is used because DataFrame.append was removed in pandas 2.0)
    info_row = {'model_number': model_number,
                'model_type': 'simple build year'}
    model_info = pd.concat([model_info, pd.DataFrame([info_row])], ignore_index=True)

    # establish predictions for train sample: the flag itself is the prediction
    y_pred = train['built_last_two_years']

    # one tidy row per metric, in the order they are reported
    metric_rows = [
        {'model_number': model_number,
         'metric_type': 'accuracy',
         'score': sk.metrics.accuracy_score(y_train, y_pred)},
        {'model_number': model_number,
         'metric_type': 'precision',
         'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)},
        {'model_number': model_number,
         'metric_type': 'recall',
         'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)},
        {'model_number': model_number,
         'metric_type': 'f1_score',
         'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)},
    ]
    model_results = pd.concat([model_results, pd.DataFrame(metric_rows)], ignore_index=True)

    return model_number, model_info, model_results

In [17]:
model_number, model_info, model_results = run_model_1(train, target, positive,
                                                      model_number, model_info, model_results)

In [18]:
def display_model_results(model_results):
    '''
    Reshape the tidy model_results dataframe into a wide comparison table.

    model_results is read through three columns:
    - model_number (identifies the model)
    - metric_type (accuracy, precision, recall, f1 score)
    - score (the score for the given model and metric)

    The returned pivot table has one row per metric type and one column per
    model number, each cell holding the matching score, for easy side-by-side
    comparison of models.
    '''
    # each (metric, model) cell holds a single score, so the "aggregation"
    # just hands that score back without combining anything
    wide_results = model_results.pivot_table(values='score',
                                             index='metric_type',
                                             columns='model_number',
                                             aggfunc=lambda scores: scores)
    return wide_results

In [19]:
model_results

Unnamed: 0,model_number,metric_type,score
0,baseline_1,accuracy,0.792472
1,baseline_1,precision,0.0
2,baseline_1,recall,0.0
3,baseline_1,f1_score,0.0
4,baseline_2,accuracy,0.207528
5,baseline_2,precision,0.207528
6,baseline_2,recall,1.0
7,baseline_2,f1_score,0.343724
8,1,accuracy,0.974017
9,1,precision,0.901325


In [20]:
display_model_results(model_results).T

metric_type,accuracy,f1_score,precision,recall
model_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.974017,0.940092,0.901325,0.982343
baseline_1,0.792472,0.0,0.0,0.0
baseline_2,0.207528,0.343724,0.207528,1.0


## ML Modeling

Now I will do some simple machine learning modeling to see if we can improve upon the performance of our basic build-year-only model. 

In [21]:
# first feature set uses only those which appeared to be meaningful during exploration
# NOTE(review): every name must be a column of train after encoding / scaling;
# 'cooling_windowwall' and 'originallistprice_persqft_scaled' are not created in
# this notebook - presumably they come from wrangle_data; confirm.
features1 = ['built_last_two_years',
             'previously_listed',
             'scaled_stories',
             'cooling_windowwall',
             'originallistprice_persqft_scaled'
            ]

# list of feature lists; decision_tree fits one family of models per entry
feature_sets = [features1]

### Decision Tree

In [22]:
def decision_tree(train,
                  target, 
                  positive,
                  feature_sets,
                  model_number, 
                  model_info, 
                  model_results):
    '''
    Fit and score a family of decision tree classifiers on the train sample.

    For every feature set, one DecisionTreeClassifier is fitted for each
    max_depth from 3 to 10 (inclusive), using the notebook-level random_state.
    Each model is scored on the train sample it was fitted on.

    Parameters
    ----------
    train : DataFrame
        Train sample; must contain the target and every listed feature.
    target : str
        Label of the target column.
    positive : label
        Value of the target treated as the positive class (pos_label).
    feature_sets : iterable of list of str
        Feature lists to try; one family of models is built per entry.
    model_number : int
        Number of the most recently recorded model; new models continue from it.
    model_info : DataFrame
        Table of model descriptions; one row is added per model
        (model_number, model_type, features, max_depth).
    model_results : DataFrame
        Tidy table of scores; four rows are added per model
        (accuracy, precision, recall, f1_score).

    Returns
    -------
    (model_number, model_info, model_results)
        The last model number used and new dataframes with the rows added.

    Side effects
    ------------
    After every model, both tables are written to 'model_info.csv' and
    'model_results.csv' in the working directory, so the files always include
    the most recently completed model.
    '''
    # establish hyperparameter ranges
    min_max_depth = 3
    max_max_depth = 10

    # the target is the same for every feature set and depth
    y_train = train[target]

    for features in feature_sets:
        # the feature matrix only changes with the feature set
        x_train = train[features]

        for max_depth in range(min_max_depth, max_max_depth + 1):

            ##################
            ### Model Info ###
            ##################

            # create a new model number by adding 1 to the previous model number
            model_number += 1

            # features and hyperparameters used in this model instance
            # (pd.concat is used because DataFrame.append was removed in pandas 2.0;
            #  wrapping the dict in a list keeps the feature list in a single cell)
            info_row = {'model_number': model_number,
                        'model_type': 'decision tree',
                        'features': features,
                        'max_depth': max_depth}
            model_info = pd.concat([model_info, pd.DataFrame([info_row])], ignore_index=True)

            ################
            ### Modeling ###
            ################

            # decision tree with the given max depth; random state set for reproducibility
            clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
            clf = clf.fit(x_train, y_train)

            #####################
            ### Model Results ###
            #####################

            # predictions on the same train sample the model was fitted on
            y_pred = clf.predict(x_train)

            # one tidy row per metric, in the order they are reported
            metric_rows = [
                {'model_number': model_number,
                 'metric_type': 'accuracy',
                 'score': sk.metrics.accuracy_score(y_train, y_pred)},
                {'model_number': model_number,
                 'metric_type': 'precision',
                 'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)},
                {'model_number': model_number,
                 'metric_type': 'recall',
                 'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)},
                {'model_number': model_number,
                 'metric_type': 'f1_score',
                 'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)},
            ]
            model_results = pd.concat([model_results, pd.DataFrame(metric_rows)],
                                      ignore_index=True)

            # cache completed model info / model results; done after the model is
            # recorded so the final model is persisted as well
            model_info.to_csv('model_info.csv')
            model_results.to_csv('model_results.csv')

    return model_number, model_info, model_results

In [23]:
model_number, model_info, model_results = decision_tree(train, target, positive, feature_sets,
                                                        model_number, model_info, model_results)

In [24]:
model_info

Unnamed: 0,model_number,model_type,features,max_depth
0,baseline_1,baseline_1,,
1,baseline_2,baseline_2,,
2,1,simple build year,,
3,2,decision tree,"[built_last_two_years, previously_listed, scal...",3.0
4,3,decision tree,"[built_last_two_years, previously_listed, scal...",4.0
5,4,decision tree,"[built_last_two_years, previously_listed, scal...",5.0
6,5,decision tree,"[built_last_two_years, previously_listed, scal...",6.0
7,6,decision tree,"[built_last_two_years, previously_listed, scal...",7.0
8,7,decision tree,"[built_last_two_years, previously_listed, scal...",8.0
9,8,decision tree,"[built_last_two_years, previously_listed, scal...",9.0


In [25]:
display_model_results(model_results)

model_number,1,2,3,4,5,6,7,8,9,baseline_1,baseline_2
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accuracy,0.974017,0.97535,0.975683,0.976682,0.977015,0.977015,0.977015,0.977015,0.978015,0.792472,0.207528
f1_score,0.940092,0.942724,0.943542,0.94582,0.946636,0.946636,0.946636,0.946636,0.948758,0.0,0.343724
precision,0.901325,0.910314,0.910448,0.913303,0.913433,0.913433,0.913433,0.913433,0.918797,0.0,0.207528
recall,0.982343,0.977528,0.979133,0.980738,0.982343,0.982343,0.982343,0.982343,0.980738,0.0,1.0


### Decision Tree - RFE

Next we'll attempt to use automated feature selection in the Decision Tree model to determine the most important features. 

In [26]:
def rfe_decision_tree(train,
                      target, 
                      positive, 
                      model_number, 
                      model_info, 
                      model_results):
    '''
    Fit and score decision trees on features chosen by recursive feature
    elimination (RFE).

    For every combination of n_features (2 to 12, inclusive) and max_depth
    (3 to 10, inclusive), RFE wrapped around a DecisionTreeClassifier of that
    depth selects n_features columns from all non-target columns of train. A
    fresh tree of the same depth is then fitted on just those columns and
    scored on the train sample. Both trees use the notebook-level random_state.

    Parameters
    ----------
    train : DataFrame
        Train sample; every column except the target is a candidate feature.
    target : str
        Label of the target column.
    positive : label
        Value of the target treated as the positive class (pos_label).
    model_number : int
        Number of the most recently recorded model; new models continue from it.
    model_info : DataFrame
        Table of model descriptions; one row is added per model
        (model_number, model_type, features, max_depth).
    model_results : DataFrame
        Tidy table of scores; four rows are added per model
        (accuracy, precision, recall, f1_score).

    Returns
    -------
    (model_number, model_info, model_results)
        The last model number used and new dataframes with the rows added.

    Side effects
    ------------
    After every model, both tables are written to 'model_info.csv' and
    'model_results.csv' in the working directory, so the files always include
    the most recently completed model.
    '''
    # all available features
    all_features = [col for col in train.columns if col != target]

    # x (features) and y (target) used for feature selection and for scoring
    x_train_rfe = train[all_features]
    y_train = train[target]

    # establish hyperparameter ranges
    min_n_features = 2
    max_n_features = 12

    min_max_depth = 3
    max_max_depth = 10

    for n_features in range(min_n_features, max_n_features + 1):
        for max_depth in range(min_max_depth, max_max_depth + 1):

            #####################################
            ### Recursive Feature Elimination ###
            #####################################

            # RFE ranks features using a decision tree of the current depth
            selector = RFE(DecisionTreeClassifier(max_depth=max_depth,
                                                  random_state=random_state),
                           n_features_to_select=n_features)
            selector.fit(x_train_rfe, y_train)

            # column names of the selected features (support_ is a boolean mask)
            features = x_train_rfe.columns[selector.support_].tolist()

            ##################
            ### Model Info ###
            ##################

            # create a new model number by adding 1 to the previous model number
            model_number += 1

            # features and hyperparameters used in this model instance
            # (pd.concat is used because DataFrame.append was removed in pandas 2.0;
            #  wrapping the dict in a list keeps the feature list in a single cell)
            info_row = {'model_number': model_number,
                        'model_type': 'decision tree',
                        'features': features,
                        'max_depth': max_depth}
            model_info = pd.concat([model_info, pd.DataFrame([info_row])], ignore_index=True)

            ################
            ### Modeling ###
            ################

            x_train = train[features]

            # same shared seed as the RFE estimator (was a hard-coded 42)
            clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
            clf = clf.fit(x_train, y_train)

            #####################
            ### Model Results ###
            #####################

            # predictions on the same train sample the model was fitted on
            y_pred = clf.predict(x_train)

            # one tidy row per metric, in the order they are reported
            metric_rows = [
                {'model_number': model_number,
                 'metric_type': 'accuracy',
                 'score': sk.metrics.accuracy_score(y_train, y_pred)},
                {'model_number': model_number,
                 'metric_type': 'precision',
                 'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)},
                {'model_number': model_number,
                 'metric_type': 'recall',
                 'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)},
                {'model_number': model_number,
                 'metric_type': 'f1_score',
                 'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)},
            ]
            model_results = pd.concat([model_results, pd.DataFrame(metric_rows)],
                                      ignore_index=True)

            # cache completed model info / model results; done after the model is
            # recorded so the final model is persisted as well
            model_info.to_csv('model_info.csv')
            model_results.to_csv('model_results.csv')

    return model_number, model_info, model_results


In [27]:
model_number,  model_info, model_results = rfe_decision_tree(train, target, positive, 
                                                             model_number, model_info, model_results)

In [28]:
model_info

Unnamed: 0,model_number,model_type,features,max_depth
0,baseline_1,baseline_1,,
1,baseline_2,baseline_2,,
2,1,simple build year,,
3,2,decision tree,"[built_last_two_years, previously_listed, scal...",3.0
4,3,decision tree,"[built_last_two_years, previously_listed, scal...",4.0
5,4,decision tree,"[built_last_two_years, previously_listed, scal...",5.0
6,5,decision tree,"[built_last_two_years, previously_listed, scal...",6.0
7,6,decision tree,"[built_last_two_years, previously_listed, scal...",7.0
8,7,decision tree,"[built_last_two_years, previously_listed, scal...",8.0
9,8,decision tree,"[built_last_two_years, previously_listed, scal...",9.0


In [29]:
display_model_results(model_results)

model_number,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,baseline_1,baseline_2
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
accuracy,0.974017,0.97535,0.975683,0.976682,0.977015,0.977015,0.977015,0.977015,0.978015,0.988008,0.990007,0.990007,0.990673,0.994337,0.995336,0.996336,0.997668,0.989674,0.990007,0.99034,0.990673,0.996336,0.996669,0.999334,0.999667,0.989674,0.99034,0.991006,0.992672,0.994337,0.996003,0.997002,0.998001,0.989674,0.991006,0.991672,0.992672,0.993338,0.99567,0.996003,0.997335,0.989674,0.991006,0.992338,0.993671,0.994337,0.99567,0.996336,0.997668,0.989674,0.991339,0.992672,0.993338,0.99467,0.99567,0.996336,0.997668,0.989674,0.991006,0.992672,0.993671,0.995003,0.99567,0.996336,0.997668,0.989674,0.991672,0.993338,0.994337,0.995003,0.99567,0.996336,0.998001,0.989674,0.991672,0.993338,0.99467,0.995336,0.99567,0.996336,0.997668,0.989674,0.991672,0.993338,0.99467,0.995336,0.99567,0.997002,0.997668,0.989674,0.991672,0.993338,0.99467,0.995336,0.996336,0.997668,0.998001,0.792472,0.207528
f1_score,0.940092,0.942724,0.943542,0.94582,0.946636,0.946636,0.946636,0.946636,0.948758,0.970636,0.97561,0.97561,0.977199,0.986235,0.988673,0.991108,0.99435,0.974899,0.975728,0.976556,0.977199,0.99115,0.991935,0.998395,0.999197,0.97502,0.976594,0.978102,0.982172,0.986279,0.990307,0.992748,0.995169,0.97502,0.978208,0.97979,0.982172,0.983819,0.989508,0.990307,0.993548,0.97502,0.978278,0.981437,0.984615,0.986257,0.989508,0.991122,0.99435,0.97502,0.979066,0.982229,0.983793,0.987076,0.989508,0.991122,0.99435,0.97502,0.978278,0.982229,0.984615,0.987893,0.989508,0.991122,0.99435,0.97502,0.979855,0.983819,0.986301,0.987874,0.989508,0.991108,0.995161,0.97502,0.979855,0.983819,0.987097,0.988691,0.989491,0.991122,0.994359,0.97502,0.979855,0.983819,0.987076,0.988691,0.989508,0.992748,0.994359,0.97502,0.979855,0.983819,0.987076,0.988691,0.991122,0.99435,0.995161,0.0,0.343724
precision,0.901325,0.910314,0.910448,0.913303,0.913433,0.913433,0.913433,0.913433,0.918797,0.986733,0.988468,0.988468,0.991736,0.995098,0.996737,0.998371,1.0,0.98366,0.983687,0.983713,0.991736,0.993548,0.996759,0.998395,1.0,0.978964,0.982143,0.988525,0.991817,0.991883,0.996748,0.996764,0.998384,0.978964,0.983766,0.986971,0.991817,0.991843,0.99513,0.996748,0.998379,0.978964,0.980645,0.987013,0.993464,0.993485,0.99513,0.996753,1.0,0.978964,0.982229,0.988618,0.993453,0.993496,0.99513,0.996753,1.0,0.978964,0.980645,0.988618,0.993464,0.993506,0.99513,0.996753,1.0,0.978964,0.983819,0.991843,0.990291,0.995114,0.99513,0.998371,1.0,0.978964,0.983819,0.991843,0.991896,0.995122,0.996743,0.996753,0.998382,0.978964,0.983819,0.991843,0.993496,0.995122,0.99513,0.996764,0.998382,0.978964,0.983819,0.991843,0.993496,0.995122,0.996753,1.0,1.0,0.0,0.207528
recall,0.982343,0.977528,0.979133,0.980738,0.982343,0.982343,0.982343,0.982343,0.980738,0.955056,0.963082,0.963082,0.963082,0.977528,0.980738,0.983949,0.988764,0.966292,0.967897,0.969502,0.963082,0.988764,0.987159,0.998395,0.998395,0.971108,0.971108,0.967897,0.972713,0.980738,0.983949,0.988764,0.991974,0.971108,0.972713,0.972713,0.972713,0.975923,0.983949,0.983949,0.988764,0.971108,0.975923,0.975923,0.975923,0.979133,0.983949,0.985554,0.988764,0.971108,0.975923,0.975923,0.974318,0.980738,0.983949,0.985554,0.988764,0.971108,0.975923,0.975923,0.975923,0.982343,0.983949,0.985554,0.988764,0.971108,0.975923,0.975923,0.982343,0.980738,0.983949,0.983949,0.990369,0.971108,0.975923,0.975923,0.982343,0.982343,0.982343,0.985554,0.990369,0.971108,0.975923,0.975923,0.980738,0.982343,0.983949,0.988764,0.990369,0.971108,0.975923,0.975923,0.980738,0.982343,0.985554,0.988764,0.990369,0.0,1.0


In [30]:
model_results

Unnamed: 0,model_number,metric_type,score
0,baseline_1,accuracy,0.792472
1,baseline_1,precision,0.000000
2,baseline_1,recall,0.000000
3,baseline_1,f1_score,0.000000
4,baseline_2,accuracy,0.207528
...,...,...,...
391,96,f1_score,0.994350
392,97,accuracy,0.998001
393,97,precision,1.000000
394,97,recall,0.990369


In [38]:
def get_best_model_results(model_results, metric_type='accuracy', n_models=4):
    '''
    This function takes in the model_results dataframe. This is a dataframe in tidy 
    data format with one row per model and metric, containing the following columns:
    - model_number (identifier of the model)
    - metric_type (e.g. accuracy, precision, recall, f1_score)
    - score (the score for the given model and metric type)

    The function identifies the {n_models} models with the highest scores for the given
    metric type and returns every row of model_results (all metric types) that belongs
    to those models, in their original row order and in the same tidy format.

    If fewer than n_models models have a score for metric_type, all of them are returned.
    If no rows match metric_type, an empty dataframe is returned.

    NOTE: no filtering by sample type (train / validate) happens here; the function
    ranks whatever scores it is given. Presumably model_results holds validate scores
    only -- confirm against the code that builds it.

    Parameters:
    - model_results: dataframe with model_number, metric_type and score columns
    - metric_type: value of the metric_type column used to rank the models
    - n_models: how many of the top ranked models to keep (default 4)

    The resulting dataframe can be fed into the display_model_results function for convenient display formatting.
    '''
    # rank the rows for the requested metric from the highest score to the lowest
    ranked_scores = (model_results[model_results.metric_type == metric_type]
                     .sort_values(by='score', ascending=False))

    # model numbers of the n_models best scoring rows, as an array
    best_models = ranked_scores.head(n_models).model_number.values

    # keep every metric row belonging to any of the best models; isin handles any
    # number of models, so n_models is honored and short inputs do not raise IndexError
    best_model_results = model_results[model_results.model_number.isin(best_models)]

    return best_model_results

In [45]:
# pull the tidy rows for the top models ranked by recall, then pivot them for display.
# NOTE: baseline_2 can show up in this table because its recall is 1.0 even though
# its accuracy and precision are poor, so read recall alongside the other metrics
best_recall_results = get_best_model_results(model_results, metric_type='recall')
display_model_results(best_recall_results).T

metric_type,accuracy,f1_score,precision,recall
model_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24,0.999334,0.998395,0.998395,0.998395
25,0.999667,0.999197,1.0,0.998395
33,0.998001,0.995169,0.998384,0.991974
baseline_2,0.207528,0.343724,0.207528,1.0


In [42]:
# look up the model type, features and max_depth for the non-baseline models
# that appear in the best-recall table
top_recall_model_numbers = [24, 25, 33]
model_info[model_info.model_number.isin(top_recall_model_numbers)]

Unnamed: 0,model_number,model_type,features,max_depth
25,24,decision tree,"[originallistprice_scaled, scaled_livingarea, ...",9.0
26,25,decision tree,"[originallistprice_scaled, scaled_livingarea, ...",10.0
34,33,decision tree,"[originallistprice_scaled, scaled_livingarea, ...",10.0


### Random Forest - RFE