In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)

from prepare import wrangle_data, split_data

random_state = 42

In [2]:
df, sale_df, rent_df = wrangle_data()

In [3]:
target = 'newconstructionyn'
positive = 1

## Prep for Modeling

### Drop columns not used in modeling

In [4]:
cols_to_drop = [
                'address_id',               # unique identifier not useful
                'listingcontractdate',      # we'll use engineered date features instead
                'originallistprice',         # we'll use the scaled prices instead
                'originallistprice_persqft', # we'll use the scaled prices instead
                'originallistprice_scaled', # we'll use the persqft prices instead
                ]

df = df.drop(columns=cols_to_drop)

### Encode categorical variables

In [5]:
def encode_data(df):
    '''
    This function takes in our dataset and encodes a given set of 
    categorical features using pandas one-hot encoder. It drops
    the original un-encoded columns and returns the df. 
    '''
    # categorical variables (that aren't already binary True/False)
    cols_to_encode = [
                      'propertytype', 
                      'propertysubtype', 
                     ]
    
    # create encoded column for each feature
    for col in cols_to_encode:
        dummy_df = pd.get_dummies(df[col],
                                  prefix=df[col].name,
                                  drop_first=True,
                                  dummy_na=False)
        # add encoded column to df
        df = pd.concat([df, dummy_df], axis=1)
        # drop original column
        df = df.drop(columns=col)
        
    return df

df = encode_data(df)

### Turn boolean columns into 1/0

In [6]:
def convert_bools(df):
    '''
    This function takes in our dataset and converts all boolean columns to 1 or 0
    numeric datatypes, then returns the df.
    '''
    # identify boolean columns
    bools = [col for col in df.columns if df[col].dtype == 'bool']
    # convert to 1 or 0
    for col in bools:
        df[col] = df[col].map({True: 1, False: 0})
    return df

df = convert_bools(df)

### Split the Data: Train & Test

In [7]:
train, test = split_data(df)

### Scale quantitative variables


In [8]:
def scale_data(train, test, scaler_type=MinMaxScaler()):
    '''
    This takes in the train and test dataframes. 

    It then fits a scaler object to the train sample based on the given sample_type, applies that
    scaler to the trainand test samples, and appends the new scaled data to the 
    dataframes as additional columns with the prefix 'scaled_'. 

    train and test dataframes are returned, in that order. 
    '''
    # identify quantitative features to scale (that aren't already scaled)
    cols_to_scale = [
                     'lotsizearea', 
                     'bedroomstotal', 
                     'bathroomstotalinteger',
                     'bathroomsfull',
                     'bathroomshalf', 
                     'livingarea',
                     'stories', 
                     'yearbuilt',
                     'years_since_build', 
                     'garage_size', 
                     'central_cooling_units', 
                     'windowwall_cooling_units',
                     'listing_month',
                     'listing_dayofmonth', 
                     'listing_dayofweek'
                    ]
    
    # establish empty dataframes for storing scaled dataset
    train_scaled = pd.DataFrame(index=train.index)
    test_scaled = pd.DataFrame(index=test.index)
    
    # screate and fit the scaler
    scaler = scaler_type.fit(train[cols_to_scale])
    
    # adding scaled features to scaled dataframes
    train_scaled[cols_to_scale] = scaler.transform(train[cols_to_scale])
    test_scaled[cols_to_scale] = scaler.transform(test[cols_to_scale])
    
    # add 'scaled' prefix to columns
    for feature in cols_to_scale:
        train_scaled = train_scaled.rename(columns={feature: f'scaled_{feature}'})
        test_scaled = test_scaled.rename(columns={feature: f'scaled_{feature}'})
        
    # concat scaled feature columns to original train and test df's
    train = pd.concat([train, train_scaled], axis=1)
    test = pd.concat([test, test_scaled], axis=1)
    
    # drop the original columns
    train = train.drop(columns=cols_to_scale)
    test = test.drop(columns=cols_to_scale)

    return train, test

train, test = scale_data(train, test)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3002 entries, 1931 to 1859
Data columns (total 65 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   garageyn                                3002 non-null   int64  
 1   newconstructionyn                       3002 non-null   int64  
 2   listed_on_weekend                       3002 non-null   int64  
 3   previously_listed                       3002 non-null   int64  
 4   built_last_two_years                    3002 non-null   int64  
 5   parkingfeatures_attached                3002 non-null   int64  
 6   parkingfeatures_detached                3002 non-null   int64  
 7   parkingfeatures_oversized               3002 non-null   int64  
 8   parkingfeatures_converted               3002 non-null   int64  
 9   parkingfeatures_sideentry               3002 non-null   int64  
 10  parkingfeatures_rearentry               3002 non-null   i

## Baseline Predictions

#### A simple baseline - predicting the most common class

Since the majority of properties are not new construction, our simplest baseline would be to predict 0 for each property. 

In [10]:
def run_baseline_1(train,
                   target,
                   positive,
                   model_number,
                   model_info,
                   model_results):
    '''
    This function takes in the train sample, the target variable label, the positive condition label,
    an initialized model_number variable, as well as model_info and model_results dataframes dataframes that will be used for 
    storing information about the models. It then performs the operations necessary for making baseline predictions
    on our dataset, and stores information about our baseline model in the model_info and model_results dataframes. 
    (i.e. predicts the most common class)
    The model_number, model_info, and model_results variables are returned (in that order). 
    '''

    # separate each sample into x (features) and y (target)
    x_train = train.drop(columns=target)
    y_train = train[target]


    # store baseline metrics

    # identify model number
    model_number = 'baseline_1'
    #identify model type
    model_type = 'baseline_1'

    # store info about the model

    # create a dictionary containing model number and model type
    dct = {'model_number': model_number,
           'model_type': model_type}
    # append that dictionary to the model_info dataframe
    model_info = model_info.append(dct, ignore_index=True)

    # establish baseline predictions for train sample
    y_pred = pd.Series([train[target].mode()[0]]).repeat(len(train))

    # get metrics

    # create dictionaries for each metric type for the train sample and append those dictionaries to the model_results dataframe
    dct = {'model_number': model_number, 
           'metric_type': 'accuracy',
           'score': sk.metrics.accuracy_score(y_train, y_pred)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'precision',
           'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'recall',
           'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'f1_score',
           'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    # set the model number to from 'baseline' to 0 
    model_number = 0
    
    return model_number, model_info, model_results

In [11]:
# setting up the infrastructure to store information about our models
model_number = 0
model_info = pd.DataFrame()
model_results = pd.DataFrame()

In [12]:
model_number, model_info, model_results = run_baseline_1(train, target, positive,
                                                         model_number, model_info, model_results)

#### An alternative baseline - predicting the positive class

However, if our goal is to maximize recall, i.e. identify as many new construction as possible, a more reasonable baseline might be to treat all properties as if they are new construction (i.e. always predict 1).

In [13]:
def run_baseline_2(train,
                   target,
                   positive,
                   model_number,
                   model_info,
                   model_results):
    '''
    This function takes in the train sample, the target variable label, the positive condition label,
    an initialized model_number variable, as well as model_info and model_results dataframes dataframes that will be used for 
    storing information about the models. It then performs the operations necessary for making baseline predictions
    on our dataset, and stores information about our baseline model in the model_info and model_results dataframes. 
    The model_number, model_info, and model_results variables are returned (in that order). 
    
    For this alternative baseline, we will maximize recall by always predicting 1.
    '''

    # separate each sample into x (features) and y (target)
    x_train = train.drop(columns=target)
    y_train = train[target]

    # store baseline metrics

    # identify model number
    model_number = 'baseline_2'
    #identify model type
    model_type = 'baseline_2'

    # store info about the model

    # create a dictionary containing model number and model type
    dct = {'model_number': model_number,
           'model_type': model_type}
    # append that dictionary to the model_info dataframe
    model_info = model_info.append(dct, ignore_index=True)

    # establish baseline predictions for train sample
    y_pred = pd.Series(1).repeat(len(train))

    # get metrics

    # create dictionaries for each metric type for the train sample and append those dictionaries to the model_results dataframe
    dct = {'model_number': model_number, 
           'metric_type': 'accuracy',
           'score': sk.metrics.accuracy_score(y_train, y_pred)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'precision',
           'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'recall',
           'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'f1_score',
           'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    # set the model number to from 'baseline' to 0 
    model_number = 0
    
    return model_number, model_info, model_results

In [14]:
model_number, model_info, model_results = run_baseline_2(train, target, positive,
                                                         model_number, model_info, model_results)

In [15]:
model_results

Unnamed: 0,model_number,metric_type,score
0,baseline_1,accuracy,0.792472
1,baseline_1,precision,0.0
2,baseline_1,recall,0.0
3,baseline_1,f1_score,0.0
4,baseline_2,accuracy,0.207528
5,baseline_2,precision,0.207528
6,baseline_2,recall,1.0
7,baseline_2,f1_score,0.343724


## An Intuitive Baseline Model

#### Using the most intuitive way of determining new construction as a more effective baseline

Exploration found that our most reliable feature is whether the build year of the property is within two calendar years of the listing date. This is also very intuitive - any reasonable person looking at a listing and trying to guess whether it was new construction would look for a recent build year. If our eventual model cannot predict more reliably than that, it will not be of much use. 

In [16]:
def run_model_1(train,
                target,
                positive,
                model_number,
                model_info,
                model_results):
    '''
    This function predicts whether a property is new construction based only on whether the build year is within
    two calendar years of the listing. This will create a more effective and useful baseline for which to compare
    future, more complex models. 
    
    This function takes in the train sample, the target variable label, the positive condition label,
    as well as the model_number variable and model_info and model_results dataframes. It then updates and returns
    the model_number, model_info, and model_results variables after creating and storing info about the model 
    described above.
    '''

    # separate each sample into x (features) and y (target)
    x_train = train.drop(columns=target)
    y_train = train[target]

    # identify model number
    model_number = 1
    #identify model type
    model_type = 'simple build year'

    # store info about the model

    # create a dictionary containing model number and model type
    dct = {'model_number': model_number,
           'model_type': model_type}
    # append that dictionary to the model_info dataframe
    model_info = model_info.append(dct, ignore_index=True)

    # establish predictions for train sample
    y_pred = train.built_last_two_years

    # get metrics

    # create dictionaries for each metric type for the train sample and append those dictionaries to the model_results dataframe
    dct = {'model_number': model_number, 
           'metric_type': 'accuracy',
           'score': sk.metrics.accuracy_score(y_train, y_pred)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'precision',
           'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'recall',
           'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'metric_type': 'f1_score',
           'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)
    
    return model_number, model_info, model_results

In [17]:
model_number, model_info, model_results = run_model_1(train, target, positive,
                                                      model_number, model_info, model_results)

In [18]:
def display_model_results(model_results):
    '''
    This function takes in the model_results dataframe. This is a dataframe in tidy data format 
    containing the following information for each model created in the project:
    - model number
    - metric type (accuracy, precision, recall, f1 score)
    - sample type (train, validate)
    - score (the score for the given metric and sample types)
    The function returns a pivot table of those values for easy comparison of models, metrics, and samples. 
    '''
    # create a pivot table of the model_results dataframe
    # establish columns as the model_number, with index as metric_type, and values as score
    # the aggfunc uses a lambda to return each individual score without any aggregation applied
    return model_results.pivot_table(columns='model_number', 
                                     index=('metric_type'), 
                                     values='score',
                                     aggfunc=lambda x: x)

In [19]:
model_results

Unnamed: 0,model_number,metric_type,score
0,baseline_1,accuracy,0.792472
1,baseline_1,precision,0.0
2,baseline_1,recall,0.0
3,baseline_1,f1_score,0.0
4,baseline_2,accuracy,0.207528
5,baseline_2,precision,0.207528
6,baseline_2,recall,1.0
7,baseline_2,f1_score,0.343724
8,1,accuracy,0.974017
9,1,precision,0.901325


In [20]:
display_model_results(model_results).T

metric_type,accuracy,f1_score,precision,recall
model_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.974017,0.940092,0.901325,0.982343
baseline_1,0.792472,0.0,0.0,0.0
baseline_2,0.207528,0.343724,0.207528,1.0


## ML Modeling

In [21]:
# for this first iteration we will use only those which were observed to be potentially useful
# during exploration
features = ['built_last_two_years',
             'previously_listed',
             'scaled_stories',
             'cooling_windowwall',
             'originallistprice_persqft_scaled'
            ]

### Decision Tree

In [22]:
def decision_tree(train, target, features):
    # split dataset into x (features) and y (target)
    x_train = train[features]
    y_train = train[target]

    # identify model_type
    model_type = 'decision tree'

    # set hyperparameter ranges
    parameter_space = {'max_depth': [2,3,4,5,6,7]}

    # create the classifier
    clf = DecisionTreeClassifier()

    # define scoring methods
    scoring = {'recall': make_scorer(sk.metrics.accuracy_score),
               'precision': make_scorer(sk.metrics.precision_score),
               'accuracy': make_scorer(sk.metrics.accuracy_score),
               'f1_score': make_scorer(sk.metrics.f1_score)}

    # create and fit the GridSearchCV object
    grid = GridSearchCV(clf, parameter_space, cv=5, 
                        scoring=scoring,
                        refit='recall')
    grid.fit(x_train, y_train)


    # get results and store as dataframe

    results = grid.cv_results_

    params = results['params']
    accuracy = results['mean_test_accuracy'] 
    recall = results['mean_test_recall']
    precision = results['mean_test_precision']
    F1_score = results['mean_test_f1_score']

    for par, acc, rec, prec, f1 in zip(params, accuracy, recall, precision, F1_score):
        par['model_type'] = model_type
        par['features'] = features
        par['accuracy'] = acc
        par['recall'] = rec
        par['precision'] = prec
        par['F1_score'] = f1

    decision_tree_results = pd.DataFrame(params)
    
    return decision_tree_results


In [23]:
decision_tree(train, target, features)

Unnamed: 0,max_depth,model_type,features,accuracy,recall,precision,F1_score
0,2,decision tree,"[built_last_two_years, previously_listed, scal...",0.974016,0.974016,0.901575,0.940145
1,3,decision tree,"[built_last_two_years, previously_listed, scal...",0.974682,0.974682,0.905698,0.941562
2,4,decision tree,"[built_last_two_years, previously_listed, scal...",0.975348,0.975348,0.910573,0.942777
3,5,decision tree,"[built_last_two_years, previously_listed, scal...",0.975015,0.975015,0.91041,0.941951
4,6,decision tree,"[built_last_two_years, previously_listed, scal...",0.973683,0.973683,0.908771,0.938782
5,7,decision tree,"[built_last_two_years, previously_listed, scal...",0.973016,0.973016,0.908477,0.937122


### Random Forest

In [24]:
def random_forest(train, target, features):
    # split dataset into x (features) and y (target)
    x_train = train[features]
    y_train = train[target]

    # identify model_type
    model_type = 'random forest'

    # set hyperparameter ranges
    parameter_space = {'max_depth': [2,3,4,5,6,7],
                       'min_samples_leaf': [2,3,4]}

    # create the classifier
    clf = RandomForestClassifier()

    # define scoring methods
    scoring = {'recall': make_scorer(sk.metrics.accuracy_score),
               'precision': make_scorer(sk.metrics.precision_score),
               'accuracy': make_scorer(sk.metrics.accuracy_score),
               'f1_score': make_scorer(sk.metrics.f1_score)}

    # create and fit the GridSearchCV object
    grid = GridSearchCV(clf, parameter_space, cv=5, 
                        scoring=scoring,
                        refit='recall')
    grid.fit(x_train, y_train)


    # get results and store as dataframe

    results = grid.cv_results_

    params = results['params']
    accuracy = results['mean_test_accuracy'] 
    recall = results['mean_test_recall']
    precision = results['mean_test_precision']
    F1_score = results['mean_test_f1_score']

    for par, acc, rec, prec, f1 in zip(params, accuracy, recall, precision, F1_score):
        par['model_type'] = model_type
        par['features'] = features
        par['accuracy'] = acc
        par['recall'] = rec
        par['precision'] = prec
        par['F1_score'] = f1

    random_forest_results = pd.DataFrame(params)
    
    return random_forest_results

In [25]:
random_forest(train, target, features)

Unnamed: 0,max_depth,min_samples_leaf,model_type,features,accuracy,recall,precision,F1_score
0,2,2,random forest,"[built_last_two_years, previously_listed, scal...",0.974016,0.974016,0.901575,0.940145
1,2,3,random forest,"[built_last_two_years, previously_listed, scal...",0.972019,0.972019,0.900865,0.935161
2,2,4,random forest,"[built_last_two_years, previously_listed, scal...",0.970688,0.970688,0.900268,0.931818
3,3,2,random forest,"[built_last_two_years, previously_listed, scal...",0.97335,0.97335,0.902464,0.93841
4,3,3,random forest,"[built_last_two_years, previously_listed, scal...",0.973683,0.973683,0.90261,0.939236
5,3,4,random forest,"[built_last_two_years, previously_listed, scal...",0.97335,0.97335,0.902464,0.93841
6,4,2,random forest,"[built_last_two_years, previously_listed, scal...",0.973683,0.973683,0.903726,0.939112
7,4,3,random forest,"[built_last_two_years, previously_listed, scal...",0.97335,0.97335,0.902464,0.93841
8,4,4,random forest,"[built_last_two_years, previously_listed, scal...",0.97335,0.97335,0.902464,0.93841
9,5,2,random forest,"[built_last_two_years, previously_listed, scal...",0.97335,0.97335,0.903717,0.938319


### Logistic Regression

In [26]:
def log_regression(train, target, features):
    # split dataset into x (features) and y (target)
    x_train = train[features]
    y_train = train[target]

    # identify model_type
    model_type = 'logistic regression'

    # set hyperparameter ranges
    parameter_space = {'C': [.001, .01, .1, 1, 10, 100, 1000]}

    # create the classifier
    clf = LogisticRegression()

    # define scoring methods
    scoring = {'recall': make_scorer(sk.metrics.accuracy_score),
               'precision': make_scorer(sk.metrics.precision_score),
               'accuracy': make_scorer(sk.metrics.accuracy_score),
               'f1_score': make_scorer(sk.metrics.f1_score)}

    # create and fit the GridSearchCV object
    grid = GridSearchCV(clf, parameter_space, cv=5, 
                        scoring=scoring,
                        refit='recall')
    grid.fit(x_train, y_train)


    # get results and store as dataframe

    results = grid.cv_results_

    params = results['params']
    accuracy = results['mean_test_accuracy'] 
    recall = results['mean_test_recall']
    precision = results['mean_test_precision']
    F1_score = results['mean_test_f1_score']

    for par, acc, rec, prec, f1 in zip(params, accuracy, recall, precision, F1_score):
        par['model_type'] = model_type
        par['features'] = features
        par['accuracy'] = acc
        par['recall'] = rec
        par['precision'] = prec
        par['F1_score'] = f1

    log_regression_results = pd.DataFrame(params)
    
    return log_regression_results


In [27]:
log_regression(train, target, features)

Unnamed: 0,C,model_type,features,accuracy,recall,precision,F1_score
0,0.001,logistic regression,"[built_last_two_years, previously_listed, scal...",0.792472,0.792472,0.0,0.0
1,0.01,logistic regression,"[built_last_two_years, previously_listed, scal...",0.971354,0.971354,0.911647,0.93261
2,0.1,logistic regression,"[built_last_two_years, previously_listed, scal...",0.974016,0.974016,0.901575,0.940145
3,1.0,logistic regression,"[built_last_two_years, previously_listed, scal...",0.974016,0.974016,0.901575,0.940145
4,10.0,logistic regression,"[built_last_two_years, previously_listed, scal...",0.973683,0.973683,0.901425,0.939331
5,100.0,logistic regression,"[built_last_two_years, previously_listed, scal...",0.973683,0.973683,0.901425,0.939331
6,1000.0,logistic regression,"[built_last_two_years, previously_listed, scal...",0.973683,0.973683,0.901425,0.939331
