Load required packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

plt.style.use('seaborn-white')
%matplotlib inline

Some plotting functions to help you plot the ROC curve

In [2]:
def auc_plotting_function(rate1, rate2, rate1_name, rate2_name, curve_name):
    """Plot ``rate2`` against ``rate1`` and report the area under the curve.

    Parameters
    ----------
    rate1 : array-like
        Values for the x axis (also passed to ``auc`` as x).
    rate2 : array-like
        Values for the y axis (also passed to ``auc`` as y).
    rate1_name : str
        Label for the x axis.
    rate2_name : str
        Label for the y axis.
    curve_name : str
        Name of the curve; used in the legend entry and the figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    AUC = auc(rate1, rate2)

    plt.figure(figsize=[11, 9])
    plt.plot(rate1, rate2, label=curve_name + ' (area = %0.2f)' % AUC, linewidth=4)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel(rate1_name, fontsize=18)
    plt.ylabel(rate2_name, fontsize=18)
    plt.title(curve_name + ' for house price > 200,000', fontsize=18)
    # Legend location strings are lowercase; 'Lower right' is not a valid value.
    plt.legend(loc='lower right')
    plt.show()
    
def plot_roc(y_true, y_score):
    """Compute the ROC curve for the given labels and scores and plot it.

    Parameters
    ----------
    y_true : array-like
        True binary labels, passed to ``roc_curve``.
    y_score : array-like
        Scores for the positive class, passed to ``roc_curve``.
    """
    # The third value returned by roc_curve (the thresholds) is not needed here.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_plotting_function(fpr, tpr, 'False Positive Rate', 'True Positive Rate', 'ROC')


# generic curve plotting function
# def auc_plotting_function(rate1, rate2, rate1_name, rate2_name, curve_name):
#     AUC = auc(rate1, rate2)
#     # Plot of a ROC curve for class 1 (has_cancer)
#     plt.figure(figsize=[11,9])
#     plt.plot(rate1, rate2, label=curve_name + ' (area = %0.2f)' % AUC, linewidth=4)
#     plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
#     plt.xlim([0.0, 1.0])
#     plt.ylim([0.0, 1.05])
#     plt.xlabel(rate1_name, fontsize=18)
#     plt.ylabel(rate2_name, fontsize=18)
#     plt.title(curve_name + ' for house price > 200,000', fontsize=18)
#     plt.legend(loc="lower right")
#     plt.show()

# # plot receiver operating characteristic curve
# def plot_roc(y_true, y_score):
#     fpr, tpr, _ = roc_curve(y_true, y_score)
#     auc_plotting_function(fpr, tpr, 'False Positive Rate', 'True Positive Rate', 'ROC')


Load the Sacramento housing data

In [3]:
# Read the Sacramento real-estate transactions CSV into a DataFrame.
sac = pd.read_csv(
    filepath_or_buffer='../assets/datasets/Sacramentorealestatetransactions.csv'
)

Create a binary variable where 1 indicates a house sold for over 200,000 and 0 indicates a house sold for 200,000 or less.

Subset the data to just contain the number of beds, baths, the sq ft, and the over 200k indicator variable.

In [12]:
# Binary indicator: 1 when the sale price is strictly above 200,000, else 0.
# The comparison is vectorized over the whole column instead of appending to a
# list row by row; the resulting 0/1 values are the same.
sac['over 200'] = (sac['price'] > 200000).astype(int)
sac.head()


Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude,over 200
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879,0
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028,0
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839,0
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146,0
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768,0


In [18]:
# Columns used as model inputs.
predictors = [
    'beds',
    'baths',
    'sq__ft',
]

# Column holding the binary over-200k indicator.
features = ['over 200']

Split your data into training and testing sets. The predictors are the beds, baths, and sq ft. The target is the over 200k class variable. Make the test size 33% (and optionally stratify by the over 200k class).

In [26]:
# LogisticRegression, train_test_split, numpy and pandas are imported in the
# notebook's first cell, so the imports are not repeated here.

# Model inputs: one column per predictor (beds, baths, sq__ft).
X = sac[predictors].values
# Target: the binary over-200k indicator as a 1-D array.
y = sac['over 200'].values

# Stratifying on the target keeps the class proportions the same in the
# training and testing sets. random_state acts as a seed, so the same records
# end up in the test set on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y,
    test_size=0.33,
    stratify=y,
    random_state=77,
)

Fit a logistic regression on the training data.

Print out the confusion matrix

In [None]:
#conmat = np.array(confusion_matrix(Y_test, Y_pred, labels=[1,0]))

#confusion = pd.DataFrame(conmat, index=['over_200k', 'under_200k'],
#                         columns=['predicted_over_200k','predicted_under_200k'])

#print(confusion)

Calculate the accuracy, precision, and recall. What can these three metrics tell you about your model?

Say as a real estate agent, I prioritize minimizing false positives (predicting a house will sell for over 200k when it actually sells for under) because false positives make me lose money.

Change the decision threshold to **lower the false positive rate** and then print out the new confusion matrix. What is the downside to lowering the false positive rate?

Plot the ROC curve using the plotting function provided.

In [None]:
# No earlier cell defines `logreg`, so create and fit the classifier on the
# training data here; otherwise this cell fails on a fresh kernel.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

# decision_function gives a continuous confidence score per test row, which
# roc_curve needs in order to sweep over thresholds.
Y_score = logreg.decision_function(X_test)
plot_roc(Y_test, Y_score)

Bonus: when might precision and recall be more useful than the ROC?

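Precision and recall are more useful when the positive class is rare. The ROC curve's false positive rate is computed relative to all actual negatives, so a large number of true negatives can make a model look good even when many of its positive predictions are wrong. Precision and recall ignore true negatives, so they reflect how well the model performs on the positive class itself.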