In [1]:
import pandas as pd

%matplotlib inline 

# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets the `.container` element to 99% width.
display(HTML("<style>.container { width:99% !important; }</style>"))

In [2]:
import sklearn

In [3]:
from sklearn.metrics import roc_auc_score, roc_curve

# Overview

Each row represents a home sale.

We try to predict both home sales and the type of each sale.

Rows where `audantic_target` = 1 are the type of transaction we are trying to predict.

```
'audantic_target': 1 if the sale is the type of transaction we are trying to predict, otherwise 0

'pid': property id
'did': sales document id
'fips': county id
'zipcode': zipcode
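'sfr': bool, whether the property is a single-family home (note: this column does not appear in the loaded data below, which has 'seller_occupied' instead)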

-- house details
'square_footage'
'year_built'
'estimated_value'
'length_of_ownership'

-- owner details
'est_household_income_val': estimated income
'mosaic_hh_val': demographic variable for home
'mosaic_zip4_val': demographic variable for neighborhood
'mosaic_diff': difference between home and neighborhood demographics
```

# Importing and investigating the data

In [4]:
# Load the dataset into `df`. The compression codec is passed explicitly
# instead of relying on pandas' extension-based inference.
df = pd.read_csv(
    'df_all_ll.csv.gzip',
    compression='gzip',
)

In [5]:
# Show the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,audantic_target,pid,did,fips,zipcode,seller_occupied,square_footage,year_built,estimated_value,length_of_ownership,...,cat_e_e,cat_e,cat_e_r,cat_e_s,cat_g,cat_h,cat_j,cat_m,cat_n,cat_s
0,0,369,603808466,36103,11946.0,0,1260,1997,418000,13.0294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,383,574522015,36103,11767.0,1,0,0,301000,10.7515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,960,605013654,36103,11704.0,1,0,0,263000,9.7029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,973,605452151,36103,11776.0,0,0,0,412000,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1136,602447312,36055,14586.0,1,1363,2000,172000,5.0021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# List every column name of the loaded frame as an array.
df.columns.values

array(['audantic_target', 'pid', 'did', 'fips', 'zipcode',
       'seller_occupied', 'square_footage', 'year_built',
       'estimated_value', 'length_of_ownership',
       'est_household_income_val', 'mosaic_hh_val', 'mosaic_zip4_val',
       'mosaic_diff', 'cat_a', 'cat_ce', 'cat_a_c', 'cat_a_i', 'cat_a_j',
       'cat_a_m', 'cat_a_sk', 'cat_e_b', 'cat_e_e', 'cat_e', 'cat_e_r',
       'cat_e_s', 'cat_g', 'cat_h', 'cat_j', 'cat_m', 'cat_n', 'cat_s'], dtype=object)

In [7]:
# Name of the 1/0 target column described in the Overview.
y_col = "audantic_target"

In [8]:
# Predictor columns: every column of `df` except the target and the
# pid / did / fips / zipcode columns. Order matches the column order of `df`.
X_cols = (
    # occupancy flag and house details
    [
        'seller_occupied',
        'square_footage',
        'year_built',
        'estimated_value',
        'length_of_ownership',
    ]
    # owner details: estimated income and demographic variables
    + [
        'est_household_income_val',
        'mosaic_hh_val',
        'mosaic_zip4_val',
        'mosaic_diff',
    ]
    # `cat_*` columns (appear to be 0/1 indicators; meaning not documented
    # in this notebook -- TODO confirm)
    + [
        'cat_a',
        'cat_ce',
        'cat_a_c',
        'cat_a_i',
        'cat_a_j',
        'cat_a_m',
        'cat_a_sk',
        'cat_e_b',
        'cat_e_e',
        'cat_e',
        'cat_e_r',
        'cat_e_s',
        'cat_g',
        'cat_h',
        'cat_j',
        'cat_m',
        'cat_n',
        'cat_s',
    ]
)

In [9]:
# Summary statistics for every predictor column, rounded to three decimals.
(
    df[X_cols]
    .describe()
    .round(3)
)

Unnamed: 0,seller_occupied,square_footage,year_built,estimated_value,length_of_ownership,est_household_income_val,mosaic_hh_val,mosaic_zip4_val,mosaic_diff,cat_a,...,cat_e_e,cat_e,cat_e_r,cat_e_s,cat_g,cat_h,cat_j,cat_m,cat_n,cat_s
count,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,...,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0,1233186.0
mean,0.581,1691.112,1827.824,309029.501,11.785,58345.574,41.564,35.86,-5.704,0.034,...,0.001,0.046,0.0,0.0,0.001,0.052,0.001,0.003,0.006,0.005
std,0.493,3085.879,516.699,249186.24,7.964,64498.681,25.046,23.495,18.843,0.181,...,0.032,0.21,0.004,0.012,0.024,0.222,0.031,0.054,0.077,0.069
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-71.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1160.0,1950.0,157000.0,3.822,0.0,19.0,16.0,-8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1557.0,1974.0,244000.0,11.672,40000.0,41.0,31.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,2119.0,1998.0,383000.0,20.0,87500.0,72.0,59.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,3241353.0,2018.0,3000000.0,66.875,300000.0,72.0,72.0,65.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Summary statistics of the target column, transposed so the statistics run
# across a single row and rounded to three decimals. The column is selected
# through `y_col` rather than a repeated string literal, so the target name
# is defined in one place only.
(
    df[[y_col]]
    .describe()
    .T
    .round(3)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
audantic_target,1233186.0,0.136,0.343,0.0,0.0,0.0,0.0,1.0


# Model Building

# Model Evaluation