In [66]:
# Standard imports
import numpy as np
import pandas as pd
import plotly.express as px

# Let pandas render up to 200 columns so wide frames are not truncated.
pd.options.display.max_columns = 200

In [67]:
# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [68]:
# Path to the dwellings CSV, relative to the notebook's working directory.
file = "./Data/dwellings_ml_ready.csv"

# Read the CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,parcel,abstrprd,livearea,finbsmnt,basement,yrbuilt,totunits,stories,nocars,numbdrm,numbaths,sprice,deduct,netprice,tasp,smonth,syear,condition_AVG,condition_Excel,condition_Fair,condition_Good,condition_VGood,quality_A,quality_B,quality_C,quality_D,quality_X,gartype_Att,gartype_Att/Det,gartype_CP,gartype_Det,gartype_None,gartype_att/CP,gartype_det/CP,arcstyle_BI-LEVEL,arcstyle_CONVERSIONS,arcstyle_END UNIT,arcstyle_MIDDLE UNIT,arcstyle_ONE AND HALF-STORY,arcstyle_ONE-STORY,arcstyle_SPLIT LEVEL,arcstyle_THREE-STORY,arcstyle_TRI-LEVEL,arcstyle_TRI-LEVEL WITH BASEMENT,arcstyle_TWO AND HALF-STORY,arcstyle_TWO-STORY,qualified_Q,qualified_U,status_I,status_V,before1980
0,00102-08-065-065,1130,1346,0,0,2004,1,2,2,2,2,100000,0,100000,100000,2,2012,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
1,00102-08-073-073,1130,1249,0,0,2005,1,1,1,2,2,94700,0,94700,94700,4,2011,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,00102-08-078-078,1130,1346,0,0,2005,1,2,1,2,2,89500,0,89500,89500,10,2010,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
3,00102-08-081-081,1130,1146,0,0,2005,1,1,0,2,2,92000,3220,88780,88780,10,2011,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,00102-08-086-086,1130,1249,0,0,2005,1,1,1,2,2,74199,0,74199,74199,3,2012,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0


## Core Questions and Tasks

1. Create ***2-3*** charts that evaluate potential relationships between the home variables and `before1980`
   - Explain what you learn from the charts that could help a machine learning algorithm
2. Build a classification model labeling houses as being built *before 1980* or *during or after 1980*
   - Your goal is to reach or exceed 90% accuracy
   - Explain your final model choice^[Algorithm, Tuning Parameters, etc.] and describe what other models you tried
3. Justify your classification model by discussing the most important features selected by your model
   - This discussion should include a feature importance chart and a description of the features
4. Describe the quality of your classification model using ***2-3*** different evaluation metrics
   - You also need to explain how to interpret each of the evaluation metrics you use

## Deliverables

1. A short elevator pitch that highlights key values or metrics from the results
   - Describe these key insights in a way that hooks the reader into wanting to read more about your work. The writing style should be more technical with some creative elements
   - Do not summarize what you did
2. Answers to the ***Questions and Tasks***
   - Each answer should include a written description of your:
     - Results 
     - Code Cells^[With Comments]
     - Charts 
     - Tables

In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,abstrprd,livearea,finbsmnt,basement,yrbuilt,totunits,stories,nocars,numbdrm,numbaths,sprice,deduct,netprice,tasp,smonth,syear,condition_AVG,condition_Excel,condition_Fair,condition_Good,condition_VGood,quality_A,quality_B,quality_C,quality_D,quality_X,gartype_Att,gartype_Att/Det,gartype_CP,gartype_Det,gartype_None,gartype_att/CP,gartype_det/CP,arcstyle_BI-LEVEL,arcstyle_CONVERSIONS,arcstyle_END UNIT,arcstyle_MIDDLE UNIT,arcstyle_ONE AND HALF-STORY,arcstyle_ONE-STORY,arcstyle_SPLIT LEVEL,arcstyle_THREE-STORY,arcstyle_TRI-LEVEL,arcstyle_TRI-LEVEL WITH BASEMENT,arcstyle_TWO AND HALF-STORY,arcstyle_TWO-STORY,qualified_Q,qualified_U,status_I,status_V,before1980
count,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0,22913.0
mean,1118.980099,1510.783485,330.511936,524.501942,1963.685506,1.018461,1.419849,1.362589,2.522062,2.351416,561859.6,1336.94885,560522.7,563820.7,6.540348,2011.064723,0.777157,0.001091,4.4e-05,0.205822,0.015886,0.041461,0.25994,0.674595,0.017239,0.006765,0.372932,0.009165,0.003753,0.304412,0.304543,0.000655,0.004539,0.008161,0.005543,0.195391,0.133811,0.03618,0.375682,0.001658,0.001615,0.011827,0.01401,0.007201,0.208921,0.673853,0.326147,0.945926,0.054074,0.624929
std,152.603144,787.018142,469.912502,567.653028,36.9287,0.256309,0.568886,1.097619,0.884602,1.117339,2135231.0,3061.789547,2135373.0,2135272.0,3.268072,0.708803,0.416163,0.033014,0.006606,0.40431,0.125038,0.199359,0.438611,0.468536,0.130164,0.081971,0.483595,0.095297,0.061151,0.460168,0.460224,0.025578,0.06722,0.089973,0.074244,0.39651,0.340456,0.186743,0.484309,0.040691,0.040153,0.108111,0.117532,0.084555,0.406546,0.468812,0.468812,0.226169,0.226169,0.484152
min,100.0,293.0,0.0,0.0,1873.0,0.0,1.0,0.0,0.0,0.0,1000.0,0.0,1000.0,1000.0,1.0,2010.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1112.0,975.0,0.0,0.0,1940.0,1.0,1.0,0.0,2.0,2.0,134000.0,0.0,132000.0,132890.0,4.0,2011.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,1112.0,1302.0,0.0,401.0,1963.0,1.0,1.0,2.0,2.0,2.0,230000.0,0.0,229000.0,231000.0,6.0,2011.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,1114.0,1823.0,716.0,959.0,2002.0,1.0,2.0,2.0,3.0,3.0,378400.0,2383.0,375500.0,382184.0,9.0,2012.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
max,9279.0,21503.0,4320.0,9025.0,2013.0,10.0,4.0,17.0,9.0,11.0,21750000.0,101809.0,21750000.0,21750000.0,12.0,2012.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [70]:
# Alphabetically ordered list of every column name, echoed for reference.
df_cols = list(df.columns)
df_cols.sort()
df_cols

['abstrprd',
 'arcstyle_BI-LEVEL',
 'arcstyle_CONVERSIONS',
 'arcstyle_END UNIT',
 'arcstyle_MIDDLE UNIT',
 'arcstyle_ONE AND HALF-STORY',
 'arcstyle_ONE-STORY',
 'arcstyle_SPLIT LEVEL',
 'arcstyle_THREE-STORY',
 'arcstyle_TRI-LEVEL',
 'arcstyle_TRI-LEVEL WITH BASEMENT',
 'arcstyle_TWO AND HALF-STORY',
 'arcstyle_TWO-STORY',
 'basement',
 'before1980',
 'condition_AVG',
 'condition_Excel',
 'condition_Fair',
 'condition_Good',
 'condition_VGood',
 'deduct',
 'finbsmnt',
 'gartype_Att',
 'gartype_Att/Det',
 'gartype_CP',
 'gartype_Det',
 'gartype_None',
 'gartype_att/CP',
 'gartype_det/CP',
 'livearea',
 'netprice',
 'nocars',
 'numbaths',
 'numbdrm',
 'parcel',
 'qualified_Q',
 'qualified_U',
 'quality_A',
 'quality_B',
 'quality_C',
 'quality_D',
 'quality_X',
 'smonth',
 'sprice',
 'status_I',
 'status_V',
 'stories',
 'syear',
 'tasp',
 'totunits',
 'yrbuilt']

In [71]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

parcel                              object
abstrprd                             int64
livearea                             int64
finbsmnt                             int64
basement                             int64
yrbuilt                              int64
totunits                             int64
stories                              int64
nocars                               int64
numbdrm                              int64
numbaths                             int64
sprice                               int64
deduct                               int64
netprice                             int64
tasp                                 int64
smonth                               int64
syear                                int64
condition_AVG                        int64
condition_Excel                      int64
condition_Fair                       int64
condition_Good                       int64
condition_VGood                      int64
quality_A                            int64
quality_B  

In [72]:
def get_desired_columns(cols: list, reject_prefixes=None) -> list:
    """Return the column names whose prefix is not in the reject list.

    A column's prefix is the text before its first underscore (the whole name
    when it contains no underscore). Columns whose prefix matches one of
    ``reject_prefixes`` are dropped; all others are kept in their input order.

    Args:
        cols: Column names to filter.
        reject_prefixes: Iterable of prefixes to drop (a single string is
            treated as one prefix). Defaults to the category prefixes
            ``arcstyle``, ``condition``, ``gartype``, ``status``, ``quality``
            and ``qualified``.

    Returns:
        A new list containing the names that were kept.
    """
    if reject_prefixes is None:
        reject_prefixes = (
            "arcstyle",
            "condition",
            "gartype",
            "status",
            "quality",
            "qualified",
        )
    elif isinstance(reject_prefixes, str):
        # A bare string would otherwise be iterated character by character.
        reject_prefixes = (reject_prefixes,)

    rejected = frozenset(reject_prefixes)
    # Only the part before the first "_" decides whether a column is rejected.
    return [name for name in cols if name.split("_", 1)[0] not in rejected]


# Keep only the columns whose prefix is not one of the rejected category prefixes.
desired_columns = get_desired_columns(sorted(df_cols))

# Plot a 500-row random sample rather than every row. The fixed seed makes the
# sample, and everything derived from it, reproducible across reruns.
h_subset = df.filter(items=desired_columns).sample(500, random_state=42)

# `parcel` is a string identifier and `before1980` is already shown as the
# point colour, so neither is useful as a scatter-matrix axis.
plot_dimensions = [c for c in desired_columns if c not in ("parcel", "before1980")]

chart = px.scatter_matrix(
    h_subset,
    dimensions=plot_dimensions,
    color="before1980",
    title="Pairwise relationships between home variables, colored by before1980",
)
chart.update_traces(diagonal_visible=False)
chart.show()

In [73]:
# Pairwise Pearson correlations between the sampled features, target excluded.
# Only numeric columns are kept: `parcel` is a string identifier, and
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping them.
corr = h_subset.drop(columns="before1980").select_dtypes(include="number").corr()
corr





Unnamed: 0,abstrprd,basement,deduct,finbsmnt,livearea,netprice,nocars,numbaths,numbdrm,smonth,sprice,stories,syear,tasp,totunits,yrbuilt
abstrprd,1.0,-0.506393,-0.149453,-0.389532,-0.206515,0.120302,-0.617569,-0.283301,-0.476025,-0.021752,0.120191,-0.107429,0.012602,0.120358,-0.043992,0.203039
basement,-0.506393,1.0,-0.004886,0.818492,0.493339,-0.056453,0.546538,0.55997,0.37607,0.002101,-0.056462,0.086049,0.039256,-0.056455,0.027599,0.017799
deduct,-0.149453,-0.004886,1.0,0.027824,-0.106621,-0.098715,0.051338,-0.078554,0.00376,0.0039,-0.097917,-0.113339,-0.027441,-0.098519,-0.031316,-0.161655
finbsmnt,-0.389532,0.818492,0.027824,1.0,0.345856,-0.067956,0.430154,0.515997,0.22478,0.021993,-0.067939,-0.023999,0.034484,-0.067904,0.025489,-0.02724
livearea,-0.206515,0.493339,-0.106621,0.345856,1.0,0.133725,0.537383,0.7877,0.610988,0.020873,0.13365,0.556095,0.044626,0.133952,0.061558,0.311414
netprice,0.120302,-0.056453,-0.098715,-0.067956,0.133725,1.0,-0.027955,0.106169,0.073613,0.006756,1.0,0.065646,-0.091414,0.999997,-0.010395,0.20958
nocars,-0.617569,0.546538,0.051338,0.430154,0.537383,-0.027955,1.0,0.580765,0.483184,0.059979,-0.027916,0.329077,0.015329,-0.028019,0.018877,0.149508
numbaths,-0.283301,0.55997,-0.078554,0.515997,0.7877,0.106169,0.580765,1.0,0.588728,0.048269,0.106114,0.557081,0.044931,0.106328,0.034286,0.392254
numbdrm,-0.476025,0.37607,0.00376,0.22478,0.610988,0.073613,0.483184,0.588728,1.0,0.070235,0.073622,0.400312,-0.025738,0.073654,0.019222,0.127438
smonth,-0.021752,0.002101,0.0039,0.021993,0.020873,0.006756,0.059979,0.048269,0.070235,1.0,0.00676,0.022518,-0.633918,0.006941,0.039173,0.040738


In [74]:
# Heatmap of the correlation matrix. A diverging scale pinned to [-1, 1] puts
# zero correlation at the neutral midpoint, and two-decimal labels keep the
# cell annotations legible.
px.imshow(
    corr,
    text_auto=".2f",
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu_r",
    title="Correlation matrix of the sampled home variables",
)