In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

import acquire, summarize, wrangle_zillow, model

In [2]:
# Acquire the zillow dataset through the project-local `acquire` module.
# The bare `zillow.shape` on the last line displays its (rows, columns).

zillow = acquire.get_zillow_clustering()
zillow.shape

(77579, 69)

In [3]:
# Wrangle the zillow dataset. The helper's return value is unpacked into
# three objects, named here as the train / validate / test splits.

train, validate, test = wrangle_zillow.wrangle_zillow_clustering_2(zillow)

# Double-check the shape of each split dataset
train.shape, validate.shape, test.shape

((31028, 19), (13299, 19), (7823, 19))

In [4]:
# Print the column names, non-null counts and dtypes of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31028 entries, 71142 to 59426
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      31028 non-null  int64  
 1   bathroomcnt                   31028 non-null  float64
 2   bedroomcnt                    31028 non-null  float64
 3   buildingqualitytypeid         31028 non-null  float64
 4   calculatedfinishedsquarefeet  31028 non-null  float64
 5   finishedsquarefeet12          31028 non-null  float64
 6   fips                          31028 non-null  object 
 7   fullbathcnt                   31028 non-null  float64
 8   latitude                      31028 non-null  float64
 9   longitude                     31028 non-null  float64
 10  lotsizesquarefeet             31028 non-null  float64
 11  roomcnt                       31028 non-null  float64
 12  yearbuilt                     31028 non-null  float64
 1

In [7]:
# Preview the two columns (`fips`, `error_type`) that the masks for the
# 4 subgroups are built from.

train.loc[:, ['fips', 'error_type']].head()

Unnamed: 0,fips,error_type
71142,Ventura,1
11459,Los Angeles,1
54330,Orange,0
46830,Los Angeles,0
53521,Orange,0


In [11]:
# Split the training rows into four subgroups keyed on error_type and fips:
#   1: error_type 1, any county except Ventura
#   2: error_type 1, Ventura
#   3: error_type 0, Los Angeles
#   4: error_type 0, any county except Los Angeles
mask_1 = train.error_type.eq(1) & train.fips.ne('Ventura')
mask_2 = train.error_type.eq(1) & train.fips.eq('Ventura')
mask_3 = train.error_type.eq(0) & train.fips.eq('Los Angeles')
mask_4 = train.error_type.eq(0) & train.fips.ne('Los Angeles')

# Apply each boolean mask to get the corresponding subset of rows.
train_1, train_2, train_3, train_4 = (
    train.loc[mask] for mask in (mask_1, mask_2, mask_3, mask_4)
)

# Display the (rows, columns) of every subset.
tuple(subset.shape for subset in (train_1, train_2, train_3, train_4))

((16104, 19), (1487, 19), (8980, 19), (4457, 19))

In [12]:
# Count the training rows in every (error_type, fips) combination.
train.groupby(['error_type', 'fips']).size()

error_type  fips       
0           Los Angeles     8980
            Orange          3361
            Ventura         1096
1           Los Angeles    11144
            Orange          4960
            Ventura         1487
dtype: int64

In [20]:
# Persist subgroup 4 of the training split, then read it back as a sanity
# check. The file is written with index=False, so it contains only the data
# columns and must be read back WITHOUT index_col: passing index_col=0 would
# turn the first data column (parcelid) into the index and report one
# column fewer than train_4 actually has.
train_4.to_csv('train_4.csv', index=False)
pd.read_csv('train_4.csv').shape

(4457, 19)

In [22]:
# Split the validate rows into four subgroups keyed on error_type and fips:
#   1: error_type 1, any county except Ventura
#   2: error_type 1, Ventura
#   3: error_type 0, Los Angeles
#   4: error_type 0, any county except Los Angeles
mask_1 = validate.error_type.eq(1) & validate.fips.ne('Ventura')
mask_2 = validate.error_type.eq(1) & validate.fips.eq('Ventura')
mask_3 = validate.error_type.eq(0) & validate.fips.eq('Los Angeles')
mask_4 = validate.error_type.eq(0) & validate.fips.ne('Los Angeles')

# Apply each boolean mask to get the corresponding subset of rows.
validate_1, validate_2, validate_3, validate_4 = (
    validate.loc[mask] for mask in (mask_1, mask_2, mask_3, mask_4)
)

# Display the (rows, columns) of every subset.
tuple(
    subset.shape
    for subset in (validate_1, validate_2, validate_3, validate_4)
)

((6917, 19), (623, 19), (3838, 19), (1921, 19))

In [26]:
# Persist subgroup 4 of the validate split, then read it back as a sanity
# check. The file is written with index=False, so it contains only the data
# columns and must be read back WITHOUT index_col: passing index_col=0 would
# turn the first data column (parcelid) into the index and report one
# column fewer than validate_4 actually has.
validate_4.to_csv('validate_4.csv', index=False)
pd.read_csv('validate_4.csv').shape

(1921, 19)

In [27]:
# Split the test rows into four subgroups keyed on error_type and fips:
#   1: error_type 1, any county except Ventura
#   2: error_type 1, Ventura
#   3: error_type 0, Los Angeles
#   4: error_type 0, any county except Los Angeles
mask_1 = test.error_type.eq(1) & test.fips.ne('Ventura')
mask_2 = test.error_type.eq(1) & test.fips.eq('Ventura')
mask_3 = test.error_type.eq(0) & test.fips.eq('Los Angeles')
mask_4 = test.error_type.eq(0) & test.fips.ne('Los Angeles')

# Apply each boolean mask to get the corresponding subset of rows.
test_1, test_2, test_3, test_4 = (
    test.loc[mask] for mask in (mask_1, mask_2, mask_3, mask_4)
)

# Display the (rows, columns) of every subset.
tuple(subset.shape for subset in (test_1, test_2, test_3, test_4))

((4066, 19), (369, 19), (2206, 19), (1182, 19))

In [32]:
# Persist subgroup 4 of the test split, then read it back as a sanity
# check. The file is written with index=False, so it contains only the data
# columns and must be read back WITHOUT index_col: passing index_col=0 would
# turn the first data column (parcelid) into the index and report one
# column fewer than test_4 actually has.
test_4.to_csv('test_4.csv', index=False)
pd.read_csv('test_4.csv').shape

(1182, 19)