In [1]:
import numpy as np
import pandas as pd
import os

from get_db_url import get_db_url

As a Codeup data science graduate, you want to show off your skills to the Zillow data science team in hopes of getting an interview for a position you saw pop up on LinkedIn. You thought it might look impressive to build an end-to-end project in which you use some of their Kaggle data to predict property values using some of their available features; who knows, you might even do some feature engineering to blow them away. Your goal is to predict the values of single unit properties using the observations from 2017.

# 1

Acquire bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, and fips from the zillow database for all 'Single Family Residential' properties.

In [2]:
# Pull the raw 'Single Family Residential' records.
# The query takes a while to run, so the result is cached in zillow.csv and
# re-used on later runs. Delete the file to force a fresh pull.
sql = '''
SELECT
    bedroomcnt,
    bathroomcnt,
    calculatedfinishedsquarefeet,
    taxvaluedollarcnt,
    yearbuilt,
    taxamount,
    fips,
    propertylandusedesc
FROM properties_2017
JOIN propertylandusetype
    ON propertylandusetype.propertylandusetypeid = properties_2017.propertylandusetypeid
    AND propertylandusetype.propertylandusedesc = 'Single Family Residential';
'''

if os.path.exists('zillow.csv'):
    properties = pd.read_csv('zillow.csv')
else:
    properties = pd.read_sql(sql, get_db_url('zillow'))
    properties.to_csv('zillow.csv', index = False)

In [3]:
properties.shape

(2152863, 8)

In [4]:
properties.describe()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
count,2152852.0,2152852.0,2144379.0,2152370.0,2143526.0,2148421.0,2152863.0
mean,3.287196,2.230688,1862.855,461896.2,1960.95,5634.866,6048.377
std,0.9547544,0.9992796,1222.125,699676.0,22.1622,8178.91,20.43329
min,0.0,0.0,1.0,1.0,1801.0,1.85,6037.0
25%,3.0,2.0,1257.0,188170.2,1949.0,2534.98,6037.0
50%,3.0,2.0,1623.0,327671.0,1958.0,4108.95,6037.0
75%,4.0,3.0,2208.0,534527.0,1976.0,6414.32,6059.0
max,25.0,32.0,952576.0,98428910.0,2016.0,1337756.0,6111.0


In [5]:
properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 8 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
 7   propertylandusedesc           object 
dtypes: float64(7), object(1)
memory usage: 131.4+ MB


In [6]:
properties.isnull().sum()

bedroomcnt                        11
bathroomcnt                       11
calculatedfinishedsquarefeet    8484
taxvaluedollarcnt                493
yearbuilt                       9337
taxamount                       4442
fips                               0
propertylandusedesc                0
dtype: int64

In [7]:
# Let's cache this because that query took a while to run
# properties.to_csv('zillow.csv', index = False)

# 2

Using your acquired Zillow data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.

In [8]:
# We have several columns with missing values. Let's see the proportion of missing values per column.
properties.isnull().mean()

bedroomcnt                      0.000005
bathroomcnt                     0.000005
calculatedfinishedsquarefeet    0.003941
taxvaluedollarcnt               0.000229
yearbuilt                       0.004337
taxamount                       0.002063
fips                            0.000000
propertylandusedesc             0.000000
dtype: float64

In [9]:
# The missing values make up only a small percentage of each column.
# Let's see how many rows have at least one missing value (count, proportion).
row_has_missing = properties.isna().any(axis = 1)
row_has_missing.sum(), row_has_missing.mean()

(12628, 0.005865677472277613)

Roughly 0.6% of the rows contain at least one missing value. Let's try to get a little more insight here.

In [10]:
# How many rows are missing two or more values (count, proportion)?
two_or_more_missing = properties.isna().sum(axis = 1).ge(2)
two_or_more_missing.sum(), two_or_more_missing.mean()

(8043, 0.0037359553301812517)

In [11]:
# Among the rows with at most one missing value, which columns are those values missing from?
rows_with_one_or_less_missing_values = properties.isna().sum(axis = 1).lt(2)
properties.loc[rows_with_one_or_less_missing_values].isna().sum()

bedroomcnt                         0
bathroomcnt                        0
calculatedfinishedsquarefeet     560
taxvaluedollarcnt                 18
yearbuilt                       1439
taxamount                       2568
fips                               0
propertylandusedesc                0
dtype: int64

We will need square footage for the first iteration of our model so we will remove all rows missing that value as well as any rows missing more than one value. We'll also remove rows missing taxvaluedollarcnt since this is our target variable. We will impute everything else.

In [12]:
# Keep only the rows that have at most one missing value.
properties = properties.loc[rows_with_one_or_less_missing_values]

# Square footage is needed by the first model, so discard rows without it.
rows_not_missing_square_feet = properties['calculatedfinishedsquarefeet'].notna()
properties = properties.loc[rows_not_missing_square_feet]

# taxvaluedollarcnt is the target variable, so discard rows without it.
rows_not_missing_taxvalue = properties['taxvaluedollarcnt'].notna()
properties = properties.loc[rows_not_missing_taxvalue]

In [13]:
properties.isnull().sum()

bedroomcnt                         0
bathroomcnt                        0
calculatedfinishedsquarefeet       0
taxvaluedollarcnt                  0
yearbuilt                       1439
taxamount                       2568
fips                               0
propertylandusedesc                0
dtype: int64

In [14]:
# We'll fill in yearbuilt with the mode.
properties.yearbuilt.mode()[0]

1955.0

In [15]:
# Fill the missing yearbuilt values with the mode, then cast the column to int.
# The result is assigned back explicitly: calling fillna(inplace = True) on a
# column pulled out of a filtered frame is chained assignment, which pandas
# warns about and which does nothing once Copy-on-Write is enabled.
properties = properties.assign(
    yearbuilt = properties['yearbuilt']
    .fillna(properties['yearbuilt'].mode()[0])
    .astype('int')
)

In [16]:
properties.isnull().sum()

bedroomcnt                         0
bathroomcnt                        0
calculatedfinishedsquarefeet       0
taxvaluedollarcnt                  0
yearbuilt                          0
taxamount                       2568
fips                               0
propertylandusedesc                0
dtype: int64

In [17]:
properties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2144242 entries, 4 to 2152862
Data columns (total 8 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     int64  
 5   taxamount                     float64
 6   fips                          float64
 7   propertylandusedesc           object 
dtypes: float64(6), int64(1), object(1)
memory usage: 147.2+ MB


In [18]:
# Finally let's fill in taxamount with the mean.
properties.taxamount.mean()

5616.12711389353

In [19]:
# Assign the filled column back explicitly; an in-place fillna on a column
# pulled out of the frame is chained assignment and is not guaranteed to
# update `properties` (it is a no-op under pandas Copy-on-Write).
properties = properties.assign(
    taxamount = properties['taxamount'].fillna(properties['taxamount'].mean())
)

In [20]:
properties.isnull().sum()

bedroomcnt                      0
bathroomcnt                     0
calculatedfinishedsquarefeet    0
taxvaluedollarcnt               0
yearbuilt                       0
taxamount                       0
fips                            0
propertylandusedesc             0
dtype: int64

In [21]:
properties.shape

(2144242, 8)

All done!

# 3

Store all of the necessary functions to automate your process from acquiring the data to returning a cleaned dataframe with no missing values in your wrangle.py file. Name your final function wrangle_zillow.

In [22]:
# These are the acquisition functions

def get_zillow_data(use_cache: bool = True, cache_path: str = 'zillow.csv') -> pd.DataFrame:
    '''
        Return a dataframe containing data from the zillow properties dataset.

        When use_cache is True and the cache file exists, the data is read
        from that csv file. Otherwise the data is read from the MySQL
        database and written to the cache file, replacing any existing file
        at that path.

        Parameters
        ----------
        use_cache: bool, default True
            If True the dataset will be retrieved from the csv file if one
            exists, otherwise, it will be retrieved from the MySQL database.
            If False the dataset will be retrieved from the MySQL database
            even if the csv file exists.
        cache_path: str, default 'zillow.csv'
            Path of the csv file used as the cache. A relative path is
            resolved against the current working directory.

        Returns
        -------
        DataFrame: A Pandas DataFrame containing the data from the zillow
            dataset is returned.
    '''

    # Serve from the cache when allowed and available.
    if use_cache and os.path.exists(cache_path):
        return pd.read_csv(cache_path)

    # Otherwise query the database and refresh the cache for the next call.
    df = pd.read_sql(_get_zillow_sql(), get_db_url('zillow'))
    df.to_csv(cache_path, index = False)
    return df
    
def _get_zillow_sql() -> str:
    '''
        Build the query used to pull the zillow dataset from MySQL.

        Returns
        -------
        str: A SELECT statement over properties_2017 joined to
            propertylandusetype, restricted to rows whose
            propertylandusedesc is 'Single Family Residential'.
    '''

    zillow_query = '''
        SELECT
            bedroomcnt,
            bathroomcnt,
            calculatedfinishedsquarefeet,
            taxvaluedollarcnt,
            yearbuilt,
            taxamount,
            fips,
            propertylandusedesc
        FROM properties_2017
        JOIN propertylandusetype
            ON propertylandusetype.propertylandusetypeid = properties_2017.propertylandusetypeid
            AND propertylandusetype.propertylandusedesc = 'Single Family Residential';
    '''
    return zillow_query

In [27]:
# This is the prepare function

def prepare_zillow_data(df: pd.DataFrame) -> pd.DataFrame:
    '''
        Returns a prepared zillow dataset with all missing values handled.

        Rows are dropped when they are missing two or more values, or when
        they are missing calculatedfinishedsquarefeet or taxvaluedollarcnt.
        Missing yearbuilt values are filled with the column mode and the
        column is cast to int. Missing taxamount values are filled with the
        column mean. The original row index is preserved and the frame that
        was passed in is not modified.

        Parameters
        ----------
        df: DataFrame
            A pandas dataframe containing the unprepared zillow dataset.

        Returns
        -------
        DataFrame: A pandas dataframe containing the prepared zillow dataset.

        Raises
        ------
        ValueError: If rows remain after filtering but none of them has a
            yearbuilt value, so there is no mode to fill with and the cast
            to int fails.
    '''

    # The three row filters are row-wise predicates, so they can be combined
    # into a single mask evaluated against the incoming frame.
    missing_per_row = df.isnull().sum(axis = 1)
    rows_to_keep = (
        (missing_per_row < 2)
        & df['calculatedfinishedsquarefeet'].notnull()
        & df['taxvaluedollarcnt'].notnull()
    )

    # Work on an explicit copy so the assignments below neither touch the
    # caller's frame nor depend on chained in-place updates, which do nothing
    # under pandas Copy-on-Write.
    prepared = df.loc[rows_to_keep].copy()

    # Fill in yearbuilt column with the mode and cast to int.
    # mode() is empty when no yearbuilt value is present (e.g. empty input),
    # in which case there is nothing to fill with.
    yearbuilt_modes = prepared['yearbuilt'].mode()
    if not yearbuilt_modes.empty:
        prepared['yearbuilt'] = prepared['yearbuilt'].fillna(yearbuilt_modes.iloc[0])
    prepared['yearbuilt'] = prepared['yearbuilt'].astype('int')

    # Fill in the taxamount column with the mean
    prepared['taxamount'] = prepared['taxamount'].fillna(prepared['taxamount'].mean())

    return prepared

In [28]:
# Now let's bring it all together

def wrangle_zillow(use_cache: bool = True) -> pd.DataFrame:
    '''
        Returns the acquired and prepared zillow dataset.

        Parameters
        ----------
        use_cache: bool, default True
            Forwarded to get_zillow_data. Pass False to re-acquire the data
            instead of reading a previously cached copy.

        Returns
        -------
        DataFrame: A pandas dataframe containing the prepared zillow dataset.
    '''

    raw_properties = get_zillow_data(use_cache = use_cache)
    return prepare_zillow_data(raw_properties)

In [29]:
# Let's test it
df = wrangle_zillow()

In [32]:
df.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2144242 entries, 4 to 2152862
Data columns (total 8 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2144242 non-null  float64
 1   bathroomcnt                   2144242 non-null  float64
 2   calculatedfinishedsquarefeet  2144242 non-null  float64
 3   taxvaluedollarcnt             2144242 non-null  float64
 4   yearbuilt                     2144242 non-null  int64  
 5   taxamount                     2144242 non-null  float64
 6   fips                          2144242 non-null  float64
 7   propertylandusedesc           2144242 non-null  object 
dtypes: float64(6), int64(1), object(1)
memory usage: 147.2+ MB
