In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
import acquire

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
zillow = acquire.get_zillow_data()

In [4]:
# Get a peak of the dataframe
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77574 entries, 0 to 77573
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77574 non-null  int64  
 1   parcelid                      77574 non-null  int64  
 2   airconditioningtypeid         25006 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77574 non-null  float64
 6   bedroomcnt                    77574 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49808 non-null  float64
 9   calculatedbathnbr             76959 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6035 non-null   float64
 12  calculatedfinishedsquarefeet  77374 non-null  float64
 13  f

In [11]:
# Create a function that will remove rows and columns that have missing values past a certain threshold.
def handle_missing_values(df, p_row = 0.84, p_col = 0.84):
    ''' function which takes in a dataframe, required notnull proportions of non-null rows and columns.
    drop the columns and rows columns based on theshold:'''
    
    #drop columns with nulls
    threshold = int(p_col * len(df.index)) # Require that many non-NA values.
    df.dropna(axis = 1, thresh = threshold, inplace = True)
    
    #drop rows with nulls
    threshold = int(p_row * len(df.columns)) # Require that many non-NA values.
    df.dropna(axis = 0, thresh = threshold, inplace = True)
    
    
    return df

In [12]:
b = handle_missing_values(b)

In [41]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77474 entries, 0 to 77573
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     77474 non-null  int64  
 1   parcelid               77474 non-null  int64  
 2   bathroomcnt            77474 non-null  float64
 3   bedroomcnt             77474 non-null  float64
 4   fips                   77474 non-null  float64
 5   latitude               77474 non-null  float64
 6   longitude              77474 non-null  float64
 7   lotsizesquarefeet      77474 non-null  float64
 8   roomcnt                77474 non-null  float64
 9   yearbuilt              77474 non-null  float64
 10  taxvaluedollarcnt      77474 non-null  float64
 11  assessmentyear         77474 non-null  float64
 12  landtaxvaluedollarcnt  77474 non-null  float64
 13  taxamount              77474 non-null  float64
 14  logerror               77474 non-null  float64
 15  tr

In [15]:
# Take a look at the amount of missing values.
b.isnull().sum()

id                                 0
parcelid                           0
bathroomcnt                        0
bedroomcnt                         0
calculatedbathnbr                515
calculatedfinishedsquarefeet     101
finishedsquarefeet12            3555
fips                               0
fullbathcnt                      515
latitude                           0
longitude                          0
lotsizesquarefeet               8174
propertycountylandusecode          0
propertylandusetypeid              0
rawcensustractandblock             0
regionidcity                    1460
regionidcounty                     0
regionidzip                       45
roomcnt                            0
yearbuilt                        169
structuretaxvaluedollarcnt        99
taxvaluedollarcnt                  1
assessmentyear                     0
landtaxvaluedollarcnt              1
taxamount                          5
censustractandblock              226
logerror                           0
t

In [23]:
# Create a list of columns to drop.
columns_to_drop = ['calculatedbathnbr','calculatedfinishedsquarefeet','finishedsquarefeet12','fullbathcnt','propertycountylandusecode','propertylandusetypeid','rawcensustractandblock','regionidcity','regionidcounty','regionidzip','structuretaxvaluedollarcnt','censustractandblock','propertylandusedesc']

In [20]:
def drop_columns(df, drop_col):
    df = df.drop(columns=drop_col)
    return df

In [25]:
b = drop_columns(b, columns_to_drop)

In [28]:
b.isna().sum()

id                          0
parcelid                    0
bathroomcnt                 0
bedroomcnt                  0
fips                        0
latitude                    0
longitude                   0
lotsizesquarefeet        8174
roomcnt                     0
yearbuilt                 169
taxvaluedollarcnt           1
assessmentyear              0
landtaxvaluedollarcnt       1
taxamount                   5
logerror                    0
transactiondate             0
dtype: int64

In [None]:
# Fill all missing values with the mean of the column.
df['embark_town'] = df.embark_town.fillna(value='Southampton')

In [39]:
for col in b.columns:
    if b[col].isna().sum() > 0:
        b[col] = b[col].fillna(value = b[col].mean())
        print(b[col])

0          4506.0
1         12647.0
2          8432.0
3         13038.0
4        278581.0
           ...   
77569     59487.0
77570     47405.0
77571     12105.0
77572      5074.0
77573      6347.0
Name: lotsizesquarefeet, Length: 77474, dtype: float64
0        1998.0
1        1967.0
2        1962.0
3        1970.0
4        1964.0
          ...  
77569    1980.0
77570    1940.0
77571    1964.0
77572    1954.0
77573    1955.0
Name: yearbuilt, Length: 77474, dtype: float64
0        1023282.0
1         464000.0
2         564778.0
3         145143.0
4         119407.0
           ...    
77569     379000.0
77570     354621.0
77571      67205.0
77572      49546.0
77573     522000.0
Name: taxvaluedollarcnt, Length: 77474, dtype: float64
0        537569.0
1        376000.0
2        479489.0
3         36225.0
4         45726.0
           ...   
77569    114000.0
77570    283704.0
77571     16522.0
77572     16749.0
77573    382000.0
Name: landtaxvaluedollarcnt, Length: 77474, dtype: float64
0  

In [40]:
b.isna().sum()

id                       0
parcelid                 0
bathroomcnt              0
bedroomcnt               0
fips                     0
latitude                 0
longitude                0
lotsizesquarefeet        0
roomcnt                  0
yearbuilt                0
taxvaluedollarcnt        0
assessmentyear           0
landtaxvaluedollarcnt    0
taxamount                0
logerror                 0
transactiondate          0
dtype: int64