1. Acquire data from MySQL using the Python module to connect and query. You will want to end with a single dataframe. Make sure to include the logerror and all available fields related to the properties. You will end up using all the tables in the database.

    - Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.
    - Only include properties with a transaction in 2017, and include only the last transaction for each property (so no duplicate property IDs), along with zestimate error and date of transaction.
    - Only include properties that include a latitude and longitude value.

In [15]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from util import get_db_url
import acquire
import summarize

In [16]:
df = acquire.get_zillow_data()

In [17]:
df.head()

Unnamed: 0,county,tax_rate,id,parcelid,airconditioningtypeid,airconditioningdesc,architecturalstyletypeid,architecturalstyledesc,basementsqft,bathroomcnt,...,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,typeconstructiontypeid,typeconstructiondesc,censustractandblock,transactiondate,logerror,transactions
0,Ventura,0.012225,1387261,17052889,,,,,,1.0,...,376000.0,5672.48,,,,,61110010000000.0,2017-01-01,0.055619,1
1,Ventura,0.010596,1447245,17143294,,,,,,2.0,...,132424.0,3508.1,,,,,61110050000000.0,2017-01-01,-0.020526,1
2,Ventura,0.011133,43675,17110996,,,,,,2.5,...,99028.0,2204.84,,,,,61110050000000.0,2017-01-02,0.008669,1
3,Ventura,0.012201,1327940,17153340,,,,,,3.5,...,522030.0,12738.54,,,,,61110040000000.0,2017-01-02,-0.09534,1
4,Ventura,0.014142,1431120,17153706,,,,,,2.5,...,239000.0,8442.56,,,,,61110040000000.0,2017-01-02,0.027817,1


2. Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)

In [4]:
df.shape

(77381, 72)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77381 entries, 0 to 77380
Data columns (total 72 columns):
county                          77381 non-null object
tax_rate                        77375 non-null float64
id                              77381 non-null int64
parcelid                        77381 non-null int64
airconditioningtypeid           24953 non-null float64
airconditioningdesc             24953 non-null object
architecturalstyletypeid        206 non-null float64
architecturalstyledesc          206 non-null object
basementsqft                    50 non-null float64
bathroomcnt                     77381 non-null float64
bedroomcnt                      77381 non-null float64
buildingclasstypeid             15 non-null float64
buildingclassdesc               15 non-null object
buildingqualitytypeid           49672 non-null float64
calculatedbathnbr               76772 non-null float64
calculatedfinishedsquarefeet    77185 non-null float64
decktypeid                      

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tax_rate,77375.0,0.01316904,0.005828083,9.372442e-05,0.0116599,0.01228447,0.01345336,0.81649
id,77381.0,1495139.0,860907.1,349.0,752070.0,1497932.0,2240535.0,2982274.0
parcelid,77381.0,13007150.0,3481346.0,10711860.0,11538300.0,12531570.0,14211830.0,167689300.0
airconditioningtypeid,24953.0,1.813289,2.967894,1.0,1.0,1.0,1.0,13.0
architecturalstyletypeid,206.0,7.38835,2.734542,2.0,7.0,7.0,7.0,21.0
basementsqft,50.0,679.72,689.7035,38.0,273.0,515.0,796.5,3560.0
bathroomcnt,77381.0,2.29913,0.9966507,0.0,2.0,2.0,3.0,18.0
bedroomcnt,77381.0,3.053489,1.139096,0.0,2.0,3.0,4.0,16.0
buildingclasstypeid,15.0,3.933333,0.2581989,3.0,4.0,4.0,4.0,4.0
buildingqualitytypeid,49672.0,6.534587,1.721953,1.0,6.0,6.0,8.0,12.0


In [7]:
# Looking at the datatypes of each column

In [8]:
df.dtypes

county                           object
tax_rate                        float64
id                                int64
parcelid                          int64
airconditioningtypeid           float64
airconditioningdesc              object
architecturalstyletypeid        float64
architecturalstyledesc           object
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid             float64
buildingclassdesc                object
buildingqualitytypeid           float64
calculatedbathnbr               float64
calculatedfinishedsquarefeet    float64
decktypeid                      float64
finishedfloor1squarefeet        float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15            float64
finishedsquarefeet50            float64
finishedsquarefeet6             float64
fips                            float64
state                            object


In [9]:
df.id.value_counts()

657407     1
1100492    1
1029681    1
1440602    1
133844     1
2800516    1
1315543    1
420568     1
2561795    1
1211098    1
659480     1
548826     1
1589981    1
2644702    1
1682144    1
478866     1
1231284    1
2153166    1
2159307    1
2720485    1
1374922    1
1011929    1
1078297    1
2042544    1
336561     1
1127091    1
727734     1
2056889    1
942844     1
2706108    1
          ..
163109     1
2647334    1
1469735    1
357654     1
1312042    1
2231595    1
2630958    1
2837404    1
2807088    1
1408279    1
154868     1
2592027    1
1285401    1
2069784    1
736525     1
220417     1
738562     1
490757     1
201992     1
1246474    1
2700556    1
2567439    1
1699095    1
1299728    1
1170705    1
1690899    1
1570068    1
1965333    1
648470     1
2392597    1
Name: id, Length: 77381, dtype: int64

In [10]:
# Count the null values in each column

In [11]:
df.isnull().sum()

county                              0
tax_rate                            6
id                                  0
parcelid                            0
airconditioningtypeid           52428
airconditioningdesc             52428
architecturalstyletypeid        77175
architecturalstyledesc          77175
basementsqft                    77331
bathroomcnt                         0
bedroomcnt                          0
buildingclasstypeid             77366
buildingclassdesc               77366
buildingqualitytypeid           27709
calculatedbathnbr                 609
calculatedfinishedsquarefeet      196
decktypeid                      76767
finishedfloor1squarefeet        71358
finishedsquarefeet12             3632
finishedsquarefeet13            77340
finishedsquarefeet15            74372
finishedsquarefeet50            71358
finishedsquarefeet6             76995
fips                                0
state                               0
fireplacecnt                    69105
fullbathcnt 

In [12]:
# Count the null values in each row

In [13]:
df.isnull().sum(axis=1)

0        33
1        33
2        33
3        32
4        32
5        33
6        33
7        34
8        33
9        33
10       32
11       33
12       33
13       34
14       31
15       32
16       33
17       33
18       32
19       29
20       32
21       30
22       29
23       34
24       34
25       31
26       32
27       33
28       33
29       33
         ..
77351    31
77352    34
77353    29
77354    33
77355    32
77356    34
77357    34
77358    35
77359    29
77360    35
77361    35
77362    34
77363    35
77364    34
77365    37
77366    35
77367    34
77368    29
77369    37
77370    33
77371    38
77372    35
77373    37
77374    37
77375    36
77376    38
77377    32
77378    36
77379    34
77380    37
Length: 77381, dtype: int64

In [14]:
summarize.df_summary(df)

--- Shape: (77381, 72)
--- Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77381 entries, 0 to 77380
Data columns (total 72 columns):
county                          77381 non-null object
tax_rate                        77375 non-null float64
id                              77381 non-null int64
parcelid                        77381 non-null int64
airconditioningtypeid           24953 non-null float64
airconditioningdesc             24953 non-null object
architecturalstyletypeid        206 non-null float64
architecturalstyledesc          206 non-null object
basementsqft                    50 non-null float64
bathroomcnt                     77381 non-null float64
bedroomcnt                      77381 non-null float64
buildingclasstypeid             15 non-null float64
buildingclassdesc               15 non-null object
buildingqualitytypeid           49672 non-null float64
calculatedbathnbr               76772 non-null float64
calculatedfinishedsquarefeet    77185 non-null float64


   num_cols_missing    pct_cols_missing  num_rows
0                23  31.944444444444443         2
1                24   33.33333333333333        13
2                25   34.72222222222222        24
3                26   36.11111111111111        65
4                27                37.5       316
5                28   38.88888888888889       455
6                29   40.27777777777778      5270
7                30   41.66666666666667      3455
8                31   43.05555555555556      9891
9                32   44.44444444444444     12578
10               33   45.83333333333333     14783
11               34   47.22222222222222     13326
12               35   48.61111111111111      5147
13               36                50.0      5776
14               37  51.388888888888886      3620
15               38   52.77777777777778      1926
16               39  54.166666666666664       285
17               40   55.55555555555556       230
18               41   56.94444444444444        29


3. Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an attribute name, the first column is the number of rows with missing values for that attribute, and the second column is the percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [21]:
# Per-column count of null values; the resulting Series is indexed by column name.
number_rows = df.isnull().sum()
number_rows

county                              0
tax_rate                            6
id                                  0
parcelid                            0
airconditioningtypeid           52428
airconditioningdesc             52428
architecturalstyletypeid        77175
architecturalstyledesc          77175
basementsqft                    77331
bathroomcnt                         0
bedroomcnt                          0
buildingclasstypeid             77366
buildingclassdesc               77366
buildingqualitytypeid           27709
calculatedbathnbr                 609
calculatedfinishedsquarefeet      196
decktypeid                      76767
finishedfloor1squarefeet        71358
finishedsquarefeet12             3632
finishedsquarefeet13            77340
finishedsquarefeet15            74372
finishedsquarefeet50            71358
finishedsquarefeet6             76995
fips                                0
state                               0
fireplacecnt                    69105
fullbathcnt 

In [22]:
rows = df.shape[0]
rows

77381

In [24]:
# Share of rows that are null for each column. This is a fraction between
# 0 and 1 (it is not multiplied by 100), despite the "pct" name.
pct_missing = number_rows/rows
pct_missing

county                          0.000000
tax_rate                        0.000078
id                              0.000000
parcelid                        0.000000
airconditioningtypeid           0.677531
airconditioningdesc             0.677531
architecturalstyletypeid        0.997338
architecturalstyledesc          0.997338
basementsqft                    0.999354
bathroomcnt                     0.000000
bedroomcnt                      0.000000
buildingclasstypeid             0.999806
buildingclassdesc               0.999806
buildingqualitytypeid           0.358085
calculatedbathnbr               0.007870
calculatedfinishedsquarefeet    0.002533
decktypeid                      0.992065
finishedfloor1squarefeet        0.922164
finishedsquarefeet12            0.046937
finishedsquarefeet13            0.999470
finishedsquarefeet15            0.961114
finishedsquarefeet50            0.922164
finishedsquarefeet6             0.995012
fips                            0.000000
state           

In [25]:
def nulls_by_col(df):
    """Summarize missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the attributes to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with:

        * ``num_rows_missing`` -- number of rows where that column is null.
        * ``pct_rows_missing`` -- ``num_rows_missing`` divided by the total
          row count. NOTE: this is a fraction in [0, 1]; it is not scaled
          to 0-100.
    """
    total_rows = len(df)
    missing_counts = df.isnull().sum()
    # Start from the counts and derive the share column from them so both
    # columns are guaranteed to share the same per-attribute index.
    summary = missing_counts.to_frame('num_rows_missing').assign(
        pct_rows_missing=lambda frame: frame['num_rows_missing'] / total_rows
    )
    return summary

In [26]:
nulls_by_col(df)

Unnamed: 0,num_rows_missing,pct_rows_missing
county,0,0.000000
tax_rate,6,0.000078
id,0,0.000000
parcelid,0,0.000000
airconditioningtypeid,52428,0.677531
airconditioningdesc,52428,0.677531
architecturalstyletypeid,77175,0.997338
architecturalstyledesc,77175,0.997338
basementsqft,77331,0.999354
bathroomcnt,0,0.000000


In [28]:
# Per-row count of null values: axis=1 sums the null flags across the columns of each row.
num_cols_missing = df.isnull().sum(axis=1)
num_cols_missing

0        33
1        33
2        33
3        32
4        32
5        33
6        33
7        34
8        33
9        33
10       32
11       33
12       33
13       34
14       31
15       32
16       33
17       33
18       32
19       29
20       32
21       30
22       29
23       34
24       34
25       31
26       32
27       33
28       33
29       33
         ..
77351    31
77352    34
77353    29
77354    33
77355    32
77356    34
77357    34
77358    35
77359    29
77360    35
77361    35
77362    34
77363    35
77364    34
77365    37
77366    35
77367    34
77368    29
77369    37
77370    33
77371    38
77372    35
77373    37
77374    37
77375    36
77376    38
77377    32
77378    36
77379    34
77380    37
Length: 77381, dtype: int64

In [29]:
# Percentage (0-100) of each row's columns that are null:
# per-row null count divided by the number of columns, times 100.
pct_cols_missing = df.isnull().sum(axis=1)/df.shape[1]*100
pct_cols_missing

0        45.833333
1        45.833333
2        45.833333
3        44.444444
4        44.444444
5        45.833333
6        45.833333
7        47.222222
8        45.833333
9        45.833333
10       44.444444
11       45.833333
12       45.833333
13       47.222222
14       43.055556
15       44.444444
16       45.833333
17       45.833333
18       44.444444
19       40.277778
20       44.444444
21       41.666667
22       40.277778
23       47.222222
24       47.222222
25       43.055556
26       44.444444
27       45.833333
28       45.833333
29       45.833333
           ...    
77351    43.055556
77352    47.222222
77353    40.277778
77354    45.833333
77355    44.444444
77356    47.222222
77357    47.222222
77358    48.611111
77359    40.277778
77360    48.611111
77361    48.611111
77362    47.222222
77363    48.611111
77364    47.222222
77365    51.388889
77366    48.611111
77367    47.222222
77368    40.277778
77369    51.388889
77370    45.833333
77371    52.777778
77372    48.

In [30]:
def nulls_by_row(df):
    """Summarize how many rows share each count of missing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are the observations to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per distinct number of missing columns, sorted ascending,
        with a fresh 0..n-1 index and the columns:

        * ``num_cols_missing`` -- number of null columns in a row (numeric).
        * ``pct_cols_missing`` -- that count as a percentage (0-100) of the
          total number of columns (numeric).
        * ``num_rows`` -- how many rows of ``df`` have that many nulls.
    """
    num_cols_missing = df.isnull().sum(axis=1)
    pct_cols_missing = num_cols_missing / df.shape[1] * 100
    per_row = pd.DataFrame({
        'num_cols_missing': num_cols_missing,
        'pct_cols_missing': pct_cols_missing,
    })
    # Count rows per group with size() and name the count column directly.
    # The previous version called rename(index=str, ...), which converted the
    # group keys to strings (so both key columns came back as object dtype),
    # and it located the count column by the literal name 'index', which is
    # wrong whenever the input frame has a named index.
    rows_missing = (
        per_row
        .groupby(['num_cols_missing', 'pct_cols_missing'])
        .size()
        .reset_index(name='num_rows')
    )
    return rows_missing

In [31]:
nulls_by_row(df)

Unnamed: 0,num_cols_missing,pct_cols_missing,num_rows
0,23,31.944444444444443,2
1,24,33.33333333333333,13
2,25,34.72222222222222,24
3,26,36.11111111111111,65
4,27,37.5,316
5,28,38.88888888888889,455
6,29,40.27777777777778,5270
7,30,41.66666666666667,3455
8,31,43.05555555555556,9891
9,32,44.44444444444444,12578
