In [1]:
#basic imports
import pandas as pd
import numpy as np
import os

# env import
from env import host, user, password

In [2]:
def get_connection(db, username=user, host=host, password=password):
    '''
    Builds the mysql+pymysql connection URL for the database named `db`.

    username, host and password default to the values imported from env.
    The pieces are interpolated as-is: nothing is URL-escaped here, so
    characters such as '@' or '/' in a credential end up raw in the URL.
    '''
    url = f'mysql+pymysql://{username}:{password}@{host}/{db}'
    return url
    
def new_zillow_data():
    '''
    Queries the zillow MySQL database and returns the result as a dataframe.

    The query is a SELECT * over properties_2017 joined to its lookup tables
    (air conditioning, architectural style, building class, heating system,
    land use, story and construction type), to predictions_2017 narrowed to
    the latest transactiondate per id, and to unique_properties. Only rows
    with a non-null latitude and longitude are kept.
    '''
    # NOTE(review): predictions_2017 is matched to properties_2017 on `id` --
    # confirm that `id` is the intended join key for these two tables.
    zillow_url = get_connection('zillow')
    zillow_query = '''  SELECT *
    FROM properties_2017
    LEFT OUTER JOIN airconditioningtype 
    USING (airconditioningtypeid)
    LEFT OUTER JOIN architecturalstyletype
    USING (architecturalstyletypeid)
    LEFT OUTER JOIN buildingclasstype 
    USING (buildingclasstypeid)
    LEFT OUTER JOIN heatingorsystemtype
    USING (heatingorsystemtypeid)
    LEFT OUTER JOIN predictions_2017
    USING (id)
    INNER JOIN (
    SELECT id, MAX(transactiondate) as last_trans_date 
    FROM predictions_2017
    GROUP BY id
    ) predictions ON predictions.id = properties_2017.id AND predictions_2017.transactiondate = predictions.last_trans_date
    LEFT OUTER JOIN propertylandusetype
    USING(propertylandusetypeid)
    LEFT OUTER JOIN storytype
    ON storytype.storytypeid = properties_2017.storytypeid
    LEFT OUTER JOIN typeconstructiontype
    ON typeconstructiontype.typeconstructiontypeid = properties_2017.typeconstructiontypeid
    JOIN unique_properties
    ON unique_properties.parcelid = properties_2017.parcelid
    WHERE latitude IS NOT NULL and longitude IS NOT NULL; '''
    return pd.read_sql(zillow_query, zillow_url)

def get_zillow_data():
    '''
    Returns the zillow dataframe, using zillow.csv as a local cache.

    When zillow.csv exists in the working directory it is read back (first
    column as the index). Otherwise the data is pulled with new_zillow_data(),
    written to zillow.csv, and returned.

    NOTE(review): a CSV round trip renames repeated column labels on read, so
    the cached frame may not have the same column names as a fresh query --
    confirm whether the query yields duplicate labels.
    '''
    if not os.path.isfile('zillow.csv'):
        # no cache yet: query the database and save the result for next time
        fresh = new_zillow_data()
        fresh.to_csv('zillow.csv')
        return fresh
    return pd.read_csv('zillow.csv', index_col=0)

def drop_nulls(df, prop_req_col = .5 , prop_req_row = .5, inplace = True):
    '''
    Drops columns, then rows, that do not hold enough non-null values.

    Parameters:
        df: dataframe to clean.
        prop_req_col: proportion of the rows that must be non-null in a
            column for that column to be kept.
        prop_req_row: proportion of the surviving columns that must be
            non-null in a row for that row to be kept.
        inplace: when True (the default) df itself is modified and returned;
            when False df is left untouched and a cleaned copy is returned.

    Returns the cleaned dataframe.
    '''
    # honor `inplace`: only touch the caller's frame when asked to
    cleaned = df if inplace else df.copy()

    # a column survives only with at least this many non-null values
    min_per_col = int(prop_req_col * len(cleaned.index))
    cleaned.dropna(axis = 1, thresh = min_per_col, inplace = True)

    # computed after the column pass, so it is based on the surviving columns
    min_per_row = int(prop_req_row * len(cleaned.columns))
    cleaned.dropna(axis = 0, thresh = min_per_row, inplace = True)
    return cleaned

def remove_outliers(df, k, col_list):
    ''' remove outliers from a list of columns in a dataframe 
        and returns that dataframe

        For each column in col_list, rows are kept only when the value lies
        strictly between q1 - k * iqr and q3 + k * iqr. Columns are processed
        in order, so each column's quartiles are computed on the rows that
        survived the previous columns. The input dataframe is not modified;
        an empty col_list returns df unchanged.
    '''

    for col in col_list:

        q1, q3 = df[col].quantile([.25, .75])  # get quartiles

        iqr = q3 - q1   # calculate interquartile range

        upper_bound = q3 + k * iqr   # get upper bound
        lower_bound = q1 - k * iqr   # get lower bound

        # reassign rather than return, so every column in col_list is filtered
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]

    return df
    
#def prep_zillow(df):
#'''Removes all outliers via remove_outliers and remove_outliers_2,
#drops all irrelevant columns, and drops columns and rows with less than 50% of their values present. Fills remaining null values,
#and drops duplicated columns brought in from MySQL. '''
    
    # brings in data from sql or csv file
    #df = get_zillow_data()
    
    # drops duplicated columns from MySql
    #df = df.loc[:,~df.columns.duplicated()]
    
    # Ensures we are only bringing in single use properties
    #single_use_codes = [261, 262, 263, 264, 265, 268, 273,275, 276, 279]
    #df = df[df['propertylandusetypeid'].isin(single_use_codes)]
    
    #Drops rows and columns that have more nulls than the threshold (50%)
    #df = drop_nulls(df, prop_req_col = .5 , prop_req_row = .5, inplace = True)
    
    #Drops columns I have deemed irrelevant (insert names here)
    #dropcols = ['id','propertycountylandusecode','rawcensustractandblock','unitcnt']

In [3]:
# Load the Zillow data: get_zillow_data() reads zillow.csv when it exists,
# otherwise it queries MySQL and writes that file.
df = get_zillow_data()

OperationalError: (pymysql.err.OperationalError) (2013, 'Lost connection to MySQL server during query ([Errno 54] Connection reset by peer)')
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Keep only the first occurrence of each column label; later columns that
# repeat an earlier label are dropped.
df = df.loc[:,~df.columns.duplicated()]

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# propertylandusetypeid values treated as single-use properties
single_use_codes = [261, 262, 263, 264, 265, 268, 273,275, 276, 279]
# .copy() makes the filtered frame independent of the original, so the later
# in-place dropna in drop_nulls does not raise SettingWithCopyWarning
df = df[df['propertylandusetypeid'].isin(single_use_codes)].copy()

In [None]:
df.shape

In [None]:
# Drop columns, then rows, with too few non-null values (a 0.5 proportion is
# required for each); drop_nulls modifies df in place and returns it.
df = drop_nulls(df, prop_req_col = .5 , prop_req_row = .5, inplace = True)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Column names to be passed to df.drop in the following cell.
dropcols = ['id','propertycountylandusecode','rawcensustractandblock','unitcnt']

In [None]:
# DataFrame.drop returns a new frame; assign it back so the columns are
# actually removed from df.
df = df.drop(columns=dropcols)

In [None]:
df.shape

In [None]:
# Filter IQR outliers with k = 1.5 via remove_outliers, passing the
# square-footage, bedroom-count and bathroom-count columns.
df = remove_outliers(df, 1.5, ['calculatedfinishedsquarefeet', 'bedroomcnt', 'bathroomcnt'])

In [None]:
df.shape