In [1]:
import pandas as pd
import datetime
from geopy.geocoders import Nominatim

In [2]:
house_file_path = "assets\house_price.csv"
used_columns_list = ['Land_Sale_Price', 'Total_sale_Price', 'Deed_Date', 'Assessed_Building_Value', 'Story_Height', 'HEATED_AREA',
       'UTILITIES', 'Remodeled_Year', 'BATH', 'BATH_FIXTURES', 'TYPE_AND_USE', 'PHYSICAL_ZIP_CODE', 'PHYSICAL_CITY', 'Street_Number', 
       'Street_Name', 'Street_Type', 'Planning_Jurisdiction']
wake_cities = ['APEX', 'CARY', 'FUQUAY VARINA', 'GARNER', 'HOLLY SPRINGS', 'KNIGHTDALE', 'MORRISVILLE', 'RALEIGH', 'ROLESVILLE', 'WAKE FOREST', 'WENDELL', 'ZEBULON']

column_dropna = ['UTILITIES', 'BATH', 'Story_Height']

update_location = {'5500 BATOUL LN, raleigh': '27606',
                   '1436 INDIGO CREEK DR, zebulon': '27597',
                   '1432 INDIGO CREEK DR, zebulon': '27597',
                   '1428 INDIGO CREEK DR, zebulon': '27597',
                   '1424 INDIGO CREEK DR, zebulon': '27597',
                   '1420 INDIGO CREEK DR, zebulon': '27597',
                   '1416 INDIGO CREEK DR, zebulon': '27597',
                   '1412 INDIGO CREEK DR, zebulon': '27597',
                   '1413 INDIGO CREEK DR, zebulon': '27597',
                   '1429 INDIGO CREEK DR, zebulon': '27597',
                   '1433 INDIGO CREEK DR, zebulon': '27597',
                   '1517 INDIGO CREEK DR, zebulon': '27597',
                   '1519 INDIGO CREEK DR, zebulon': '27597',
                   '1521 INDIGO CREEK DR, zebulon': '27597',
                   '1523 INDIGO CREEK DR, zebulon': '27597',
                   '1520 INDIGO CREEK DR, zebulon': '27597',
                   '1518 INDIGO CREEK DR, zebulon': '27597',
                   '1516 INDIGO CREEK DR, zebulon': '27597',
                   '529 BASIN HILL DR, wake forest': '27587',
                   '505 BASIN HILL DR, wake forest': '27587',
                   '501 BASIN HILL DR, wake forest': '27587'}


In [3]:
def read_csv(file_path, print_columns=False):
    """Read csv files and return a dataframe
    
    Args:
        file_path (str): path of the csv file
        print_columns (bool): print columns of the dataframe (default: False)
    """
    df = pd.read_csv(file_path)
    if print_columns:
        print(df.columns)
    print(df.shape)
    print("The data is loaded successfully!")
    return df
    
def used_columns_df(df, columns):
    """Keep only the used columns
    
    Args:
        df (dataframe): dataframe
        columns (list): list of columns
    """
    new_df = df[columns].copy()
    return new_df

def format_price_value(df, column, convert_type, replace=False):
    """Convert price value to float
    
    Args:
        df (dataframe): dataframe
        column (str): column name
        convert_type (str): convert type
        replace (bool): replace the original column or not (default: False)
    """
    if replace:
        df[column] = df[column].str.replace(',', '').astype(convert_type)
    df[column] = df[column].astype(convert_type)
    print(f"{column} is converted successfully")
    return df

def format_date(df, column, errors='coerce'):
    """Convert format of date
    
    Args:
        df (dataframe): dataframe
        column (str): column name
        errors (str): errors (default: 'coerce')
    """
    df[column] = pd.to_datetime(df[column], errors=errors)
    print(f"{column} is converted successfully")
    return df

def fill_drop_na(df, column, fill_zero=True):
    """Fill or drop na values
    
    Args:
        df (dataframe): dataframe
        column (str): column name
        fill_zero (bool): fill na as zero or not (default: True)
    """
    if column == 'Deed_Date':
        df.loc[df[column].isnull(), column] = df['Remodeled_Year']
    elif column in column_dropna:
        df.dropna(subset=[column], inplace=True)
        df.reset_index(drop=True, inplace=True)
        print(f"None value in {column} is dropped successfully")
        print("Index of the dataframe is reset successfully")
        return df
    else:
        if fill_zero:
            df[column] = df[column].fillna(0).astype(int)
    print(f"Number of nan values of {column} is {df[column].isnull().sum()}")
    return df

def remove_zero(df, column):
    """Remove zero values
    
    Args:
        df (dataframe): dataframe
        column (str): column name
    """
    df = df[df[column] != 0]
    print(f"Number of zero values of {column} is {df[column].eq(0).sum()}")
    return df

def convert_categorical_to_numeric_variables(df, variable):
    """Convert categorical variables to numeric variables
    
    Args:
        df (dataframe): dataframe
        variable (str): variable name
    """
    if variable == 'BATH':
        df.loc[df['BATH'] == 'A', 'BATH'] = 1
        df.loc[df['BATH'] == 'B', 'BATH'] = 1.5
        df.loc[df['BATH'] == 'C', 'BATH'] = 2
        df.loc[df['BATH'] == 'D', 'BATH'] = 2.5
        df.loc[df['BATH'] == 'E', 'BATH'] = 3
        df.loc[df['BATH'] == 'F', 'BATH'] = 3.5
        df.loc[df['BATH'] == 'G', 'BATH'] = 0
        df.loc[df['BATH'] == 'H', 'BATH'] = 0
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] <= 3), 'BATH'] = 0
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] <= 6), 'BATH'] = 1
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] == 7), 'BATH'] = 1.5
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] == 8), 'BATH'] = 2
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] == 9), 'BATH'] = 2.5
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] == 10), 'BATH'] = 3
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] == 11), 'BATH'] = 3.5
        df.loc[(df['BATH'] == 'I') & (df['BATH_FIXTURES'] > 11), 'BATH'] = 4
        
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] <= 3), 'BATH'] = 0
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] <= 6), 'BATH'] = 1
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] == 7), 'BATH'] = 1.5
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] == 8), 'BATH'] = 2
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] == 9), 'BATH'] = 2.5
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] == 10), 'BATH'] = 3
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] == 11), 'BATH'] = 3.5
        df.loc[(df['BATH'] == 'J') & (df['BATH_FIXTURES'] > 11), 'BATH'] = 4
        print("Bathroom number is converted successfully")
        return df

    elif variable == 'Story_Height':
        df.loc[df['Story_Height'] == 'A', 'Story_Height'] = 1
        df.loc[df['Story_Height'] == 'B', 'Story_Height'] = 1.5
        df.loc[df['Story_Height'] == 'C', 'Story_Height'] = 2
        df.loc[df['Story_Height'] == 'D', 'Story_Height'] = 2.5
        df.loc[df['Story_Height'] == 'E', 'Story_Height'] = 3
        df.loc[df['Story_Height'] == 'F', 'Story_Height'] = 3.5
        df.loc[df['Story_Height'] == 'G', 'Story_Height'] = 4
        df.loc[df['Story_Height'] == 'H', 'Story_Height'] = 5
        df.loc[df['Story_Height'] == 'I', 'Story_Height'] = 1.75
        df.loc[df['Story_Height'] == 'J', 'Story_Height'] = 1.4
        df.loc[df['Story_Height'] == 'K', 'Story_Height'] = 1.63
        df.loc[df['Story_Height'] == 'L', 'Story_Height'] = 1.88
        df.loc[df['Story_Height'] == 'M', 'Story_Height'] = 2.4
        df.loc[df['Story_Height'] == 'N', 'Story_Height'] = 2.63
        df.loc[df['Story_Height'] == 'O', 'Story_Height'] = 2.75
        print("Story height is converted successfully")
        return df
    
def filter_column(
                df, filter_column, 
                city_list=None, filter_date=None):
    """Filter dataframe by city and date
    
    Args:
        df (dataframe): dataframe
        filter_column (str): filter column
        city_list (list): list of cities (default: None)
        filter_date (str): filter date (default: None)
    """
    if filter_column == 'TYPE_AND_USE':
        # According to the U.S. Census Bureau, a single-family house is one that may be fully detached, semi-detached, a row house or a townhome. df.loc[df['column_name'].isin(some_values)]
        df = df.loc[df[filter_column].isin([1, 8])]
        print(f"{filter_column} is filtered successfully")
    elif filter_column == 'PHYSICAL_CITY':
        if city_list is None:
            print("Please provide city list")
            print("Stop filtering")
        else:
            df = df.drop(df[~df[filter_column].isin(city_list)].index)
            df[filter_column] = df[filter_column].str.lower()
            print(f"{filter_column} is filtered successfully")
    elif filter_column == 'Deed_Date':
        if filter_date is None:
            print("Please provide date")
            print("Stop filtering")
        else:
            df = df.loc[df[filter_column] > filter_date]
            df.drop(columns=['Remodeled_Year', ], inplace=True)
            df.reset_index(drop=True, inplace=True)
            print(f"{filter_column} is filtered successfully")
            print("Remodeled_Year is removed")
            print("Index of dataframe is reset")

    return df


In [4]:
def find_zipcode(df, update_location):
    df.loc[(df['PHYSICAL_ZIP_CODE'] == 0) & (df['Planning_Jurisdiction'] == 'CA'), 'PHYSICAL_CITY'] = 'cary'
    df.loc[(df['PHYSICAL_ZIP_CODE'] == 0) & (df['Planning_Jurisdiction'] == 'RA'), 'PHYSICAL_CITY'] = 'raleigh'
    df.loc[(df['PHYSICAL_ZIP_CODE'] == 0) & (df['Planning_Jurisdiction'] == 'WE'), 'PHYSICAL_CITY'] = 'wendell'
    df.loc[(df['PHYSICAL_ZIP_CODE'] == 0) & (df['Planning_Jurisdiction'] == 'ZB'), 'PHYSICAL_CITY'] = 'zebulon'
    df.loc[(df['PHYSICAL_ZIP_CODE'] == 0) & (df['Planning_Jurisdiction'] == 'WF'), 'PHYSICAL_CITY'] = 'wake forest'
    df['address'] = df['Street_Number'].astype(str) + " " + df['Street_Name'] + " " + \
        df['Street_Type'] + ", " + df['PHYSICAL_CITY']

    geolocator = Nominatim(user_agent="myGeocoder")

    addresses = df[df['PHYSICAL_ZIP_CODE'] == 0]['address'].tolist()
    location_dict = {}
    location_lst = []
    for address in addresses:
        location_dict[address] = None
        location = geolocator.geocode(address)
        if location != None:
            location_dict[address] = location.address.split(',')[-2]
        else: 
            print(f"{address} is not found")
    non_location = {}
    for address, zip in location_dict.items():
        if location_dict[address] is None:
            # print(address)
            non_location[address] = None
    location_dict.update(update_location)

    for key, value in location_dict.items():
        df.loc[df['address'] == key, 'PHYSICAL_ZIP_CODE'] = value
    print("ALL missing zipcodes are found successfully")
    return df

## Read file

In [5]:
# read csv file
house_price_df = read_csv(house_file_path, print_columns=False)

  df = pd.read_csv(file_path)


(432976, 87)
The data is loaded successfully!


In [6]:
updated_house_price = used_columns_df(house_price_df, used_columns_list)
updated_house_price = format_date(updated_house_price, 'Remodeled_Year')
updated_house_price = format_date(updated_house_price, 'Deed_Date')
updated_house_price = fill_drop_na(updated_house_price, 'PHYSICAL_ZIP_CODE')
updated_house_price = fill_drop_na(updated_house_price, 'HEATED_AREA')
updated_house_price = format_price_value(updated_house_price, 'Land_Sale_Price', 
                                        'float', replace=True)
updated_house_price = format_price_value(updated_house_price, 'Total_sale_Price', 
                                        'float', replace=True)
updated_house_price = format_price_value(updated_house_price, 'Assessed_Building_Value', 
                                        'float', replace=True)

Remodeled_Year is converted successfully
Deed_Date is converted successfully
Number of nan values of PHYSICAL_ZIP_CODE is 0
Number of nan values of HEATED_AREA is 0
Land_Sale_Price is converted successfully
Total_sale_Price is converted successfully
Assessed_Building_Value is converted successfully


In [7]:
updated_house_price = convert_categorical_to_numeric_variables(updated_house_price, 'BATH')
updated_house_price = convert_categorical_to_numeric_variables(updated_house_price, 'Story_Height')

updated_house_price = filter_column(updated_house_price, 'TYPE_AND_USE')
updated_house_price = filter_column(updated_house_price, 'PHYSICAL_CITY', wake_cities)
print(updated_house_price.shape)
# updated_house_price.head()

Bathroom number is converted successfully
Story height is converted successfully
TYPE_AND_USE is filtered successfully
PHYSICAL_CITY is filtered successfully
(335542, 17)


In [8]:
updated_house_price = remove_zero(updated_house_price, 'Total_sale_Price')
updated_house_price = filter_column(updated_house_price, 'Deed_Date', filter_date="2000-01-01")
updated_house_price = fill_drop_na(updated_house_price, 'UTILITIES')
updated_house_price = fill_drop_na(updated_house_price, 'BATH')
updated_house_price = fill_drop_na(updated_house_price, 'Story_Height')
print(updated_house_price.shape)
# updated_house_price

Number of zero values of Total_sale_Price is 0
Deed_Date is filtered successfully
Remodeled_Year is removed
Index of dataframe is reset
None value in UTILITIES is dropped successfully
Index of the dataframe is reset successfully
None value in BATH is dropped successfully
Index of the dataframe is reset successfully
None value in Story_Height is dropped successfully
Index of the dataframe is reset successfully
(278913, 16)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Remodeled_Year', ], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=[column], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=[column], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=[column], inplace

## Fill out zero zip code values

In [9]:
updated_house_price = find_zipcode(updated_house_price, update_location)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['address'] = df['Street_Number'].astype(str) + " " + df['Street_Name'] + " " + \


5500 BATOUL LN, raleigh is not found
1436 INDIGO CREEK DR, zebulon is not found
1432 INDIGO CREEK DR, zebulon is not found
1428 INDIGO CREEK DR, zebulon is not found
1424 INDIGO CREEK DR, zebulon is not found
1420 INDIGO CREEK DR, zebulon is not found
1416 INDIGO CREEK DR, zebulon is not found
1412 INDIGO CREEK DR, zebulon is not found
1413 INDIGO CREEK DR, zebulon is not found
1429 INDIGO CREEK DR, zebulon is not found
1433 INDIGO CREEK DR, zebulon is not found
1517 INDIGO CREEK DR, zebulon is not found
1519 INDIGO CREEK DR, zebulon is not found
1521 INDIGO CREEK DR, zebulon is not found
1523 INDIGO CREEK DR, zebulon is not found
1520 INDIGO CREEK DR, zebulon is not found
1518 INDIGO CREEK DR, zebulon is not found
1516 INDIGO CREEK DR, zebulon is not found
529 BASIN HILL DR, wake forest is not found
505 BASIN HILL DR, wake forest is not found
501 BASIN HILL DR, wake forest is not found
ALL missing zipcodes are found successfully


## Check null values

In [10]:
updated_house_price.isna().sum()

Land_Sale_Price              0
Total_sale_Price             0
Deed_Date                    0
Assessed_Building_Value      0
Story_Height                 0
HEATED_AREA                  0
UTILITIES                    0
BATH                         0
BATH_FIXTURES                0
TYPE_AND_USE                 0
PHYSICAL_ZIP_CODE            0
PHYSICAL_CITY                0
Street_Number                0
Street_Name                  0
Street_Type                929
Planning_Jurisdiction        0
address                    929
dtype: int64

## Save as a CSV file

In [11]:
updated_house_price.to_csv('assets/updated_house_price.csv')