In [1]:
import pandas as pd
import datetime

In [2]:
house_file_path = "assets\house_price.csv"
used_columns_list = ['Land_Sale_Price', 'Total_sale_Price', 'Deed_Date', 'Assessed_Building_Value', 'Story_Height', 'HEATED_AREA',
       'UTILITIES', 'Remodeled_Year', 'BATH', 'BATH_FIXTURES', 'TYPE_AND_USE', 'PHYSICAL_ZIP_CODE', 'PHYSICAL_CITY']
wake_cities = ['APEX', 'CARY', 'FUQUAY VARINA', 'GARNER', 'HOLLY SPRINGS', 'KNIGHTDALE', 'MORRISVILLE', 'RALEIGH', 'ROLESVILLE', 'WAKE FOREST', 'WENDELL', 'ZEBULON']

In [3]:
def read_csv(file_path, print_columns=False):
    df = pd.read_csv(file_path)
    if print_columns:
        print(df.columns)
    print(df.shape)
    print("The data is loaded successfully!")
    return df
    
def used_columns_df(df, columns):
    new_df = df[columns].copy()
    return new_df

def format_price_value(df, column, convert_type, replace=False):
    if replace:
        df[column] = df[column].str.replace(',', '').astype(convert_type)
    df[column] = df[column].astype(convert_type)
    print(f"{column} is converted successfully")
    return df

def format_date(df, column, errors='coerce'):
    df[column] = pd.to_datetime(df[column], errors=errors)
    print(f"{column} is converted successfully")
    return df

def fill_drop_na(df, column, fill_zero=True):
    if column == 'Deed_Date':
        df.loc[df[column].isnull(), column] = df['Remodeled_Year']
    elif column in ['UTILITIES', 'BATH', 'Story_Height']:
        df.dropna(subset=[column], inplace=True)
        df.reset_index(drop=True, inplace=True)
        print(f"None value in {column} is dropped successfully")
        print("Index of the dataframe is reset successfully")
        return df
    else:
        if fill_zero:
            df[column] = df[column].fillna(0).astype(int)
    print(f"Number of nan values of {column} is {df[column].isnull().sum()}")
    return df

def remove_zero(df, column):
    df = df[df[column] != 0]
    print(f"Number of zero values of {column} is {df[column].eq(0).sum()}")
    return df

def convert_categorical_to_numeric_variables(df, variable):

    if variable == 'BATH':
        df.loc[df['BATH'] == 'A', 'BATH'] = 1
        df.loc[df['BATH'] == 'B', 'BATH'] = 1.5
        df.loc[df['BATH'] == 'C', 'BATH'] = 2
        df.loc[df['BATH'] == 'D', 'BATH'] = 2.5
        df.loc[df['BATH'] == 'E', 'BATH'] = 3
        df.loc[df['BATH'] == 'F', 'BATH'] = 3.5
        df.loc[df['BATH'] == 'G', 'BATH'] = 0
        df.loc[df['BATH'] == 'H', 'BATH'] = 0
        df.loc[df['BATH'] == 'I', 'BATH'] = 1
        df.loc[df['BATH'] == 'J', 'BATH'] = 4
        print("Bathroom number is converted successfully")
        return df

    elif variable == 'Story_Height':
        df.loc[df['Story_Height'] == 'A', 'Story_Height'] = 1
        df.loc[df['Story_Height'] == 'B', 'Story_Height'] = 1.5
        df.loc[df['Story_Height'] == 'C', 'Story_Height'] = 2
        df.loc[df['Story_Height'] == 'D', 'Story_Height'] = 2.5
        df.loc[df['Story_Height'] == 'E', 'Story_Height'] = 3
        df.loc[df['Story_Height'] == 'F', 'Story_Height'] = 3.5
        df.loc[df['Story_Height'] == 'G', 'Story_Height'] = 4
        df.loc[df['Story_Height'] == 'H', 'Story_Height'] = 5
        df.loc[df['Story_Height'] == 'I', 'Story_Height'] = 1.75
        df.loc[df['Story_Height'] == 'J', 'Story_Height'] = 1.4
        df.loc[df['Story_Height'] == 'K', 'Story_Height'] = 1.63
        df.loc[df['Story_Height'] == 'L', 'Story_Height'] = 1.88
        df.loc[df['Story_Height'] == 'M', 'Story_Height'] = 2.4
        df.loc[df['Story_Height'] == 'N', 'Story_Height'] = 2.63
        df.loc[df['Story_Height'] == 'O', 'Story_Height'] = 2.75
        print("Story height is converted successfully")
        return df
    
def filter_column(df, filter_column, city_list=None, filter_date=None):
    if filter_column == 'TYPE_AND_USE':
        # According to the U.S. Census Bureau, a single-family house is one that may be fully detached, semi-detached, a row house or a townhome. df.loc[df['column_name'].isin(some_values)]
        df = df.loc[df[filter_column].isin([1, 8])]
        print(f"{filter_column} is filtered successfully")
        return df
    elif filter_column == 'PHYSICAL_CITY':
        if city_list is None:
            print("Please provide city list")
            print("Stop filtering")
            return df
        else:
            df = df.drop(df[~df[filter_column].isin(city_list)].index)
            df[filter_column] = df[filter_column].str.lower()
            print(f"{filter_column} is filtered successfully")
            return df
    elif filter_column == 'Deed_Date':
        if filter_date is None:
            print("Please provide date")
            print("Stop filtering")
            return df
        else:
            df = df.loc[df[filter_column] > filter_date]
            df.drop(columns=['Remodeled_Year', ], inplace=True)
            df.reset_index(drop=True, inplace=True)
            print(f"{filter_column} is filtered successfully")
            print("Remodeled_Year is removed")
            print("Index of dataframe is reset")
            return df


In [4]:
# read csv file
house_price_df = read_csv(house_file_path, print_columns=False)

  house_price_df = read_csv(house_file_path, print_columns=False)


(432976, 87)
The data is loaded successfully!


In [5]:
updated_house_price = used_columns_df(house_price_df, used_columns_list)
updated_house_price = format_date(updated_house_price, 'Remodeled_Year')
updated_house_price = format_date(updated_house_price, 'Deed_Date')
updated_house_price = fill_drop_na(updated_house_price, 'PHYSICAL_ZIP_CODE')
updated_house_price = fill_drop_na(updated_house_price, 'HEATED_AREA')
updated_house_price = format_price_value(updated_house_price, 'Land_Sale_Price', 'float', replace=True)
updated_house_price = format_price_value(updated_house_price, 'Total_sale_Price', 'float', replace=True)
updated_house_price = format_price_value(updated_house_price, 'Assessed_Building_Value', 'float', replace=True)

Remodeled_Year is converted successfully
Deed_Date is converted successfully
Number of nan values of PHYSICAL_ZIP_CODE is 0
Number of nan values of HEATED_AREA is 0
Land_Sale_Price is converted successfully
Total_sale_Price is converted successfully
Assessed_Building_Value is converted successfully


In [6]:
updated_house_price = convert_categorical_to_numeric_variables(updated_house_price, 'BATH')
updated_house_price = convert_categorical_to_numeric_variables(updated_house_price, 'Story_Height')

updated_house_price = filter_column(updated_house_price, 'TYPE_AND_USE')
updated_house_price = filter_column(updated_house_price, 'PHYSICAL_CITY', wake_cities)
print(updated_house_price.shape)
updated_house_price.head()

Bathroom number is converted successfully
Story height is converted successfully
TYPE_AND_USE is filtered successfully
PHYSICAL_CITY is filtered successfully
(335542, 13)


Unnamed: 0,Land_Sale_Price,Total_sale_Price,Deed_Date,Assessed_Building_Value,Story_Height,HEATED_AREA,UTILITIES,Remodeled_Year,BATH,BATH_FIXTURES,TYPE_AND_USE,PHYSICAL_ZIP_CODE,PHYSICAL_CITY
8,0.0,34500.0,1974-01-01,134321.0,1,1828,ALL,1970-01-01,2,0,1.0,27610,raleigh
9,0.0,35500.0,1993-10-28,114933.0,1,1240,E,1970-01-01,1,0,1.0,27610,raleigh
10,28000.0,0.0,2010-05-26,132624.0,1,1037,ALL,1970-01-01,2,0,1.0,27606,raleigh
11,0.0,37500.0,2004-09-16,118723.0,1,2261,WSE,1970-01-01,2,0,1.0,27591,wendell
14,0.0,70000.0,1971-01-01,358290.0,1,3770,WGE,1970-01-01,4,8,1.0,27613,raleigh


In [7]:
updated_house_price = remove_zero(updated_house_price, 'Total_sale_Price')
updated_house_price = filter_column(updated_house_price, 'Deed_Date', filter_date="2000-01-01")
updated_house_price = fill_drop_na(updated_house_price, 'UTILITIES')
updated_house_price = fill_drop_na(updated_house_price, 'BATH')
updated_house_price = fill_drop_na(updated_house_price, 'Story_Height')
print(updated_house_price.shape)
updated_house_price

Number of zero values of Total_sale_Price is 0
Deed_Date is filtered successfully
Remodeled_Year is removed
Index of dataframe is reset
None value in UTILITIES is dropped successfully
Index of the dataframe is reset successfully
None value in BATH is dropped successfully
Index of the dataframe is reset successfully
None value in Story_Height is dropped successfully
Index of the dataframe is reset successfully
(278913, 12)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Land_Sale_Price,Total_sale_Price,Deed_Date,Assessed_Building_Value,Story_Height,HEATED_AREA,UTILITIES,BATH,BATH_FIXTURES,TYPE_AND_USE,PHYSICAL_ZIP_CODE,PHYSICAL_CITY
0,0.0,37500.0,2004-09-16,118723.0,1,2261,WSE,2,0,1.0,27591,wendell
1,0.0,380000.0,2015-08-12,161077.0,1,1789,ALL,2,0,1.0,27607,raleigh
2,0.0,337500.0,2012-12-27,273621.0,1,2463,ALL,2,0,1.0,27615,raleigh
3,0.0,319000.0,2010-06-21,503301.0,1.5,4650,ALL,3.5,0,1.0,27608,raleigh
4,0.0,425000.0,2008-04-15,203178.0,2,1890,ALL,2,0,1.0,27604,raleigh
...,...,...,...,...,...,...,...,...,...,...,...,...
278908,0.0,1194000.0,2022-08-02,547787.0,1.88,4520,ALL,3.5,0,1.0,27502,apex
278909,0.0,1364500.0,2022-07-13,560914.0,1.88,4347,ALL,3.5,0,1.0,27502,apex
278910,0.0,220000.0,2022-06-07,93432.0,1,896,ALL,1,0,1.0,27529,garner
278911,0.0,170000.0,2021-05-03,119201.0,1,896,ALL,1,0,1.0,27529,garner


In [8]:
updated_house_price.isna().sum()

Land_Sale_Price            0
Total_sale_Price           0
Deed_Date                  0
Assessed_Building_Value    0
Story_Height               0
HEATED_AREA                0
UTILITIES                  0
BATH                       0
BATH_FIXTURES              0
TYPE_AND_USE               0
PHYSICAL_ZIP_CODE          0
PHYSICAL_CITY              0
dtype: int64

## Save as a CSV file

In [None]:
updated_house_price.to_csv('assets/updated_house_price.csv')