In [1]:
import pandas as pd
import os
import missingno as msno
import datetime
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read csv files
house_price_df = pd.read_csv("assets\house_price.csv")
house_price_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,OWNER1,OWNER2,Mailing_address1,Mailing_Address2,Mailing_Address3,REAL_ESTATE_ID,CARD_NUMBER,NUMBER_OF_CARDS,Street_Number,Street_Prefix,...,Land_Deferred_code,Land_Deferred_Amount,Historic_Deferred_code,Historic_Deferred_Amount,RECYCLED_UNITS,Disq_and_Qual_flag,Land_Disq_and_Qual_flag,TYPE_AND_USE,PHYSICAL_CITY,PHYSICAL_ZIP_CODE
0,NORTHGATE PROPERTIES LLC,,8465 LENTIC CT,RALEIGH NC 27615-4964,,1,1,1,1506,,...,,0,,0,0,,,,RALEIGH,27604.0
1,BONES 13 LLC,,306 MIDENHALL WAY,CARY NC 27513-5569,,3,1,2,6012,,...,,0,,0,0,A,,20.0,RALEIGH,27617.0
2,BONES 13 LLC,,306 MIDENHALL WAY,CARY NC 27513-5569,,3,2,2,6012,,...,,0,,0,0,A,,34.0,RALEIGH,27617.0
3,HARRISPARK PROPERTIES INC,,608 GASTON ST STE 200,RALEIGH NC 27603-1258,,4,1,1,1601,,...,,0,,0,0,T,,85.0,RALEIGH,27604.0
4,"MOGHADASS, INC",,5040 ISABELLA CANNON DR,RALEIGH NC 27612-4804,,5,1,1,1831,,...,,0,,0,0,A,,200.0,RALEIGH,27604.0


In [3]:
# house_price_df.columns

In [4]:
updated_house_price = house_price_df[['Land_Sale_Price', 'Total_sale_Price', 'Deed_Date', 'Assessed_Building_Value', 'Story_Height', 'HEATED_AREA',
       'UTILITIES', 'Remodeled_Year', 'BATH', 'TYPE_AND_USE', 'PHYSICAL_ZIP_CODE', 'PHYSICAL_CITY']].copy()
updated_house_price['Land_Sale_Price'] = updated_house_price['Land_Sale_Price'].str.replace(',', '').astype(float)
updated_house_price['Total_sale_Price'] = updated_house_price['Total_sale_Price'].str.replace(',', '').astype(float)
updated_house_price['Assessed_Building_Value'] = updated_house_price['Assessed_Building_Value'].str.replace(',', '').astype(float)
updated_house_price['PHYSICAL_ZIP_CODE'] = updated_house_price['PHYSICAL_ZIP_CODE'].fillna(0).astype(int)
updated_house_price['HEATED_AREA'] = updated_house_price['HEATED_AREA'].fillna(0).astype(int)
updated_house_price['Remodeled_Year'] = pd.to_datetime(updated_house_price['Remodeled_Year'], errors='coerce')
updated_house_price['Deed_Date'] = pd.to_datetime(updated_house_price['Deed_Date'], errors='coerce')

# convert bath category to numeric
updated_house_price.loc[updated_house_price['BATH'] == 'A', 'BATH'] = 1
updated_house_price.loc[updated_house_price['BATH'] == 'B', 'BATH'] = 1.5
updated_house_price.loc[updated_house_price['BATH'] == 'C', 'BATH'] = 2
updated_house_price.loc[updated_house_price['BATH'] == 'D', 'BATH'] = 2.5
updated_house_price.loc[updated_house_price['BATH'] == 'E', 'BATH'] = 3
updated_house_price.loc[updated_house_price['BATH'] == 'F', 'BATH'] = 3.5
updated_house_price.loc[updated_house_price['BATH'] == 'H', 'BATH'] = 0
updated_house_price.loc[updated_house_price['BATH'] == 'I', 'BATH'] = 1
updated_house_price.loc[updated_house_price['BATH'] == 'J', 'BATH'] = 0

# convert story height to numeric
updated_house_price.loc[updated_house_price['Story_Height'] == 'A', 'Story_Height'] = 1
updated_house_price.loc[updated_house_price['Story_Height'] == 'B', 'Story_Height'] = 1.5
updated_house_price.loc[updated_house_price['Story_Height'] == 'C', 'Story_Height'] = 2
updated_house_price.loc[updated_house_price['Story_Height'] == 'D', 'Story_Height'] = 2.5
updated_house_price.loc[updated_house_price['Story_Height'] == 'E', 'Story_Height'] = 3
updated_house_price.loc[updated_house_price['Story_Height'] == 'F', 'Story_Height'] = 3.5
updated_house_price.loc[updated_house_price['Story_Height'] == 'G', 'Story_Height'] = 4
updated_house_price.loc[updated_house_price['Story_Height'] == 'H', 'Story_Height'] = 5
updated_house_price.loc[updated_house_price['Story_Height'] == 'I', 'Story_Height'] = 1.75
updated_house_price.loc[updated_house_price['Story_Height'] == 'J', 'Story_Height'] = 1.4
updated_house_price.loc[updated_house_price['Story_Height'] == 'K', 'Story_Height'] = 1.63
updated_house_price.loc[updated_house_price['Story_Height'] == 'L', 'Story_Height'] = 1.88
updated_house_price.loc[updated_house_price['Story_Height'] == 'M', 'Story_Height'] = 2.4
updated_house_price.loc[updated_house_price['Story_Height'] == 'N', 'Story_Height'] = 2.63
updated_house_price.loc[updated_house_price['Story_Height'] == 'O', 'Story_Height'] = 2.75


updated_house_price.head()

Unnamed: 0,Land_Sale_Price,Total_sale_Price,Deed_Date,Assessed_Building_Value,Story_Height,HEATED_AREA,UTILITIES,Remodeled_Year,BATH,TYPE_AND_USE,PHYSICAL_ZIP_CODE,PHYSICAL_CITY
0,0.0,0.0,2020-09-30,0.0,,0,ALL,1970-01-01,,,27604,RALEIGH
1,0.0,740000.0,2012-03-05,524503.0,1.0,4500,ALL,1970-01-01,1.0,20.0,27617,RALEIGH
2,0.0,740000.0,2012-03-05,524503.0,1.0,4008,ALL,1970-01-01,1.0,34.0,27617,RALEIGH
3,0.0,185000.0,2011-01-19,374128.0,1.0,8500,ALL,1970-01-01,1.0,85.0,27604,RALEIGH
4,0.0,750000.0,2016-06-17,238511.0,1.0,3088,ALL,1970-01-01,1.0,200.0,27604,RALEIGH


In [5]:
# According to the U.S. Census Bureau, a single-family house is one that may be fully detached, semi-detached, a row house or a townhome. df.loc[df['column_name'].isin(some_values)]
updated_house_price = updated_house_price.loc[updated_house_price['TYPE_AND_USE'].isin([1, 8])]
updated_house_price.loc[updated_house_price['Deed_Date'].isnull(), 'Deed_Date'] = updated_house_price['Remodeled_Year']

In [6]:
# Remove non wake county cities
# Wake county cities: Apex, Cary, Fuquay-Varina, Garner, Holly Springs, Knightdale, Morrisville, Raleigh, Rolesville, Wake Forest, Wendell, Zebulon
# Reference: https://www.wakegov.com/living-visiting/cities-towns
wake_cities = ['APEX', 'CARY', 'FUQUAY VARINA', 'GARNER', 'HOLLY SPRINGS', 'KNIGHTDALE', 'MORRISVILLE', 'RALEIGH', 'ROLESVILLE', 'WAKE FOREST', 'WENDELL', 'ZEBULON']
updated_house_price = updated_house_price.drop(updated_house_price[~updated_house_price['PHYSICAL_CITY'].isin(wake_cities)].index)
updated_house_price['PHYSICAL_CITY'].unique()

array(['RALEIGH', 'WENDELL', 'APEX', 'CARY', 'FUQUAY VARINA',
       'KNIGHTDALE', 'GARNER', 'WAKE FOREST', 'HOLLY SPRINGS', 'ZEBULON',
       'ROLESVILLE', 'MORRISVILLE'], dtype=object)

In [7]:
updated_house_price = updated_house_price.loc[updated_house_price['Deed_Date'] > "2000-01-01"]
print(f"Total number of zero total sale price is {len(updated_house_price[updated_house_price['Total_sale_Price'] == 0])}")
print(f"Total number of zero land sale price is {len(updated_house_price[updated_house_price['Land_Sale_Price'] == 0])}")
updated_house_price = updated_house_price.loc[updated_house_price['Total_sale_Price'] != 0]


updated_house_price.drop(columns=['Remodeled_Year', ], inplace=True)
updated_house_price.reset_index(drop=True, inplace=True)
updated_house_price

Total number of zero total sale price is 14014
Total number of zero land sale price is 171869


Unnamed: 0,Land_Sale_Price,Total_sale_Price,Deed_Date,Assessed_Building_Value,Story_Height,HEATED_AREA,UTILITIES,BATH,TYPE_AND_USE,PHYSICAL_ZIP_CODE,PHYSICAL_CITY
0,0.0,37500.0,2004-09-16,118723.0,1,2261,WSE,2,1.0,27591,WENDELL
1,0.0,380000.0,2015-08-12,161077.0,1,1789,ALL,2,1.0,27607,RALEIGH
2,0.0,337500.0,2012-12-27,273621.0,1,2463,ALL,2,1.0,27615,RALEIGH
3,0.0,319000.0,2010-06-21,503301.0,1.5,4650,ALL,3.5,1.0,27608,RALEIGH
4,0.0,425000.0,2008-04-15,203178.0,2,1890,ALL,2,1.0,27604,RALEIGH
...,...,...,...,...,...,...,...,...,...,...,...
282217,0.0,1194000.0,2022-08-02,547787.0,1.88,4520,ALL,3.5,1.0,27502,APEX
282218,0.0,1364500.0,2022-07-13,560914.0,1.88,4347,ALL,3.5,1.0,27502,APEX
282219,0.0,220000.0,2022-06-07,93432.0,1,896,ALL,1,1.0,27529,GARNER
282220,0.0,170000.0,2021-05-03,119201.0,1,896,ALL,1,1.0,27529,GARNER


In [8]:
print(f"The average sale price is ${round(updated_house_price['Total_sale_Price'].mean(), 2)}")
print(f"The average land sale price is ${round(updated_house_price['Land_Sale_Price'].mean(), 2)}")
print(f"The unique values of the story height are {updated_house_price['Story_Height'].value_counts()}")
print(f"The unique values of the bath room are {updated_house_price['BATH'].value_counts()}")

The average sale price is $331742.46
The average land sale price is $26403.92
The unique values of the story height are 2.00    134350
1.00     82368
1.50     20263
1.75     18870
1.88     12284
1.63      6972
1.40      4395
3.00      1314
2.50        24
2.40         3
4.00         1
2.75         1
2.63         1
Name: Story_Height, dtype: int64
The unique values of the bath room are 2.5    119127
3.5     56202
2.0     48319
3.0     27869
0.0     10451
1.0     10217
1.5      8647
Name: BATH, dtype: int64


## Save as a CSV file

In [9]:
# updated_house_price.to_csv('assets/updated_house_price.csv')