# Zillow Data Cleaner

In [1]:
import pandas as pd
import numpy as np
import project1scripts

In [2]:
# Load the Zillow metro-level median sale price export. index_col=False tells
# pandas not to use the first column of the file as the row index.
source_csv = r'ZillowData/2023 0927 - Metro_median_sale_price_uc_sfrcondo_sm_month.csv'
raw_df = pd.read_csv(source_csv, encoding='utf-8', index_col=False)

# Any column whose label contains one of these fragments is discarded:
# leftover 'Unnamed' columns, the RegionType column, and every column whose
# label mentions one of the years outside the period of interest.
irrelevant_years = ['2008', '2009', '2010', '2011', '2012']
unwanted_pattern = '|'.join(['Unnamed', 'RegionType'] + irrelevant_years)
unwanted_cols = raw_df.columns.str.contains(unwanted_pattern)

whole_df = (
    raw_df.loc[:, ~unwanted_cols]
    # Identifier/ranking columns are not needed downstream.
    .drop(columns=['RegionID', 'SizeRank'])
    # Skip the first data row.
    # NOTE(review): presumably the national aggregate row - confirm against the file.
    .iloc[1:]
    # Keep only rows that have at least 120 non-missing values.
    .dropna(axis=0, thresh=120)
    .reset_index(drop=True)
)
whole_df

Unnamed: 0,RegionName,StateName,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
0,"New York, NY",NY,344911.00,339350.00,335350.00,333517.00,342000.00,356000.00,371000.00,380667.00,...,543333.00,528333.00,518333.00,515333.00,509500.00,510333.00,516667.00,535833.00,558333.00,579667.00
1,"Los Angeles, CA",CA,394000.00,397333.00,407333.00,430667.00,450667.00,465000.00,468333.00,471667.00,...,852000.00,840333.00,827000.00,818333.00,816667.00,827333.00,839000.00,854000.00,873333.00,888333.00
2,"Chicago, IL",IL,162833.00,158000.00,160218.00,170218.00,183385.00,197833.00,207500.00,212333.00,...,285000.00,278333.00,273333.00,270000.00,269000.00,275633.00,287300.00,300300.00,313167.00,319833.00
3,"Dallas, TX",TX,151667.00,152167.00,155333.00,164500.00,172800.00,178467.00,181633.00,181125.00,...,390500.00,382500.00,373333.00,363333.00,362333.00,367000.00,377000.00,384667.00,395000.00,400333.00
4,"Houston, TX",TX,149833.00,148572.00,150405.00,160155.00,167417.00,174900.00,177650.00,178650.00,...,321000.00,316000.00,310667.00,306000.00,304333.00,306333.00,313333.00,321667.00,328667.00,333667.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,"Easton, MD",MD,,,281833.00,279167.00,295667.00,279542.00,311208.00,296875.00,...,435833.00,449167.00,432483.00,402483.00,402567.00,390900.00,380900.00,402483.00,436667.00,475000.00
471,"Kill Devil Hills, NC",NC,414000.00,384000.00,352833.00,307833.00,303833.00,292500.00,288333.00,281667.00,...,526333.00,541333.00,508333.00,531667.00,520500.00,536333.00,524667.00,525833.00,526667.00,524833.00
472,"Brevard, NC",NC,152333.00,155333.00,157333.00,150667.00,159417.00,167417.00,190417.00,196167.00,...,407333.00,410667.00,418167.00,419000.00,397500.00,346667.00,357000.00,396167.00,439500.00,465833.00
473,"Breckenridge, CO",CO,363750.00,393417.00,402167.00,390500.00,374425.00,348592.00,355175.00,339583.00,...,907833.00,922833.00,943667.00,981738.00,981404.00,907238.00,937500.00,896167.00,920500.00,901000.00


In [3]:
# Structural summary of the filtered frame (index range, column count, dtypes, memory).
whole_df.info()

# Tally missing cells per column first, then add the per-column tallies together
# to get the total number of missing cells in the frame.
nan_per_column = whole_df.isna().sum()
print('Number of NaN Values:', nan_per_column.sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 475 entries, 0 to 474
Columns: 129 entries, RegionName to 7/31/2023
dtypes: object(129)
memory usage: 478.8+ KB
Number of NaN Values: 20


In [4]:
# Inspect the per-column dtypes right after loading. The recorded output shows
# every column, including the monthly price columns, typed as object, which is
# why a numeric conversion step follows.
whole_df.dtypes

RegionName    object
StateName     object
1/31/2013     object
2/28/2013     object
3/31/2013     object
               ...  
3/31/2023     object
4/30/2023     object
5/31/2023     object
6/30/2023     object
7/31/2023     object
Length: 129, dtype: object

In [5]:
# Convert the price columns to a numeric dtype using the project helper.
# NOTE(review): the second argument (2) is presumably the position of the first
# column to convert, which would skip RegionName and StateName - confirm against
# project1scripts.convert_to_numeric, whose source is not visible here.
whole_df = project1scripts.convert_to_numeric(whole_df, 2)

In [6]:
# Re-check the dtypes after the conversion step. The recorded output shows
# RegionName and StateName still as object and the monthly columns as float64.
whole_df.dtypes

RegionName     object
StateName      object
1/31/2013     float64
2/28/2013     float64
3/31/2013     float64
               ...   
3/31/2023     float64
4/30/2023     float64
5/31/2023     float64
6/30/2023     float64
7/31/2023     float64
Length: 129, dtype: object

In [7]:
# Fill gaps in the price data using the project helper.
# NOTE(review): the helper's exact rule is not visible in this notebook. The
# recorded outputs show 20 missing values before the numeric conversion and 2
# after this call, so some gaps are left unfilled - confirm which averaging
# rule project1scripts.fill_zero_with_avg applies and why it skips those cells.
whole_df = project1scripts.fill_zero_with_avg(whole_df)

In [8]:
# Display the frame after numeric conversion and gap filling for a visual check
# (pandas truncates the rendering to the first and last rows/columns).
whole_df

Unnamed: 0,RegionName,StateName,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
0,"New York, NY",NY,344911.0,339350.0,335350.0,333517.0,342000.0,356000.0,371000.0,380667.0,...,543333.0,528333.0,518333.0,515333.0,509500.0,510333.0,516667.0,535833.0,558333.0,579667.0
1,"Los Angeles, CA",CA,394000.0,397333.0,407333.0,430667.0,450667.0,465000.0,468333.0,471667.0,...,852000.0,840333.0,827000.0,818333.0,816667.0,827333.0,839000.0,854000.0,873333.0,888333.0
2,"Chicago, IL",IL,162833.0,158000.0,160218.0,170218.0,183385.0,197833.0,207500.0,212333.0,...,285000.0,278333.0,273333.0,270000.0,269000.0,275633.0,287300.0,300300.0,313167.0,319833.0
3,"Dallas, TX",TX,151667.0,152167.0,155333.0,164500.0,172800.0,178467.0,181633.0,181125.0,...,390500.0,382500.0,373333.0,363333.0,362333.0,367000.0,377000.0,384667.0,395000.0,400333.0
4,"Houston, TX",TX,149833.0,148572.0,150405.0,160155.0,167417.0,174900.0,177650.0,178650.0,...,321000.0,316000.0,310667.0,306000.0,304333.0,306333.0,313333.0,321667.0,328667.0,333667.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,"Easton, MD",MD,241246.0,224825.0,281833.0,279167.0,295667.0,279542.0,311208.0,296875.0,...,435833.0,449167.0,432483.0,402483.0,402567.0,390900.0,380900.0,402483.0,436667.0,475000.0
471,"Kill Devil Hills, NC",NC,414000.0,384000.0,352833.0,307833.0,303833.0,292500.0,288333.0,281667.0,...,526333.0,541333.0,508333.0,531667.0,520500.0,536333.0,524667.0,525833.0,526667.0,524833.0
472,"Brevard, NC",NC,152333.0,155333.0,157333.0,150667.0,159417.0,167417.0,190417.0,196167.0,...,407333.0,410667.0,418167.0,419000.0,397500.0,346667.0,357000.0,396167.0,439500.0,465833.0
473,"Breckenridge, CO",CO,363750.0,393417.0,402167.0,390500.0,374425.0,348592.0,355175.0,339583.0,...,907833.0,922833.0,943667.0,981738.0,981404.0,907238.0,937500.0,896167.0,920500.0,901000.0


In [9]:
# Count the missing cells that are still present after the filling step:
# per-column tallies first, then their grand total.
nan_per_column = whole_df.isna().sum()
print('Number of NaN Values:', nan_per_column.sum())

Number of NaN Values: 2


In [10]:
# Replace every remaining missing cell, in any column, with the literal 0.
# NOTE(review): a 0 in a price column is a placeholder, not a real median sale
# price - downstream statistics must treat zeros as missing or they will be skewed.
whole_df = whole_df.fillna(value=0)

# Confirm that no missing cells are left.
remaining_nan_per_column = whole_df.isna().sum()
print('Number of NaN Values:', remaining_nan_per_column.sum())

Number of NaN Values: 0


In [11]:
# Persist the cleaned frame next to the notebook.
# index=True is the pandas default, spelled out here on purpose: the row index is
# written as a leading column with an empty header, which a plain read_csv later
# reports as 'Unnamed: 0' unless the reader passes index_col=0.
output_csv = 'clean_median_sale_price_v2_df.csv'
whole_df.to_csv(output_csv, index=True)