### Clean and Import Dataset: Avg House Price 2020

In [233]:
import pandas as pd
import numpy as np

In [234]:
# Load the 2020 price data from CSV and preview the first five rows.
# Prices arrive as formatted strings (e.g. "$2,794"), so they are parsed into
# integers in the cells that follow.
df = pd.read_csv('nyc_housing_prices_jul_2020.csv')
df.head()

Unnamed: 0,borough,neigborhood,studio,1_bedroom,2_bedroom,3_bedroom
0,manhatan,Chelsea,"$2,794","$4,062","$6,997","$11,031"
1,manhatan,East Village,"$2,608","$3,280","$4,044","$5,289"
2,manhatan,Flatiron/Union Square,"$3,632","$4,645","$8,267","$15,969"
3,manhatan,Gramercy Park,"$2,569","$3,656","$5,413","$7,226"
4,manhatan,Greenwich Village,"$2,828","$4,116","$8,000","$12,418"


In [235]:
# Fix the borough spellings found in the raw file ('manhatan' -> 'Manhattan')
# and capitalise 'brooklyn', in a single pass over the column.
borough = [
    name.replace('manhatan', 'Manhattan').replace('brooklyn', 'Brooklyn')
    for name in df['borough'].tolist()
]

In [236]:
# Convert the 'studio' price strings (e.g. "$2,794") into integers.
# A cell that is exactly '-' marks a missing price and is encoded as 0; those
# zeros are turned into NaN further down in this notebook.
# Only an exact '-' is treated as missing: replacing every hyphen with '0'
# would silently turn a malformed value such as "1,000-2,000" into 100002000
# instead of failing loudly in int().
studio = [
    0 if price.strip() == '-' else int(price.replace('$', '').replace(',', ''))
    for price in df['studio'].tolist()
]

In [237]:
# Strip the currency formatting ("$" and thousands separators) from the
# 1-bedroom prices and convert each one to an integer.
bed1 = [
    int(price.replace('$', '').replace(',', ''))
    for price in df['1_bedroom'].tolist()
]

In [238]:
# Strip the currency formatting ("$" and thousands separators) from the
# 2-bedroom prices and convert each one to an integer.
bed2 = [
    int(price.replace('$', '').replace(',', ''))
    for price in df['2_bedroom'].tolist()
]

In [239]:
# Convert the '3_bedroom' price strings (e.g. "$11,031") into integers.
# A cell that is exactly '-' marks a missing price and is encoded as 0; those
# zeros are turned into NaN further down in this notebook.
# Only an exact '-' is treated as missing: replacing every hyphen with '0'
# would silently turn a malformed value such as "1,000-2,000" into 100002000
# instead of failing loudly in int().
bed3 = [
    0 if price.strip() == '-' else int(price.replace('$', '').replace(',', ''))
    for price in df['3_bedroom'].tolist()
]

In [240]:
# Assemble the cleaned 2020 table from the corrected borough names, the
# neighbourhood names taken straight from the raw frame (whose column is
# spelled 'neigborhood'), and the parsed price lists.
house_2020 = pd.DataFrame(
    data={
        'borough': borough,
        'neighbourhood': df['neigborhood'],
        'studio': studio,
        '1_bed': bed1,
        '2_bed': bed2,
        '3_bed': bed3,
    }
)

house_2020.head()

Unnamed: 0,borough,neighbourhood,studio,1_bed,2_bed,3_bed
0,Manhattan,Chelsea,2794,4062,6997,11031
1,Manhattan,East Village,2608,3280,4044,5289
2,Manhattan,Flatiron/Union Square,3632,4645,8267,15969
3,Manhattan,Gramercy Park,2569,3656,5413,7226
4,Manhattan,Greenwich Village,2828,4116,8000,12418


In [264]:
# Missing prices (the raw '-' placeholder) were encoded as 0 during cleaning;
# turn those zeros into NaN so they do not skew the numeric summaries.
house_2020 = house_2020.replace(to_replace=0, value=np.nan)

In [265]:
# Show dtypes and non-null counts per column; columns that received NaN
# values are reported as float64 with fewer non-null entries.
house_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   borough        55 non-null     object 
 1   neighbourhood  55 non-null     object 
 2   studio         54 non-null     float64
 3   1_bed          55 non-null     int64  
 4   2_bed          55 non-null     int64  
 5   3_bed          54 non-null     float64
dtypes: float64(2), int64(2), object(2)
memory usage: 2.7+ KB


In [266]:
# Count the missing values in each column.
house_2020.isna().sum()

borough          0
neighbourhood    0
studio           1
1_bed            0
2_bed            0
3_bed            1
dtype: int64

In [267]:
# Summary statistics for the numeric price columns (NaN entries are excluded
# from each column's count).
house_2020.describe()

Unnamed: 0,studio,1_bed,2_bed,3_bed
count,54.0,55.0,55.0,54.0
mean,2522.518519,3226.072727,4960.6,7777.259259
std,630.860232,904.262803,2167.655156,4822.671104
min,1400.0,1866.0,2273.0,2500.0
25%,2128.75,2535.5,3345.0,4115.75
50%,2404.5,3257.0,4411.0,6159.5
75%,2711.5,3668.0,6075.5,10651.75
max,5121.0,6117.0,12173.0,26970.0


In [268]:
# List any rows that are exact duplicates of an earlier row.
house_2020.loc[house_2020.duplicated()]

Unnamed: 0,borough,neighbourhood,studio,1_bed,2_bed,3_bed


### Clean and Import Dataset: Avg House Price 2021

In [245]:
# Load the 2021 price data from CSV and preview the first five rows.
# In the preview the neighbourhood name is repeated beside each price column
# and several 'Unnamed: N' columns appear empty.
house_2021 = pd.read_csv('nyc_housing_prices_feb_2021.csv')
house_2021.head()

Unnamed: 0,neighborhood,studio,Unnamed: 2,neighborhood.1,1_bedroom,Unnamed: 5,neighborhood.2,2_bedroom,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Upper West Side,"$1,975",,Upper West Side,"$2,750",,Upper West Side,"$4,350",,,
1,Bedford-Stuyvesant,"$1,650",,Bedford-Stuyvesant,"$1,975",,Bedford-Stuyvesant,"$2,200",,,
2,Bushwick,"$1,731",,Bushwick,"$2,150",,Bushwick,"$2,100",,,
3,Upper East Side,"$1,898",,Upper East Side,"$2,395",,Upper East Side,"$3,295",,,
4,Williamsburg,"$2,491",,Williamsburg,"$2,500",,Williamsburg,"$2,584",,,


In [246]:
# Keep only the columns needed for the analysis. The raw file repeats the
# neighbourhood name three times ('neighborhood', 'neighborhood.1',
# 'neighborhood.2') and carries 'Unnamed: N' columns that appear empty in the
# preview; selecting the wanted columns by name discards all of them at once.
# A separate drop() beforehand is unnecessary, and it made this cell raise
# KeyError when re-run because the dropped columns were already gone.
house_2021 = house_2021[['neighborhood.2', 'studio', '1_bedroom', '2_bedroom']]

In [247]:
# Relabel the four remaining columns (positionally) with the short price
# column names used for the 2020 table ('1_bed', '2_bed') and a plain
# 'neighborhood' label, then preview the result.
house_2021 = house_2021.set_axis(
    ['neighborhood', 'studio', '1_bed', '2_bed'], axis=1
)
house_2021.head()

Unnamed: 0,neighborhood,studio,1_bed,2_bed
0,Upper West Side,"$1,975","$2,750","$4,350"
1,Bedford-Stuyvesant,"$1,650","$1,975","$2,200"
2,Bushwick,"$1,731","$2,150","$2,100"
3,Upper East Side,"$1,898","$2,395","$3,295"
4,Williamsburg,"$2,491","$2,500","$2,584"


In [248]:
# Fill missing cells with the '-' placeholder so every value is a string;
# the price-cleaning cells below call str.replace on each value and map '-'
# to 0.
house_2021 = house_2021.fillna(value='-')

In [249]:
# Convert the 2021 'studio' price strings (e.g. "$1,975") into integers.
# Missing values were filled with '-' earlier; a cell that is exactly '-' is
# encoded as 0 and those zeros are turned into NaN further down.
# Only an exact '-' is treated as missing: replacing every hyphen with '0'
# would silently turn a malformed value such as "1,000-2,000" into 100002000
# instead of failing loudly in int().
studio = [
    0 if price.strip() == '-' else int(price.replace('$', '').replace(',', ''))
    for price in house_2021['studio'].tolist()
]

In [250]:
# Convert the 2021 '1_bed' price strings (e.g. "$2,750") into integers.
# Missing values were filled with '-' earlier; a cell that is exactly '-' is
# encoded as 0 and those zeros are turned into NaN further down.
# Only an exact '-' is treated as missing: replacing every hyphen with '0'
# would silently turn a malformed value such as "1,000-2,000" into 100002000
# instead of failing loudly in int().
bed1 = [
    0 if price.strip() == '-' else int(price.replace('$', '').replace(',', ''))
    for price in house_2021['1_bed'].tolist()
]

In [251]:
# Convert the 2021 '2_bed' price strings (e.g. "$4,350") into integers.
# Missing values were filled with '-' earlier; a cell that is exactly '-' is
# encoded as 0 and those zeros are turned into NaN further down.
# Only an exact '-' is treated as missing: replacing every hyphen with '0'
# would silently turn a malformed value such as "1,000-2,000" into 100002000
# instead of failing loudly in int().
bed2 = [
    0 if price.strip() == '-' else int(price.replace('$', '').replace(',', ''))
    for price in house_2021['2_bed'].tolist()
]

In [269]:
# Assemble the cleaned 2021 table from the neighbourhood names in the raw
# frame and the parsed price lists, using the 'neighbourhood' spelling of the
# 2020 table.
house_2021_n = pd.DataFrame(
    data={
        'neighbourhood': house_2021['neighborhood'],
        'studio': studio,
        '1_bed': bed1,
        '2_bed': bed2,
    }
)

house_2021_n.head()

Unnamed: 0,neighbourhood,studio,1_bed,2_bed
0,Upper West Side,1975,2750,4350
1,Bedford-Stuyvesant,1650,1975,2200
2,Bushwick,1731,2150,2100
3,Upper East Side,1898,2395,3295
4,Williamsburg,2491,2500,2584


In [270]:
# Missing prices (the '-' placeholder) were encoded as 0 during cleaning;
# turn those zeros into NaN so they do not skew the numeric summaries.
house_2021_clean = house_2021_n.replace(to_replace=0, value=np.nan)

In [271]:
# Show dtypes and non-null counts per column; columns that received NaN
# values are reported as float64 with fewer non-null entries.
house_2021_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   neighbourhood  205 non-null    object 
 1   studio         137 non-null    float64
 2   1_bed          194 non-null    float64
 3   2_bed          205 non-null    int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 6.5+ KB


In [272]:
# Count the missing values in each column.
house_2021_clean.isna().sum()

neighbourhood     0
studio           68
1_bed            11
2_bed             0
dtype: int64

In [273]:
# List any rows that are exact duplicates of an earlier row.
house_2021_clean.loc[house_2021_clean.duplicated()]

Unnamed: 0,neighbourhood,studio,1_bed,2_bed


In [274]:
# Summary statistics for the numeric price columns (NaN entries are excluded
# from each column's count).
house_2021_clean.describe()

Unnamed: 0,studio,1_bed,2_bed
count,137.0,194.0,205.0
mean,2019.386861,1999.505155,2522.102439
std,726.45807,656.723483,1104.520526
min,947.0,1100.0,1377.0
25%,1575.0,1600.0,1950.0
50%,1850.0,1750.0,2109.0
75%,2300.0,2287.5,2645.0
max,5900.0,5483.0,9295.0


### Clean and Import Dataset: Avg House Price 2021 Zip Codes

In [275]:
# Load the borough / neighbourhood / zip-code lookup table and preview the
# first five rows. In the preview each row's zip_codes cell holds several
# comma-separated codes in one string.
house_2021_zip = pd.read_csv('nyc_housing_prices_feb_2021_zip.csv')
house_2021_zip.head()

Unnamed: 0,borough,neighborhood,zip_codes
0,Bronx,Central Bronx,"10453, 10457, 10460"
1,Bronx,Bronx Park and Fordham,"10458, 10467, 10468"
2,Bronx,High Bridge and Morrisania,"10451, 10452, 10456"
3,Bronx,Hunts Point and Mott Haven,"10454, 10455, 10459, 10474"
4,Bronx,Kingsbridge and Riverdale,"10463, 10471"


The data looks good. The `zip_codes` column could later be split into separate columns, one per zip code.