<font size="+3"><strong>Playground</strong></font>

Load libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

While loading the data we will apply what we learned in the previous section: **00_exploring_data**.

In [None]:
def load_data(file_path):
    """Load the listings JSON and convert the key columns to numbers.

    Only rows whose ``price`` text contains an explicit amount followed by
    ``kosten koper`` or ``vrij op naam`` are kept; every other row is dropped.

    Args:
        file_path: Path (or anything ``pd.read_json`` accepts) of the JSON file.

    Returns:
        A new DataFrame in which
        * ``price`` is a float expressed in thousands (the amount / 1000),
        * ``living_area`` is a float (thousands separators removed),
        * ``year_of_construction`` is numeric, with unparsable values as NaN.
    """
    df = pd.read_json(file_path)

    # One raw-string pattern both selects the rows with a known price and
    # captures the amount, so the filter and the parsing cannot disagree.
    price_pattern = r'€ (\d+[\d,]*\d+) (?:kosten koper|vrij op naam)'
    amount = df['price'].str.extract(price_pattern, expand=False)
    has_price = amount.notna()

    # Copy so the column assignments below modify an independent frame
    # instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df[has_price].copy()

    # convert prices to numbers (thousands)
    df['price'] = amount[has_price].str.replace(',', '', regex=False).astype(float) / 1000

    # convert living area to numbers
    df['living_area'] = df['living_area'].str.replace(',', '', regex=False).astype(float)

    # construction years that are not plain numbers become NaN
    df['year_of_construction'] = pd.to_numeric(df['year_of_construction'], errors='coerce')

    return df

# Read the Funda snapshot and print a summary of the resulting columns.
df = load_data(file_path='../data/funda_15_08_2022.json')
df.info()

## Price distribution by towns

I'm interested in properties built in 1990 or later with a living area between 80 and 100 square meters (which is much smaller than the average property on Funda).

In [None]:
# Keep homes whose construction year is 1990 or later and before 2023
# (rows with an unknown year compare as False and are dropped).
mask_year_of_construction = (
    df['year_of_construction'].ge(1990) & df['year_of_construction'].lt(2023)
)
df = df.loc[mask_year_of_construction]

# Keep homes with a living area from 80 up to and including 100.
mask_living_area = (df['living_area'] >= 80) & (df['living_area'] <= 100)
df = df.loc[mask_living_area]

# Count the remaining listings per town and discard towns with fewer than 20.
value_counts = df['town'].value_counts()
to_remove = value_counts.loc[value_counts < 20].index
df = df.loc[~df['town'].isin(to_remove)]
# The counts printed here were taken before the small towns were removed.
print(f'Towns with matched properties:\n{value_counts}')

# Trim the bottom and top 10% by price within each town.
#
# The 10th and 90th price percentiles are computed per town and broadcast back
# onto every row, so each listing is compared against the bounds of its own
# town. Rows outside [10th, 90th] are dropped; the remaining rows keep their
# original order. (The previous loop concatenated the full, untrimmed frame on
# every iteration, so nothing was actually trimmed and rows were duplicated.)
town_prices = df.groupby('town')['price']
price_low = town_prices.transform(lambda prices: prices.quantile(0.1))
price_high = town_prices.transform(lambda prices: prices.quantile(0.9))
df = df[df['price'].between(price_low, price_high)]

# Horizontal box plot of prices, one box per town, on a wide 20x5 figure.
f = plt.figure(figsize=(20, 5))
sns.boxplot(data=df, x="price", y="town", orient='h')
plt.title('Distribution of Home Prices')
plt.xlabel('Price in thousands of EUR')
plt.ylabel('Town')
plt.show()