# flats-in-cracow data wrangling

## Imports

In [483]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from pylab import rcParams

## Setup

In [484]:
# Create directory for images 
Path("img").mkdir(parents=True, exist_ok=True)

NameError: name 'Path' is not defined

In [None]:
# Set default figure size
rcParams['figure.figsize'] = (4, 4)

In [None]:
# Tell pandas how to display floats
pd.options.display.float_format = "{:,.2f}".format

## Goal

I scraped listings of properties for sale in Cracow. We would like to create a model to predict flat prices.

## Data source

Data has been scraped from a website with listings. The data has undergone small transformations along the way. The goal of these transformations was to get the data into a usable state not to check it's validity.

## Data loading

In [None]:
path = '../flats-data/cleaned_data.csv'

In [485]:
data = pd.read_csv(path, lineterminator='\n')

In [486]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37602 entries, 0 to 37601
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         37504 non-null  object 
 1   City         24096 non-null  object 
 2   District     19525 non-null  object 
 3   Amount       37472 non-null  float64
 4   Currency     37472 non-null  object 
 5   Property     37223 non-null  object 
 6   Seller       37391 non-null  object 
 7   Area         37278 non-null  float64
 8   Rooms        36863 non-null  float64
 9   Bathrooms    23874 non-null  float64
 10  Parking      15867 non-null  object 
 11  Garden       37602 non-null  bool   
 12  Balcony      37602 non-null  bool   
 13  Terrace      37602 non-null  bool   
 14  Lift         37602 non-null  bool   
 15  New          37602 non-null  bool   
 16  Estate       37602 non-null  bool   
 17  Townhouse    37602 non-null  bool   
 18  Apartment    37602 non-null  bool   
 19  Land

In [487]:
data = data.drop_duplicates(['Title'])

In [488]:
print(data.shape)

(7329, 24)


## Data exploration

We see that we have 24 columns at our disposal. 
We inspect the numeric columns to see what we are dealing with. 
In the `Amount` column we note there is a property for sale that costs 1PLN, clearly a erroneous value. 
Next we note that the enourmous maximum in the `Amount` column. That is quite a lot of money and could be considered a potential outlier.
The maximum and minimum of the `Area` column also indicate the existance of outliers. These values are clearly too large. The data will need to undergo a filtering process.

In [489]:
data.describe()

Unnamed: 0,Amount,Area,Rooms,Bathrooms
count,7324.0,7237.0,7149.0,4252.0
mean,682022.25,155.16,2.91,1.33
std,715730.68,4260.91,1.32,0.64
min,1.0,1.0,1.0,1.0
25%,399000.0,43.0,2.0,1.0
50%,502256.0,57.0,3.0,1.0
75%,721999.25,81.0,4.0,2.0
max,22000000.0,320000.0,6.0,4.0


Next we check for missing values that we will have to deal with.

In [490]:
missing = data.isnull().sum(axis=0)
missing.name = 'Missing'
missing = missing.to_frame()
missing = missing[missing['Missing'] > 0]
missing.sort_values('Missing', ascending=False)

Unnamed: 0,Missing
Parking,4738
District,3111
Bathrooms,3077
City,1430
Description,1168
Rooms,180
Area,92
Seller,66
Property,59
Amount,5


## Data cleaning

### Filtering

We assume that if we know the district, the `City` is `kraków`.

In [491]:
mask = (data['City'].isna() == True) & (data['District'].isna() == False)
data.loc[mask, 'City'] = 'kraków'

Next we filter the data according to these rules:

In [492]:
data = data[data['City'] == 'kraków']
data = data[data['Currency'] == 'pln']
data = data[data['Property'] == 'flat']
data = data[(data['Amount'] >= data['Amount'].quantile(0.05))]
data = data[(data['Amount'] <= data['Amount'].quantile(0.95))]
data = data[(data['Area'] >= data['Area'].quantile(0.05))]
data = data[(data['Area'] <= data['Area'].quantile(0.95))]
data = data[data['District'] != 'unknown']
data = data[data['District'].isna() == False]
data = data[data['Seller'].isna() == False]
data = data[data['Description'].isna() == False]

In [493]:
data = data.reset_index(drop=True)

### Impute missing values

We transform the missing values in `Parking` to `'none'`.

In [494]:
data.loc[:, 'Parking'] = data['Parking'].apply(lambda x: 'none' if pd.isnull(x) else x)
data.loc[:, 'Parking'] = data['Parking'].apply(lambda x: 'none' if x == 'unknown' else x)

The next step is to fill in missing values for numeric columns `Amount` `Area` `Rooms` and `Bathrooms`. We use the KNN algorithm to accomplish this.

In [495]:
numeric = list(data.select_dtypes('number').columns)

In [496]:
mask = (data['Bathrooms'].isna() == True | data['Rooms'].isna())
missing = data[numeric]

imputer = KNNImputer(n_neighbors=5)
imputer.fit(missing)

missing = imputer.transform(missing)
missing = pd.DataFrame(missing, columns=numeric)

for col in numeric:
    data[col] = missing[col]
    
for col in numeric:
    data[col] = data[col].apply(lambda x: round(x))    

### Check categorical data

We inspect categorical columns to assert that they contain "valid" values. Most of these columns were generated by a script during the scraping and etl phase of the project.

In [497]:
categorical = data.select_dtypes('object').columns
categorical = categorical.to_list()
omit = ['Title', 'Link', 'Description', 'Date']

for col in categorical:
    if col not in omit:
        unique = data[col].unique()
        print(f'{col}:')    
        for value in unique:
            print(f'\t{value}')        

City:
	kraków
District:
	dębniki
	prądnik biały
	krowodrza
	bieżanów
	mistrzejowice
	grzegórzki
	stare miasto
	bronowice
	zwierzyniec
	czyżyny
	podgórze duchackie
	prądnik czerwony
	łagiewniki
	podgórze
	prokocim
	nowa huta
	bieńczyce
	swoszowice
	borek fałęcki
	wzgórza krzesławickie
Currency:
	pln
Property:
	flat
Seller:
	realtor
	owner
Parking:
	street
	none
	garage
	covered


### Check binary features

We inspect the data to see if binary columns are properly populated and check for imbalances.

In [498]:
binary = data.select_dtypes(bool).columns.to_list()

In [499]:
for col in binary:
    tmp = data[[col, 'Date']]
    tmp = tmp.groupby(col, as_index=False)
    tmp = tmp.count()
    print(tmp)
    print('\n')

   Garden  Date
0   False  2318
1    True   489


   Balcony  Date
0    False  1504
1     True  1303


   Terrace  Date
0    False  2533
1     True   274


    Lift  Date
0  False  2480
1   True   327


     New  Date
0  False  1685
1   True  1122


   Estate  Date
0   False  2480
1    True   327


   Townhouse  Date
0      False  2566
1       True   241


   Apartment  Date
0      False  2395
1       True   412


    Land  Date
0  False  2801
1   True     6


   Studio  Date
0   False  2702
1    True   105




In [500]:
print(data.shape)

(2807, 24)


## Save data

In [501]:
data = data.drop(['Title', 
                  'Description', 
                  'Link', 
                  'Property', 
                  'City', 
                  'Currency', 
                  'Date'], axis=1)

In [502]:
data.head()

Unnamed: 0,District,Amount,Seller,Area,Rooms,Bathrooms,Parking,Garden,Balcony,Terrace,Lift,New,Estate,Townhouse,Apartment,Land,Studio
0,dębniki,990000,realtor,93,4,2,street,False,False,False,False,False,False,False,False,False,False
1,prądnik biały,401430,realtor,41,2,1,none,False,True,True,True,True,False,False,False,False,False
2,krowodrza,520000,realtor,55,3,1,street,False,False,False,False,False,False,False,False,False,False
3,bieżanów,287387,realtor,38,2,1,garage,False,True,False,False,False,False,False,False,False,False
4,mistrzejowice,532928,realtor,61,2,1,none,False,False,False,False,True,True,False,False,False,False


In [503]:
data.to_csv('../flats-data/00_data.csv', index=False)