# flats-in-cracow data wrangling

## Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from collections import Counter
from IPython.display import display
from sklearn.impute import KNNImputer
from pylab import rcParams
from pathlib import Path

## Setup

In [2]:
# Make sure the output folder for figures exists (no error if it already does)
Path("img").mkdir(parents=True, exist_ok=True)

# Default to small square figures
plt.rcParams['figure.figsize'] = (4, 4)

# Show floats with thousands separators and two decimal places
pd.set_option("display.float_format", "{:,.2f}".format)

## Goal

We scraped listings of properties for sale in Cracow. We would like to build a model that predicts flat prices.

## Data source

Data has been scraped from a website with listings. The data has undergone small transformations along the way. The goal of these transformations was to get the data into a usable state, not to check its validity.

## Data loading

In [3]:
# Relative path to the CSV file with the raw scraped listings
path = '../flats-data/raw_data.csv'

In [4]:
# Treat only '\n' as the record separator.
# NOTE(review): presumably needed because text fields contain other
# line-break characters - confirm against the scraper output.
data = pd.read_csv(path, lineterminator='\n')

In [5]:
# Overview of column dtypes and non-null counts in the raw data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40565 entries, 0 to 40564
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         40463 non-null  object 
 1   City         29279 non-null  object 
 2   District     22014 non-null  object 
 3   Amount       40426 non-null  float64
 4   Currency     40426 non-null  object 
 5   Property     40160 non-null  object 
 6   Seller       40345 non-null  object 
 7   Area         40223 non-null  float64
 8   Rooms        39742 non-null  float64
 9   Bathrooms    25830 non-null  float64
 10  Parking      16867 non-null  object 
 11  Garden       40565 non-null  bool   
 12  Balcony      40565 non-null  bool   
 13  Terrace      40565 non-null  bool   
 14  Floor        40565 non-null  bool   
 15  New          40565 non-null  bool   
 16  Estate       40565 non-null  bool   
 17  Townhouse    40565 non-null  bool   
 18  Apartment    40565 non-null  bool   
 19  Land

I assume that the `Title` column uniquely identifies a listing.

In [6]:
# One row per listing title; the first occurrence of each title is kept
data = data.drop_duplicates(subset='Title')

In [7]:
# Rows and columns left after de-duplication
print(data.shape)

(7689, 24)


## Data exploration

We check for missing values that we will have to deal with.

In [8]:
# Null count per column, restricted to columns that have any, worst first
missing = data.isnull().sum(axis=0).rename('Missing').to_frame()
missing = missing.loc[missing['Missing'] > 0]
missing.sort_values('Missing', ascending=False)

Unnamed: 0,Missing
Parking,4972
Bathrooms,3201
District,3140
City,1364
Description,1274
Rooms,192
Area,96
Seller,69
Property,65
Amount,6


### Check numeric columns

We see that we have 24 columns at our disposal. 
We inspect the numeric columns to see what we are dealing with. 
In the `Amount` column we note there is a property for sale that costs 1 PLN, clearly an erroneous value. 
Next we note the enormous maximum in the `Amount` column. That is quite a lot of money and could be considered a potential outlier.
The maximum and minimum of the `Area` column also indicate the existence of outliers. The maximum is clearly too large and the minimum too small to be real. The data will need to undergo a filtering process.

In [9]:
# Summary statistics for the numeric columns
data.describe()

Unnamed: 0,Amount,Area,Rooms,Bathrooms
count,7683.0,7593.0,7497.0,4488.0
mean,676854.52,151.12,2.9,1.32
std,703132.65,4159.87,1.32,0.64
min,1.0,1.0,1.0,1.0
25%,397000.0,43.0,2.0,1.0
50%,500000.0,56.0,3.0,1.0
75%,720000.0,80.0,4.0,2.0
max,22000000.0,320000.0,6.0,4.0


### Check binary columns

We inspect the data to see if binary columns are properly populated and check for imbalances.

In [10]:
binary = data.select_dtypes(bool).columns.to_list()

# For every boolean flag, show how many listings carry each value,
# most frequent value first.
for col in binary:
    tmp = (
        data[[col, 'Amount']]
        .fillna('NaN')  # so rows with a missing Amount are still counted
        .groupby(col, as_index=False)
        .count()
        .rename(columns={'Amount': 'Count'})
        .sort_values('Count', ascending=False)
        .reset_index(drop=True)
    )
    display(tmp)

Unnamed: 0,Garden,Count
0,False,6206
1,True,1483


Unnamed: 0,Balcony,Count
0,False,5043
1,True,2646


Unnamed: 0,Terrace,Count
0,False,6780
1,True,909


Unnamed: 0,Floor,Count
0,False,4682
1,True,3007


Unnamed: 0,New,Count
0,False,5153
1,True,2536


Unnamed: 0,Estate,Count
0,False,6563
1,True,1126


Unnamed: 0,Townhouse,Count
0,False,7000
1,True,689


Unnamed: 0,Apartment,Count
0,False,6588
1,True,1101


Unnamed: 0,Land,Count
0,False,5973
1,True,1716


Unnamed: 0,Studio,Count
0,False,7203
1,True,486


### Check categorical columns

We inspect categorical columns to assert that they contain "valid" values. Most of these columns were generated by a script during the scraping and ETL phase of the project.

In [11]:
categorical = data.select_dtypes('object').columns.to_list()
omit = ['Title', 'Link', 'Description', 'Date']

# Tabulate value frequencies for every object-typed column that is not
# listed in `omit`; missing values show up as their own 'NaN' group.
for col in categorical:
    if col in omit:
        continue
    tmp = (
        data[['Amount', col]]
        .fillna('NaN')
        .groupby(col, as_index=False)
        .count()
        .rename(columns={'Amount': 'Count'})
        .sort_values('Count', ascending=False)
        .reset_index(drop=True)
    )
    display(tmp)

Unnamed: 0,City,Count
0,kraków,6325
1,,1364


Unnamed: 0,District,Count
0,,3140
1,krowodrza,615
2,podgorze,539
3,stare miasto,517
4,nowa huta,371
5,debniki,326
6,bronowice,307
7,pradnik bialy,299
8,biezanow,234
9,pradnik czerwony,224


Unnamed: 0,Currency,Count
0,pln,7683
1,,6


Unnamed: 0,Property,Count
0,flat,6590
1,house,1034
2,,65


Unnamed: 0,Seller,Count
0,realtor,7065
1,owner,555
2,,69


Unnamed: 0,Parking,Count
0,,4972
1,street,1118
2,garage,1030
3,no parking,429
4,covered,140


### Check text columns

We search for keywords in the data.

In [12]:
# text = data[data['Description'].isna() == False].copy()
# text = text['Description'].to_list()
# text = ' '.join(text)
# text = text.split(' ')
# text = [x for x in text if x.isalpha()]
# text = [x for x in text if len(x) > 3]

In [13]:
# for i in range(5, len(text)-5):
#     if 'piętro' in text[i]:    
#         s = text[i-5:i+5]
#         s = ' '.join(s)
#         print(s)

## Data cleaning

We assume that if we know the district, the `City` is `kraków`.

In [14]:
# Rows with no city recorded but with a known district
mask = data['City'].isna() & data['District'].notna()
data.loc[mask, 'City'] = 'kraków'

We extract more `Parking` information from the property description.

In [15]:
def extract_parking(x):
    """Classify the parking situation mentioned in a listing description.

    Parameters
    ----------
    x : str
        Description text to search for parking keywords.

    Returns
    -------
    str
        'covered'    - a garage or parking is mentioned together with 'podziemny',
        'garage'     - a garage is mentioned without 'podziemny',
        'street'     - only 'parking' is mentioned, without 'podziemny',
        'no parking' - neither a garage nor a parking is mentioned.
    """
    mentions_garage = 'garaż' in x or 'garaz' in x
    mentions_parking = 'parking' in x

    # Without any parking keyword, 'podziemny' alone carries no information
    if not (mentions_garage or mentions_parking):
        return 'no parking'

    if 'podziemny' in x:
        return 'covered'

    # A garage mention takes precedence over a plain parking mention
    return 'garage' if mentions_garage else 'street'

In [16]:
# Only rows that still lack parking info but have a description to search
mask = data['Parking'].isna() & data['Description'].notna()

# Write the extracted label into `Parking` only. Targeting both `Parking` and
# `Description` would broadcast the label into `Description` as well and
# overwrite the original text.
data.loc[mask, 'Parking'] = data.loc[mask, 'Description'].apply(extract_parking)

In [17]:
# Whatever is still missing is treated as having no parking
mask = data['Parking'].isna()
data.loc[mask, 'Parking'] = 'no parking'

We confirm that we have dealt with all the `NaN`s in the `Parking` column.

In [18]:
# Number of rows still missing a `Parking` value
print(data['Parking'].isna().sum())

0


### Filtering

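Next we filter the data according to these rules:

- keep only listings located in `kraków`, priced in `pln`, with property type `flat`,
- drop listings priced below the 2.5% quantile of `Amount`, then those above the 97.5% quantile of what remains,
- drop listings smaller than the 1% quantile of `Area`, then those larger than the 99% quantile of what remains,
- drop listings with an unknown or missing `District`, a missing `Seller`, or a missing `Description`.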

In [19]:
# Restrict to PLN-priced flats located in Cracow
data = data.loc[data['City'].eq('kraków')]
data = data.loc[data['Currency'].eq('pln')]
data = data.loc[data['Property'].eq('flat')]

# Trim price and area outliers. Every quantile is computed on the frame as
# left by the lines above it, so the order of these four steps matters.
data = data.loc[data['Amount'].ge(data['Amount'].quantile(0.025))]
data = data.loc[data['Amount'].le(data['Amount'].quantile(0.975))]
data = data.loc[data['Area'].ge(data['Area'].quantile(0.01))]
data = data.loc[data['Area'].le(data['Area'].quantile(0.99))]

# Require a usable district, a seller and a description
data = data.loc[data['District'].ne('unknown')]
data = data.loc[data['District'].notna()]
data = data.loc[data['Seller'].notna()]
data = data.loc[data['Description'].notna()]

In [20]:
# Renumber the rows 0..n-1 so the index is contiguous again after filtering
data = data.reset_index(drop=True)

In [21]:
# Re-check dtypes and non-null counts after cleaning and filtering
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3464 entries, 0 to 3463
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         3464 non-null   object 
 1   City         3464 non-null   object 
 2   District     3464 non-null   object 
 3   Amount       3464 non-null   float64
 4   Currency     3464 non-null   object 
 5   Property     3464 non-null   object 
 6   Seller       3464 non-null   object 
 7   Area         3464 non-null   float64
 8   Rooms        3415 non-null   float64
 9   Bathrooms    1677 non-null   float64
 10  Parking      3464 non-null   object 
 11  Garden       3464 non-null   bool   
 12  Balcony      3464 non-null   bool   
 13  Terrace      3464 non-null   bool   
 14  Floor        3464 non-null   bool   
 15  New          3464 non-null   bool   
 16  Estate       3464 non-null   bool   
 17  Townhouse    3464 non-null   bool   
 18  Apartment    3464 non-null   bool   
 19  Land  

### Impute missing values

The next step is to fill in missing values in the numeric columns `Amount`, `Area`, `Rooms` and `Bathrooms`. After filtering, only `Rooms` and `Bathrooms` still contain gaps. We use the KNN algorithm to accomplish this.

In [22]:
# Names of all numeric columns, as a plain list
numeric = data.select_dtypes('number').columns.to_list()

In [23]:
# Fill the gaps in the numeric columns with a 5-nearest-neighbour imputer.
imputer = KNNImputer(n_neighbors=5)
imputed = pd.DataFrame(
    imputer.fit_transform(data[numeric]),
    columns=numeric,
    index=data.index,  # keep the row labels so the assignment below aligns
)

# Imputed values are averages over neighbours, so bring every numeric column
# back to whole numbers (observed values are rounded too).
for col in numeric:
    data[col] = imputed[col].round().astype('int64')

In [24]:
# Rows and columns after imputation
print(data.shape)

(3464, 24)


## Save data

Verify that there are no `NaN`s in data.

In [25]:
# Total number of null cells across the whole frame
data.isnull().sum().sum()

0

In [26]:
# `Property`, `City` and `Currency` were each filtered down to a single value
# earlier; the other columns listed here are not kept in the saved dataset.
data = data.drop(
    columns=[
        'Title',
        'Description',
        'Link',
        'Property',
        'City',
        'Currency',
        'Date',
    ]
)

In [27]:
# Preview the remaining columns and the first rows
data.head()

Unnamed: 0,District,Amount,Seller,Area,Rooms,Bathrooms,Parking,Garden,Balcony,Terrace,Floor,New,Estate,Townhouse,Apartment,Land,Studio
0,debniki,990000,realtor,93,4,2,street,False,False,False,False,False,False,False,False,False,False
1,pradnik bialy,401430,realtor,41,2,1,garage,False,True,True,False,True,False,False,False,True,False
2,krowodrza,439000,realtor,29,1,1,garage,False,True,False,True,False,True,False,False,False,True
3,krowodrza,520000,realtor,55,3,1,street,False,False,False,False,False,False,False,False,False,False
4,biezanow,287387,realtor,38,2,1,garage,False,True,False,False,False,False,False,False,False,False


In [28]:
# Write the cleaned data into the same folder as the raw file, without the index column
data.to_csv('../flats-data/cleaned_data.csv', index=False)