This file describes, step by step, the data transformations performed in the following files:
* pipeline_pre-processing.py
* pipeline_for_training_data.ipynb
* model-building.ipynb
* pipeline_for_production.ipynb

# Importing

In [None]:
import pandas as pd
import numpy as np

from collections import Counter

import geocoder
import re

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

import joblib

import os

In [2]:
# Do not suppress scientific notation when NumPy prints small floats.
np.set_printoptions(suppress=False)

# NOTE(review): absolute, machine-specific path. After the chdir every relative
# path used later in this file is resolved against this directory.
work_dir = r'C:\Users\User\Desktop\python-project-ApartmentPriceAnalysis'
os.chdir(work_dir)

In [65]:
# Load the raw ads; the first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 index.
data_initial = pd.read_csv('data_2024-01.csv', index_col=0).reset_index(drop = True)

# Data description
The data includes the following information:
1. **link** - link to the ad
2. **price** - the price in PLN given in the ad or "Zapytaj o cenę" ("Ask price") in case of no price given
3. **address** - the address given in the ad
4. **area** - apartment area in m²
5. **num_rooms** - number of rooms in the apartment
6. **floor** - the floor on which the apartment is located, usually given in the form of floor by the number of floors in the entire building, for example, 1/7
7. **rent** - monthly rent value in PLN
8. **ownership_status** - form of ownership of the apartment, in Poland there are full ownership, cooperative ownership right to premises, cooperative tenant right to premises and right to municipal premises
9. **flat_condition** - condition of the apartment (to be moved in/to be finished/to be renovated)
10. **perks** - information whether the apartment has a balcony, garden or terrace
11. **parking** - whether the apartment has parking space
12. **heating** - type of heating of the apartment (municipal/gas/electric/boiler room/tiled stoves/other)
13. **market** - primary or secondary market
14. **ad_type** - advertiser type (real estate office/developer/private)
15. **availability** - date from when the apartment is available
16. **year** - year of building
17. **devel_type** - building type (Apartment block/Condominium/Townhouse/Row house etc.)
18. **windows** - material of windows in the apartment (plastic/wooden/aluminum)
19. **lift** - whether the apartment building has an elevator
20. **mater** - apartment building material (brick/hollow block/silicate/large slab etc.)
21. **utilities** - whether the apartment has Internet, cable TV, telephone
22. **security** - whether the apartment has intercom, video intercom, territory monitoring etc.
23. **equipment** - whether the apartment has dishwasher, refrigerator, furniture, oven etc.
24. **add_inf** - additional information, for example, whether the apartment has air conditioning, basement, separate kitchen etc.

Missing data are marked with "Zapytaj o cenę" ("Ask price"), "Zapytaj" ("Ask") and "brak informacji" ("no information")

# Pre-processing data transformation
Used in file: **pipeline_pre-processing.py**<br>
That file defines the data pre-processing function.<br>
It is composed of the following functions:
1. **standardize_missing_values**
2. **clean_numeric_columns**
3. **categorize_rent**
4. **process_floor_data**
5. **fill_missing_categoricals**
6. **encode_parking_presence**
7. **convert_year_to_int**
8. **standardize_ownership_labels**
9. **multiple_choice_transform**
10. **location_transform**
11. **city_info_transform**

### standardize_missing_values

In [None]:
def standardize_missing_values(data):

    """
    Replaces custom placeholders for missing values with standard NaN values.

    Every cell that is exactly equal to one of the placeholder strings
    ('Zapytaj o cenę', 'Zapytaj', 'brak informacji') is replaced with
    `np.nan`, the standard missing value marker in pandas. The input
    DataFrame is not modified; `DataFrame.replace` returns a new object.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame to be cleaned.

    Returns:
    -------
    pd.DataFrame
        A new DataFrame with the placeholder values replaced by NaN.
    """
    missing_placeholders = [
        'Zapytaj o cenę',   # 'Ask price'
        'Zapytaj',          # 'Ask'
        'brak informacji',  # 'no information'
    ]

    # Use the canonical `np.nan`: the `np.NaN` alias was removed in NumPy 2.0.
    return data.replace(missing_placeholders, np.nan)


In [118]:
standardized_missing_values_data = standardize_missing_values(data_initial)

# Per-column number and share of missing values, most incomplete columns first.
missing_info = standardized_missing_values_data.isna().sum().to_frame(name='missing_count')
missing_info['missing_percent'] = round(
    100 * missing_info['missing_count'] / len(standardized_missing_values_data), 2
)
missing_info = missing_info.sort_values(by='missing_count', ascending=False)

print(missing_info)

                  missing_count  missing_percent
availability              38287            81.84
equipment                 32486            69.44
rent                      25514            54.54
utilities                 21380            45.70
parking                   20529            43.88
add_inf                   19672            42.05
windows                   16767            35.84
mater                     16710            35.72
security                  14992            32.05
perks                     10959            23.43
ownership_status           8158            17.44
flat_condition             7465            15.96
heating                    6028            12.89
price                      2109             4.51
floor                       751             1.61
year                         26             0.06
devel_type                    9             0.02
lift                          0             0.00
link                          0             0.00
ad_type             

### clean_numeric_columns

In [None]:
def clean_numeric_columns(data):

    """
    Parses the textual "price", "area" and "rent" columns into numeric values.

    For each of the three columns the values are cast to strings, every space
    and every character from the set a-z, A-Z, 'ł', 'Ł', '²' is removed
    (which strips units such as "zł" and "m²" and also turns the text 'nan'
    into an empty string), decimal commas are replaced by dots, and the
    result is converted with `pd.to_numeric`.

    The columns are overwritten on the DataFrame that was passed in, and
    that same object is returned.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame that contains the columns "price", "area", and "rent".

    Returns:
    -------
    pd.DataFrame
        The same DataFrame with "price", "area", and "rent" converted to numbers.
    """

    unit_and_space_pattern = '[ a-zA-ZłŁ²]*'

    for column in ("price", "area", "rent"):
        as_text = data[column].astype(str)
        digits_only = as_text.str.replace(unit_and_space_pattern, '', regex=True)
        dotted = digits_only.str.replace(',', '.', regex=False)
        data[column] = pd.to_numeric(dotted)

    return data

In [120]:
cleaned_numeric_columns_data = clean_numeric_columns(standardized_missing_values_data)

# Side-by-side preview: raw strings (suffixed "_before") next to the parsed numbers.
clean_numeric_columns_show = pd.concat(
    [
        data_initial[["price", "area", "rent"]].head().add_suffix("_before"),
        cleaned_numeric_columns_data[["price", "area", "rent"]].head(),
    ],
    axis=1,
)
print(clean_numeric_columns_show)

   price_before area_before rent_before      price   area    rent
0    415 000 zł     37,4 m²     Zapytaj   415000.0  37.40     NaN
1    880 000 zł     68,5 m²      750 zł   880000.0  68.50   750.0
2    590 000 zł       60 m²        1 zł   590000.0  60.00     1.0
3    699 000 zł       77 m²     Zapytaj   699000.0  77.00     NaN
4  1 378 000 zł    69,03 m²    1 120 zł  1378000.0  69.03  1120.0


### categorize_rent

In [None]:
def categorize_rent(data):

    """
    Replaces the numeric 'rent' column with an ordinal 'rent_cat' column.

    The rent is binned with `pd.cut` into five right-closed intervals,
    labelled 1 to 5:
        - (0, 500]      -> 1
        - (500, 1000]   -> 2
        - (1000, 1500]  -> 3
        - (1500, 2000]  -> 4
        - (2000, inf]   -> 5
    Values outside these intervals (missing rents, and rents that are not
    greater than 0) end up as NaN. The labels are stored as floats.

    'rent_cat' is added to the DataFrame that was passed in; the returned
    DataFrame is a new object without the 'rent' column.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame that contains a 'rent' column with numeric values.

    Returns:
    -------
    pd.DataFrame
        The DataFrame with 'rent' dropped and 'rent_cat' added.
    """

    bin_edges = [0, 500, 1000, 1500, 2000, np.inf]
    bin_labels = np.arange(1, 6, 1)

    rent_bins = pd.cut(data['rent'], bins=bin_edges, labels=bin_labels)
    data['rent_cat'] = rent_bins.astype("float")

    return data.drop(columns='rent')

In [123]:
categorized_rent_data = categorize_rent(cleaned_numeric_columns_data)

# Preview of the raw rent text next to its category, for rows where both exist.
rent_before_preview = data_initial["rent"].head(7).rename("rent_before")
rent_cat_preview = categorized_rent_data["rent_cat"].head(7)
categorize_rent_show = pd.concat([rent_before_preview, rent_cat_preview], axis=1)
print(categorize_rent_show.dropna())

  rent_before  rent_cat
1      750 zł       2.0
2        1 zł       1.0
4    1 120 zł       3.0
5      750 zł       2.0
6      700 zł       2.0


### process_floor_data

In [None]:
def process_floor_data(data):
    
    """
    Process floor information in the dataset by extracting and standardizing floor-related features.

    This function performs the following steps:
    1. Extracts the total number of floors in the building from the 'floor' column.
       - Assumes the format is 'apartment_floor/number_of_floors'.
       - If the format does not contain '/', sets the value to NaN.
    2. Extracts the apartment's floor number from the 'floor' column.
       - Converts special floor names ('parter', 'suterena') to '0'.
       - Replaces '> 10' with None (to handle inconsistent data).
    3. If the apartment's floor is labeled as 'poddasze' (attic), replaces it with the total number of floors.
    4. Converts the apartment floor and total floors columns to float type.
    5. Removes the original 'floor' column from the dataframe.

    Parameters:
    ----------
    data : pandas.DataFrame
        The input dataframe containing a 'floor' column with floor information.

    Returns:
    -------
    pandas.DataFrame
        The dataframe with two new columns:
        - 'number_floor_in_building': total floors in the building as float.
        - 'ap_floor': apartment floor number as float.
        The original 'floor' column is dropped.
    """
    
    # Extract apartment floor (left part before '/'), convert special names
    data['number_floor_in_building'] = data['floor'].apply(lambda x: str(x).split('/')[1] if str(x).__contains__('/') else np.NaN).astype('float')
    data['ap_floor'] = data['floor'].apply(lambda x: str(x).split('/')[0]).replace({'parter':'0',
                                                                                    'suterena':'0',
                                                                                    '> 10': None})
    # Replace 'poddasze' (attic) with total floors in building
    data['ap_floor'] = np.where(data['ap_floor'] == 'poddasze',
                                        data['number_floor_in_building'], 
                                        data['ap_floor'])
    # Convert apartment floor to float
    data['ap_floor'] = pd.to_numeric(data['ap_floor'])
    
    # Drop original 'floor' column
    data = data.drop('floor', axis=1)
    
    return data

In [145]:
processed_floor_data = process_floor_data(categorized_rent_data)

# Preview: the raw 'floor' text next to the two columns derived from it.
process_floor_data_show = pd.concat([data_initial[["floor"]].head(),
                                  processed_floor_data[["ap_floor", "number_floor_in_building"]].head()],
                                 axis=1)

print(process_floor_data_show)

      floor  ap_floor  number_floor_in_building
0       1/3       1.0                       3.0
1       4/7       4.0                       7.0
2  parter/1       0.0                       1.0
3    parter       0.0                       NaN
4       3/4       3.0                       4.0


### fill_missing_categoricals

In [138]:
def fill_missing_categoricals(data):
    """
    Fills missing values in selected categorical columns with 'nie podano'.

    The affected columns are 'ownership_status', 'flat_condition', 'heating',
    'windows', 'mater' and 'devel_type'. The DataFrame is modified in place
    and the same object is returned.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame containing the six columns listed above.

    Returns:
    -------
    pd.DataFrame
        The same DataFrame with the gaps in those columns filled.
    """
    categorical_columns = [
        'ownership_status',
        'flat_condition',
        'heating',
        'windows',
        'mater',
        'devel_type',
    ]
    filled = data[categorical_columns].fillna(value='nie podano')
    data[categorical_columns] = filled
    return data

In [146]:
fill_missing_categoricals_data = fill_missing_categoricals(processed_floor_data)

# Per-column number and share of missing values after the categorical fill,
# most incomplete columns first.
missing_info = fill_missing_categoricals_data.isna().sum().to_frame(name='missing_count')
missing_info['missing_percent'] = round(
    100 * missing_info['missing_count'] / len(fill_missing_categoricals_data), 2
)
missing_info = missing_info.sort_values(by='missing_count', ascending=False)

print(missing_info)

                          missing_count  missing_percent
availability                      38287            81.84
equipment                         32486            69.44
rent_cat                          25514            54.54
utilities                         21380            45.70
parking                           20529            43.88
add_inf                           19672            42.05
security                          14992            32.05
perks                             10959            23.43
number_floor_in_building           2238             4.78
price                              2109             4.51
ap_floor                           1235             2.64
year                                 26             0.06
ad_type                               0             0.00
market                                0             0.00
devel_type                            0             0.00
windows                               0             0.00
lift                           

### encode_parking_presence

In [143]:
def encode_parking_presence(data):
    """
    Adds a binary 'parking_coded' column derived from 'parking'.

    The new column is 0 where 'parking' is missing and 1 otherwise. The
    DataFrame is modified in place and the same object is returned; the
    original 'parking' column is kept.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame containing a 'parking' column.

    Returns:
    -------
    pd.DataFrame
        The same DataFrame with the additional 'parking_coded' column.
    """
    def _presence_flag(value):
        if pd.isna(value):
            return 0
        return 1

    data['parking_coded'] = data['parking'].apply(_presence_flag)
    return data

In [149]:
encoded_parking_presence_data = encode_parking_presence(fill_missing_categoricals_data)

# Preview: the raw 'parking' text from the initial data next to the derived flag.
encode_parking_presence_show = pd.concat([data_initial[["parking"]].head(6),
                                  encoded_parking_presence_data[["parking_coded"]].head(6)],
                                 axis=1)

print(encode_parking_presence_show)

                    parking  parking_coded
0  garaż/miejsce parkingowe              1
1  garaż/miejsce parkingowe              1
2  garaż/miejsce parkingowe              1
3  garaż/miejsce parkingowe              1
4                   Zapytaj              0
5                   Zapytaj              0


### convert_year_to_int

In [151]:
def convert_year_to_int(data):
    """
    Converts string values in the 'year' column to integers.

    Values that are strings are passed through `int()`; every other value
    (for example NaN) is left as it is. The DataFrame is modified in place
    and the same object is returned.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame containing a 'year' column.

    Returns:
    -------
    pd.DataFrame
        The same DataFrame with the textual years converted.

    Raises:
    ------
    ValueError
        If a string in 'year' cannot be parsed by `int()`.
    """
    def _parse_year(value):
        if not isinstance(value, str):
            return value
        return int(value)

    data['year'] = data['year'].apply(_parse_year)
    return data

In [156]:
converted_year_to_int_data = convert_year_to_int(encoded_parking_presence_data)
# Summary statistics of 'year' in the raw data versus after the conversion.
print("Descrition before:")
print(data_initial['year'].describe())
print()
print("Descrition after:")
print(converted_year_to_int_data['year'].describe())

Descrition before:
count     46782
unique      207
top        2023
freq       9691
Name: year, dtype: object

Descrition after:
count    46756.000000
mean      1998.217747
std         83.953663
min          1.000000
25%       1983.000000
50%       2020.000000
75%       2023.000000
max       2027.000000
Name: year, dtype: float64


### standardize_ownership_labels

In [158]:
def standardize_ownership_labels(data):
    """
    Merges the 'spółdzielcze własnościowe' label into its abbreviated variant.

    Every 'ownership_status' value equal to 'spółdzielcze własnościowe' is
    replaced with 'spółdzielcze wł. prawo do lokalu'; all other values are
    kept. The DataFrame is modified in place and the same object is returned.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame containing an 'ownership_status' column.

    Returns:
    -------
    pd.DataFrame
        The same DataFrame with the unified ownership labels.
    """
    ownership = data['ownership_status']
    is_long_variant = ownership == 'spółdzielcze własnościowe'
    data['ownership_status'] = ownership.mask(is_long_variant, 'spółdzielcze wł. prawo do lokalu')
    return data

In [169]:
standardized_ownership_labels_data = standardize_ownership_labels(converted_year_to_int_data)

# Frequency of each ownership label in the raw data versus after the clean-up.
print("Value counts before:")
print(data_initial[['ownership_status']].value_counts())
print()
print("Value counts  after:")
print(standardized_ownership_labels_data[['ownership_status']].value_counts())

Value counts before:
ownership_status                 
pełna własność                       36790
Zapytaj                               8158
spółdzielcze wł. prawo do lokalu      1576
udział                                 171
użytkowanie wieczyste / dzierżawa       86
spółdzielcze własnościowe                1
dtype: int64

Value counts  after:
ownership_status                 
pełna własność                       36790
nie podano                            8158
spółdzielcze wł. prawo do lokalu      1577
udział                                 171
użytkowanie wieczyste / dzierżawa       86
dtype: int64


### multiple_choice_transform

In [173]:
def items_of_var(var, data=None):
    """
    Lists the distinct items found in a comma-separated multiple-choice column.

    Each cell of `data[var]` is converted to a string, split on commas and
    stripped of surrounding spaces. The distinct items are returned in order
    of first appearance. The text 'nan' (what a missing cell turns into) is
    removed from the result.

    Parameters:
    ----------
    var : str
        Name of the multiple-choice column.
    data : pd.DataFrame, optional
        DataFrame to scan. When omitted, the module-level
        `standardized_ownership_labels_data` is looked up at call time, so
        the function always sees the current state of that variable instead
        of whatever it pointed to when the function was defined.

    Returns:
    -------
    list of str
        Distinct items of the column, in order of first appearance.
    """
    if data is None:
        data = standardized_ownership_labels_data

    # A dict keeps insertion order, giving an order-preserving de-duplication.
    seen = {}
    for cell in data[var]:
        for token in str(cell).split(','):
            seen.setdefault(token.strip(' '), None)

    seen.pop('nan', None)
    return list(seen)

# Distinct items of every multiple-choice column, taken from the default data set.
perklist, utilitylist, securitylist, equipmentlist, additionallist = [
    items_of_var(column)
    for column in ('perks', 'utilities', 'security', 'equipment', 'add_inf')
]

# Column name -> list of known items; used later to build the indicator columns.
var_values_dict = dict(
    utilities=utilitylist,
    security=securitylist,
    equipment=equipmentlist,
    add_inf=additionallist,
    perks=perklist,
)

for var, known_items in var_values_dict.items():
    print(f'{var}: {known_items}')

utilities: ['telewizja kablowa', 'internet', 'telefon']
security: ['drzwi / okna antywłamaniowe', 'teren zamknięty', 'domofon / wideofon', 'monitoring / ochrona', 'rolety antywłamaniowe', 'system alarmowy']
equipment: ['zmywarka', 'lodówka', 'meble', 'piekarnik', 'kuchenka', 'pralka', 'telewizor']
add_inf: ['pom. użytkowe', 'piwnica', 'dwupoziomowe', 'oddzielna kuchnia', 'klimatyzacja']
perks: ['balkon', 'taras', 'ogródek']


In [203]:
# Persist the item dictionary; multiple_choice_transform loads it from this same path.
var_values_dict_path = "1. Data Preparation/multiple_choice_var_dict.joblib"
joblib.dump(var_values_dict, var_values_dict_path)

['1. Data Preparation/multiple_choice_var_dict.joblib']

In [174]:
def splitcolumn(serieslike, colname, items, missing_categories):
    """
    Turns one comma-separated multiple-choice cell into 0/1 indicators.

    Parameters:
    ----------
    serieslike : pd.Series or mapping
        A single row; only `serieslike[colname]` is read.
    colname : str
        Name of the multiple-choice column.
    items : list of str
        Known items; the result has one indicator per item, in this order.
    missing_categories : list
        Collector that is extended in place with every item found in the
        cell that is not present in `items`.

    Returns:
    -------
    pd.Series
        One integer per entry of `items`: 1 if the item occurs in the cell,
        0 otherwise. A missing cell yields all zeros. Tokens of length 0 or 1
        are ignored.
    """
    cell = serieslike[colname]

    if pd.isna(cell):
        return pd.Series([0 for _ in items])

    stripped = (piece.strip(' ') for piece in cell.split(','))
    tokens = [token for token in stripped if len(token) > 1]

    unknown = [token for token in tokens if token not in items]
    if unknown:
        missing_categories.extend(unknown)

    return pd.Series([1 if item in tokens else 0 for item in items])

In [204]:
def multiple_choice_transform(data, train_dataset):
    """
    Expands the multiple-choice text columns into 0/1 indicator columns.

    The dictionary of known items per column is loaded from
    "1. Data Preparation/multiple_choice_var_dict.joblib". For every column in
    that dictionary one indicator column named '<column>_<item>' is created per
    known item (via `splitcolumn`), after which the original text columns are
    dropped.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame containing every column listed in the dictionary.
    train_dataset : bool
        When truthy, a report is printed: either the items that were found in
        the data but are absent from the dictionary (with their counts), or a
        confirmation that everything was coded. When falsy nothing is printed.

    Returns:
    -------
    pd.DataFrame
        The DataFrame with indicator columns in place of the text columns.
    """

    var_values_dict = joblib.load("1. Data Preparation/multiple_choice_var_dict.joblib")

    # Filled by splitcolumn with every item it does not find in the dictionary.
    unseen_items = []

    for source_column, known_items in var_values_dict.items():
        indicator_columns = [f"{source_column}_{item}" for item in known_items]
        data[indicator_columns] = data.apply(
            splitcolumn,
            args=(source_column, known_items, unseen_items),
            axis=1,
        )

    data = data.drop(columns=list(var_values_dict))

    if not train_dataset:
        return data

    if unseen_items:
        print("There are new categories that are not in the dictionary:")
        for category, count in Counter(unseen_items).items():
            print(f"  - {category}: {count} razy")
    else:
        print("All the categorizations occurring in the set in multi-vector selection variables were coded.")

    return data

In [190]:
# multiple_choice_transform(data, train_dataset) loads the item dictionary from
# disk itself. Passing var_values_dict as a second positional argument would
# collide with train_dataset and raise a TypeError.
multiple_choice_transformed_data = multiple_choice_transform(standardized_ownership_labels_data, train_dataset = True)

# One raw multiple-choice column and one derived indicator per variable, for the preview.
vars_before = ['utilities', 'security', 'equipment', 'add_inf', 'perks']
vars_after = ['utilities_telewizja kablowa', 'security_domofon / wideofon',
              'equipment_zmywarka', 'add_inf_piwnica', 'perks_balkon']

multiple_choice_transform_show = pd.concat([data_initial[vars_before].head(6),
                                  multiple_choice_transformed_data[vars_after].head(6)],
                                 axis=1)

# Interleave each raw column with its indicator so they print next to each other.
print(multiple_choice_transform_show[['utilities', 'utilities_telewizja kablowa',
                                     'security', 'security_domofon / wideofon',
                                     'equipment', 'equipment_zmywarka',
                                     'add_inf', 'add_inf_piwnica',
                                     'perks', 'perks_balkon']])

All the categorizations occurring in the set in multi-vector selection variables were coded.
                              utilities  utilities_telewizja kablowa  \
0  telewizja kablowa, internet, telefon                            1   
1           telewizja kablowa, internet                            1   
2           telewizja kablowa, internet                            1   
3                              internet                            0   
4  telewizja kablowa, internet, telefon                            1   
5  telewizja kablowa, internet, telefon                            1   

                                            security  \
0  drzwi / okna antywłamaniowe, teren zamknięty, ...   
1  drzwi / okna antywłamaniowe, teren zamknięty, ...   
2                teren zamknięty, domofon / wideofon   
3  drzwi / okna antywłamaniowe, teren zamknięty, ...   
4  drzwi / okna antywłamaniowe, domofon / wideofo...   
5                                 domofon / wideofon   

   securi

### location_transform

In [184]:
def address_transform(addressline, cities_dict):
    """
    Splits a comma-separated address into region, locality and street/district.

    The last component is taken as the region. The locality is looked for in
    the two components directly preceding the region (the nearer one first)
    and is accepted only if it appears in `cities_dict[region]`. The first
    component is returned as the street/district, with a leading 'ul. ',
    'al. ' or 'pl. ' removed, unless it is the locality itself.

    Short addresses and unknown regions do not raise: an address with fewer
    than three components simply has fewer locality candidates, and a region
    missing from `cities_dict` yields a missing locality.

    Parameters:
    ----------
    addressline : str or NaN
        The address text, components separated by commas.
    cities_dict : dict
        Maps a region name to a collection of locality names in that region.

    Returns:
    -------
    pd.Series
        Three values: region, locality, street/district. Parts that could not
        be determined are NaN; a missing address yields three NaN values.
    """
    if pd.isna(addressline):
        return pd.Series([np.nan, np.nan, np.nan])

    # Split once instead of re-splitting the address for every component.
    parts = [part.strip(' ') for part in addressline.split(',')]
    region = parts[-1]
    localities_in_region = cities_dict.get(region, ())

    # Candidates are the (up to) two components just before the region,
    # checked from the one nearest to the region outwards.
    location = np.nan
    for candidate in reversed(parts[:-1][-2:]):
        if candidate in localities_in_region:
            location = candidate
            break

    first_part = parts[0]
    if len(parts) == 1 or first_part == location:
        # The only component is the region, or the address starts with the
        # locality: there is no street/district information.
        street = np.nan
    else:
        street = first_part
        for prefix in ('ul. ', 'al. ', 'pl. '):
            street = street.removeprefix(prefix)
        street = street.strip(' ')

    return pd.Series([region, location, street])

In [185]:
def location_transform(data):
    """
    Adds 'region', 'location' and 'street/district' columns parsed from 'address'.

    A region -> localities lookup is built from
    '1. Data Preparation/locations_and_regions.csv' (rows whose 'Rodzaj' is one
    of 'wieś', 'miasto', 'osada', 'kolonia', 'osada leśna'), and every address
    is then parsed with `address_transform`. The DataFrame is modified in
    place and the same object is returned.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame containing an 'address' column.

    Returns:
    -------
    pd.DataFrame
        The same DataFrame with the three new columns.
    """

    locations = pd.read_csv('1. Data Preparation/locations_and_regions.csv')

    # Turn the table of Polish localities into a region -> localities lookup
    # that is used to recognise the locality inside an address.
    locality_kinds = ['wieś', 'miasto', 'osada', 'kolonia', 'osada leśna']
    locations = locations[locations['Rodzaj'].isin(locality_kinds)]

    # Sets give constant-time membership tests; address_transform only uses
    # `in` on these collections and is called once per row.
    # groupby is called without `axis`: grouping rows is the default and the
    # keyword is deprecated in recent pandas versions.
    cities_dict = {
        region_name: set(group['Nazwa miejscowości'])
        for region_name, group in locations.groupby('Województwo')
    }

    # 'Stargard' is added by hand to the 'zachodniopomorskie' localities.
    cities_dict.setdefault('zachodniopomorskie', set()).add('Stargard')

    data[['region', 'location', 'street/district']] = data['address'].apply(
        address_transform, args=(cities_dict,)
    )

    return data

In [191]:
location_transformed_data = location_transform(multiple_choice_transformed_data)

# The source column and the three columns derived from it.
location_transform_var = ['address', 'region', 'location', 'street/district']

# Non-null counts show how many addresses yielded a locality / street.
print(location_transformed_data[location_transform_var].info())
location_transformed_data[location_transform_var].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46782 entries, 0 to 46781
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   address          46782 non-null  object
 1   region           46782 non-null  object
 2   location         46726 non-null  object
 3   street/district  41415 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB
None


Unnamed: 0,address,region,location,street/district
0,"ul. Henryka Strobanda, Wrzosy, Toruń, kujawsko...",kujawsko-pomorskie,Toruń,Henryka Strobanda
1,"łąkowa 27 B, Stare Polesie, Polesie, Łódź, łód...",łódzkie,Łódź,łąkowa 27 B
2,"ul. Błękitna, Marki, wołomiński, mazowieckie",mazowieckie,Marki,Błękitna
3,"Szyce, Wielka Wieś, krakowski, małopolskie",małopolskie,Wielka Wieś,Szyce
4,"ul. Rakowiecka 43A, Stary Mokotów, Mokotów, Wa...",mazowieckie,Warszawa,Rakowiecka 43A


### city_info_transform

In [199]:
def city_info_transform(data):
    """
    Enriches each ad with information about the locality it is located in.

    Locality data are read from "1. Data Preparation/locations_info.xlsx" and
    left-joined on ('location', 'region') = ('Miasto', 'Województwo'). Three
    columns are added:
    - 'with_powiat_rights': value of 'na_prawach_powiatu', 0 when unmatched.
    - 'pop_numb_cat': population ('Liczba ludności') binned into categories
      1-8 with the edges 0, 10k, 20k, 50k, 100k, 250k, 500k, 1M, 2M.
    - 'pop_dens_cat': population density ('Gęstość zaludnienia') binned into
      categories 0-7 in steps of 500 up to 4000.
    Rows without a match, and values outside the bins, get category 0.

    NOTE(review): category 0 of 'pop_dens_cat' therefore means both "density
    in (0, 500]" and "unknown" - confirm that this overlap is intended.

    Only the columns from 'link' to 'street/district' (by position in the
    input) are kept from `data`; the raw locality columns are dropped.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame containing 'location' and 'region' columns.

    Returns:
    -------
    pd.DataFrame
        A new, merged DataFrame with the three locality columns.
    """

    locations_data = pd.read_excel("1. Data Preparation/locations_info.xlsx")

    locations_data['pop_numb_cat'] = pd.cut(locations_data['Liczba ludności'],
                                            bins=[0, 10000, 20000, 50000, 100000,
                                                  250000, 500000, 1000000, 2000000],
                                            labels=np.arange(1, 9, 1))

    locations_data['pop_dens_cat'] = pd.cut(locations_data['Gęstość zaludnienia'],
                                            bins=range(0, 4001, 500),
                                            labels=np.arange(0, 8, 1))

    data_merged = data.loc[:, 'link':'street/district'].merge(locations_data,
                                                             left_on=['location', 'region'],
                                                             right_on=['Miasto', 'Województwo'],
                                                             how='left')

    data_merged['with_powiat_rights'] = data_merged['na_prawach_powiatu'].fillna(0)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained form does not update the frame when
    # pandas Copy-on-Write is active.
    for category_column in ('pop_numb_cat', 'pop_dens_cat'):
        data_merged[category_column] = pd.to_numeric(data_merged[category_column]).fillna(0)

    data_merged = data_merged.drop(columns=['Miasto', 'Powiat', 'Województwo', 'Powierzchnia',
                                            'Liczba ludności', 'Gęstość zaludnienia',
                                            'na_prawach_powiatu'])

    return data_merged

In [192]:
city_info_transformed_data = city_info_transform(location_transformed_data)
# Address-derived columns together with the three locality features added by the merge.
location_transform_var = ['address', 'region','location','street/district',
                          'with_powiat_rights', 'pop_numb_cat',
                          'pop_dens_cat']
city_info_transformed_data[location_transform_var].head()

Unnamed: 0,address,region,location,street/district,with_powiat_rights,pop_numb_cat,pop_dens_cat
0,"ul. Henryka Strobanda, Wrzosy, Toruń, kujawsko...",kujawsko-pomorskie,Toruń,Henryka Strobanda,1.0,5.0,3.0
1,"łąkowa 27 B, Stare Polesie, Polesie, Łódź, łód...",łódzkie,Łódź,łąkowa 27 B,1.0,7.0,4.0
2,"ul. Błękitna, Marki, wołomiński, mazowieckie",mazowieckie,Marki,Błękitna,0.0,3.0,2.0
3,"Szyce, Wielka Wieś, krakowski, małopolskie",małopolskie,Wielka Wieś,Szyce,0.0,0.0,0.0
4,"ul. Rakowiecka 43A, Stary Mokotów, Mokotów, Wa...",Warszawa,Warszawa,Rakowiecka 43A,1.0,8.0,6.0


### preliminary_transform

In [205]:
def preliminary_transform (data, train_dataset):
    """
    Runs the complete pre-processing chain on raw ad data.

    The steps are applied in this order: standardize_missing_values,
    clean_numeric_columns, categorize_rent, process_floor_data,
    fill_missing_categoricals, encode_parking_presence, convert_year_to_int,
    standardize_ownership_labels, multiple_choice_transform,
    location_transform, city_info_transform.

    Parameters:
    ----------
    data : pd.DataFrame
        Raw ad data.
    train_dataset : bool
        Passed on to multiple_choice_transform.

    Returns:
    -------
    pd.DataFrame
        The output of the last step, city_info_transform.
    """
    # Steps that take only the DataFrame, in execution order.
    single_argument_steps = (
        standardize_missing_values,
        clean_numeric_columns,
        categorize_rent,
        process_floor_data,
        fill_missing_categoricals,
        encode_parking_presence,
        convert_year_to_int,
        standardize_ownership_labels,
    )

    frame = data
    for step in single_argument_steps:
        frame = step(frame)

    frame = multiple_choice_transform(frame, train_dataset)
    frame = location_transform(frame)
    return city_info_transform(frame)

In [207]:
# Run the whole chain on the raw data; True is passed as train_dataset.
preliminary_transformed_data = preliminary_transform(data_initial, True)

All the categorizations occurring in the set in multi-vector selection variables were coded.


In [209]:
# Column names, non-null counts and dtypes of the pre-processed data.
preliminary_transformed_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46782 entries, 0 to 46781
Data columns (total 51 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   link                                  46782 non-null  object 
 1   price                                 44673 non-null  float64
 2   address                               46782 non-null  object 
 3   area                                  46782 non-null  float64
 4   num_rooms                             46782 non-null  int64  
 5   ownership_status                      46782 non-null  object 
 6   flat_condition                        46782 non-null  object 
 7   parking                               26253 non-null  object 
 8   heating                               46782 non-null  object 
 9   market                                46782 non-null  object 
 10  ad_type                               46782 non-null  object 
 11  availability   

# Missing and outlier observations
Analysis of missing and outlier observations was conducted in the file: **missvalue_outliers_analysis** <br>
The file shows the distributions of each variable, establishes the variables taken into the model, and sets limits beyond which an observation will be considered an outlier. <br><br>

In summary, the model will be trained on data for apartments that meet the following criteria:<br>
1. the price is in the range of 100 thousand PLN to 1 million PLN
2. the area is in the range of 20 m² to 150 m²
3. the building has no more than 20 floors
4. the building was built no earlier than 1900

The following variables were **excluded** from further analysis:
1. link
2. availability
3. street/district

In addition, in some categorical variables, categories that occurred in less than 5% of the observations were merged into a single “other” ("inny" in Polish) category. These were the variables:
1. **ownership_status** - 'spółdzielcze wł. prawo do lokalu', 'udział', 'użytkowanie wieczyste / dzierżawa'
2. **heating** - 'kotłownia', 'elektryczne', 'piece kaflowe'
3. **devel_type** - 'plomba', 'loft', 'dom wolnostojący', 'szeregowiec'
4. **windows** - 'drewniane', 'aluminiowe'
5. **mater** - 'drewno', 'keramzyt', 'beton', 'beton komórkowy', 'żelbet'

In [None]:
# Thresholds and rare-category list for outlier handling. The keys correspond
# to the parameters of outlier_drop (maxfloor, minprice, maxprice, minarea,
# maxarea, minyear, categories_to_replace).
outlier_values = {"max_floor": 20,
                  "min_price": 100000,
                  "max_price": 1000000,
                  "min_area": 20,
                  "max_area": 150,
                  "min_year": 1900,
                  # One row per variable: ownership_status, heating, devel_type,
                  # windows, mater; the last row lists the existing "other" labels.
                  "categories_to_replace": ['spółdzielcze wł. prawo do lokalu', 'udział', 'użytkowanie wieczyste / dzierżawa',
                                            'kotłownia', 'elektryczne', 'piece kaflowe',
                                            'plomba', 'loft', 'dom wolnostojący', 'szeregowiec',
                                            'drewniane', 'aluminiowe',
                                            'drewno', 'keramzyt', 'beton', 'beton komórkowy', 'żelbet',
                                            'inne', 'inny']}

In [None]:
def outlier_drop(data, train_dataset,
                 maxfloor,
                 minprice, maxprice,
                 minarea, maxarea,
                 minyear, categories_to_replace):
    """
    Merges rare categories and handles observations with extreme values.

    First, every value listed in `categories_to_replace` is replaced with
    'other' (in all columns). A row is then considered regular when
    - 'price' lies in [minprice, maxprice],
    - 'area' lies in [minarea, maxarea],
    - 'number_floor_in_building' is missing or not greater than `maxfloor`,
    - 'ap_floor' is missing or not greater than `maxfloor`.

    For a training set only the regular rows are returned, and a 'year'
    smaller than `minyear` is set to NaN. For any other set no row is removed
    and 'year' is left unchanged; a warning is printed when the data contain
    irregular rows other than those with a missing price.

    The input DataFrame is not modified.

    Parameters:
    ----------
    data : pd.DataFrame
        Pre-processed data with the columns 'price', 'area',
        'number_floor_in_building', 'ap_floor' and 'year'.
    train_dataset : bool
        Whether `data` is a training set (see above).
    maxfloor : number
        Highest accepted floor / number of floors.
    minprice, maxprice : number
        Accepted price range (inclusive).
    minarea, maxarea : number
        Accepted area range (inclusive).
    minyear : number
        Earliest accepted year of building.
    categories_to_replace : list
        Values that are replaced with 'other'.

    Returns:
    -------
    pd.DataFrame
        The filtered copy for a training set, otherwise all rows with only
        the category replacement applied.
    """

    data = data.replace(categories_to_replace, 'other')

    # One boolean mask per rule; a NaN price or area fails `between`, so such
    # rows are dropped, exactly as with the separate <= / >= comparisons.
    price_ok = data['price'].between(minprice, maxprice)
    area_ok = data['area'].between(minarea, maxarea)
    building_ok = (data['number_floor_in_building'] <= maxfloor) | data['number_floor_in_building'].isna()
    floor_ok = (data['ap_floor'] <= maxfloor) | data['ap_floor'].isna()
    regular = price_ok & area_ok & building_ok & floor_ok

    if train_dataset:
        # Explicit copy, so that the 'year' assignment below writes to an
        # independent frame rather than to a slice of `data`.
        data_clean = data[regular].copy()
        year = data_clean['year']
        data_clean['year'] = year.where(year >= minyear)
        return data_clean
    else:
        # Rows without a price are expected here and are not counted as outliers.
        if len(data) > (int(regular.sum()) + sum(data['price'].isna())):
            print("""
                  W zbiorze wystąpiły obserwację o skrajnych wartościach
                  ze względu na cenę, powierzchnię, piętro lub rok budynku.
                  Może zmniejszeć dokładność prognozy
                  """)
        return data