In [1]:
def parse_params_entry(row, columns):

    """
    zmienna columns dostaje kolejne nazwy a zwracana jest lista z wartościami
    """

    entry = []
    unordered = {}
    for param in row.split("<br>"):
        
        key_value = param.split("<=>")

        # pomija parametry bez wartości
        no_value = len(key_value) == 1
        if (no_value):
            continue

        key = key_value[0]

        # niektóre zmienne kończą się na _types i mają wtedy wiele możliwości które się dodatkowo łączą
        # tutaj są zamieniane na zmienne binarne
        if (key.endswith("types")):
            
            if key_value[1].strip() == "" or key_value[1].strip() == "0":
                continue

            for value in key_value[1].split("<->"):

                real_key = key + "_" + value
                if real_key not in columns:
                    columns.append(real_key)
                unordered[real_key] = True
            continue
        
        # dodaje nazwę parametru do nagłówka
        value = key_value[1]
        if key not in columns:
            columns.append(key)

        unordered[key] = value

    for key in columns:
        entry.append(unordered[key] if key in unordered else None)    

    return entry

In [8]:
def read_data(path, param_name = "params", nchunks = None):

    """
    zbiera parametry w formacie <key><=>value<->value<->value<br>key><=>value<->... z DataFrame,
    >>> parse_params(df)
    id    price[currency]       m rooms_num     market  ... fence_types heating_types access_types vicinity_types is_bungalow
    325017             PLN   72.14         4  secondary  ...        None          None         None           None        None
    """

    import pandas

    param_cols = []
    csv_entries = []

    empty = pandas.read_csv(path, nrows=0).columns
    iterator = pandas.read_csv(path, chunksize=10000)
    for (k, chunk) in enumerate(iterator):

        if (nchunks is not None and k >= nchunks):
            break      
        for (i, row) in chunk.iterrows():
            
            entry = parse_params_entry(row[param_name], param_cols)
            csv_entries.append([*row.drop(param_name), *entry])
    
    from pandas import DataFrame

    return DataFrame(csv_entries, columns=[*empty.drop(param_name), *param_cols])

In [9]:
df = read_data('train.csv').drop(columns=['description', 'title'])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83


In [17]:
from pandas import get_dummies
df = get_dummies(df, columns=['category', 'market', 'floor_no', 'building_type', 'building_material'])
df.set_index('id', inplace=True)

In [32]:
df = get_dummies(df, columns=['building_material'])

In [18]:
def get_lon_lat_inplace(df, type = 'city', path = 'cities.csv'):
    
    import pandas
    
    cities = pandas.read_csv(path)[['id','lon','lat']]
    cities.rename(columns= {"id" : f'{type}_id', 'lon': f'{type}_lon', 'lat': f'{type}_lat'}, inplace=True)
    
    df = pandas.merge(df, cities, on = f'{type}_id', how='left')

get_lon_lat_inplace(df, 'city', 'cities.csv')
get_lon_lat_inplace(df, 'district', 'districts.csv')

In [34]:
df

Unnamed: 0_level_0,created_at_first,is_business,region_id,price,price[currency],m,rooms_num,building_floors_num,windows_type,heating,...,building_material_breezeblock,building_material_brick,building_material_cellular_concrete,building_material_concrete,building_material_concrete_plate,building_material_hydroton,building_material_other,building_material_reinforced_concrete,building_material_silikat,building_material_wood
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
325017,2018-10-06 12:20:48,1,14,389556,PLN,72.14,4,1,plastic,,...,0,0,0,0,0,0,0,0,0,0
513427,2019-02-18 12:17:51,1,15,500000,PLN,95,4,1,plastic,gas,...,0,1,0,0,0,0,0,0,0,0
824979,2019-03-21 17:31:09,1,10,238000,PLN,58.9,3,4,wooden,urban,...,0,0,0,0,1,0,0,0,0,0
400727,2018-11-28 23:50:45,1,15,209000,PLN,45,2,,plastic,,...,0,0,0,0,0,0,0,0,0,0
298324,2019-01-22 21:22:42,0,7,729000,PLN,195,6,,plastic,,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932054,2018-09-01 08:32:13,1,6,218948,PLN,39.45,2,3,plastic,,...,0,1,0,0,0,0,0,0,0,0
457415,2018-03-08 22:41:41,1,11,600000,PLN,48,3,10,,urban,...,0,0,0,0,0,0,0,0,0,0
468577,2018-09-14 17:00:05,1,12,259532,PLN,56.42,3,3,plastic,urban,...,1,0,0,0,0,0,0,0,0,0
429669,2018-08-30 14:02:53,1,16,363400,PLN,51.85,2,3,plastic,urban,...,0,1,0,0,0,0,0,0,0,0


In [35]:
df.to_csv('dataset.csv')

## Braki

### Wartości prawie nieobecne

In [None]:
def drop_unique_not_NA(df):

  initial_columns = df.shape[1]
  df.dropna(thresh=2, axis=1, inplace=True)
  dropped_columns = initial_columns - df.shape[1]
  print("Liczba wyrzuconych kolumn:", dropped_columns)

drop_unique_not_NA(df)

In [36]:
from plotly.express import histogram
histogram(df.filter(like='equipment'))

: 

In [None]:
from plotly.express import histogram
histogram(df.filter(like='media'))

In [None]:
from plotly.express import histogram
histogram(df.filter(like='security'))

In [None]:
from plotly.express import histogram
histogram(df.filter(like='extras'))

## Ogarnianie

In [41]:
def handle_heating(df):

  df['heating_gas'] =  ((df['heating'] == 'gas') | df['heating_types_gas']).astype(int)
  df.drop(columns=['heating_types_gas'], inplace=True)

  df['heating_urban'] =  ((df['heating'] == 'urban') | df['heating_types_urban']).astype(int)
  df.drop(columns=['heating_types_urban'], inplace=True)

  other_heating_values = ['other', 'electric', 'boiler_room', 'tiled_stove']
  other_heating_cols = [
    'heating_types_fireplace', 
    'heating_types_electric',
    'heating_types_coal', 
    'heating_types_oil', 
    'heating_types_heat_pump',
    'heating_types_stove',
    'heating_types_solar_collector',
    'heating_types_biomass',
    'heating_types_geothermal',
    'extras_types_heating'
  ]

  df['heating_other'] =  (df['heating'].isin(other_heating_values) | df[other_heating_cols].any(axis = 1)).astype(int)
  df.drop(columns=other_heating_cols, inplace=True)

handle_heating(df)
histogram(df,
  x = 'heating_other',
  color='heating_urban',
  pattern_shape='heating_gas',
)

KeyError: 'heating_types_gas'