In [66]:
%matplotlib inline
%load_ext ipycache

import pandas as pd
import numpy as np
import scipy
import sklearn as sk
import xgboost as xgb

from eli5 import show_weights

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt

The ipycache extension is already loaded. To reload it, use:
  %reload_ext ipycache


In [67]:
# Read the three input tables from the local data/ directory.
# The reads are independent of each other, so their order does not matter.
macro = pd.read_csv("data/macro.csv")
test = pd.read_csv("data/test.csv")
train_raw = pd.read_csv("data/train.csv")
# Alternative training file, kept here for quick switching:
# train_raw = pd.read_csv("data/train_without_noise.csv")

# Preview the first rows of the raw training table.
train_raw.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [14]:
def preprocess_anomaly(df):
    """Replace implausibly small area values with NaN.

    A value is kept only when it is strictly greater than its column's
    threshold: ``full_sq`` > 10, ``life_sq`` > 5, ``kitch_sq`` > 2.
    Everything else (including values that are already NaN) becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``full_sq``, ``life_sq`` and ``kitch_sq`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three area columns cleaned; the input frame
        is left unmodified.
    """
    # Work on a copy so the caller's frame is not silently altered.
    cleaned = df.copy()

    # (column, exclusive lower bound) pairs.
    thresholds = (("full_sq", 10), ("life_sq", 5), ("kitch_sq", 2))
    for col, lower_bound in thresholds:
        # Series.where keeps values where the condition holds and writes NaN
        # elsewhere. Unlike the builtin map(), which is lazy on Python 3, this
        # yields a proper Series on both Python 2 and Python 3.
        cleaned[col] = cleaned[col].where(cleaned[col] > lower_bound)

    # TODO: not handled yet - inconsistent rows where
    #     full_sq-life_sq<0 full_sq-kitch_sq<0 life_sq-kitch_sq<0 floor-max_floor<0
    return cleaned
# Clean a copy of the raw frame so that `train_raw` keeps the values that were
# loaded from disk and this cell can be re-run without compounding changes.
train = preprocess_anomaly(train_raw.copy())

# Recode `material` and `state`: missing -> 0, then integer, then string, so
# both columns end up with object dtype instead of float.
for code_col in ("material", "state"):
    train[code_col] = train[code_col].fillna(0).astype(int).astype(str)

In [17]:
# Number of distinct non-null values per column, as (column, count) pairs in
# the frame's column order.
cnts = [(col, train[col].nunique()) for col in train.columns]

# The 50 columns with the fewest distinct values; the sort is stable, so ties
# keep the original column order.
sorted(cnts, key=lambda pair: pair[1])[:50]

[('product_type', 2),
 ('culture_objects_top_25', 2),
 ('thermal_power_plant_raion', 2),
 ('incineration_raion', 2),
 ('oil_chemistry_raion', 2),
 ('radiation_raion', 2),
 ('railroad_terminal_raion', 2),
 ('big_market_raion', 2),
 ('nuclear_reactor_raion', 2),
 ('detention_facility_raion', 2),
 ('water_1line', 2),
 ('big_road1_1line', 2),
 ('railroad_1line', 2),
 ('mosque_count_500', 2),
 ('mosque_count_1000', 2),
 ('mosque_count_1500', 2),
 ('mosque_count_2000', 2),
 ('school_education_centers_top_20_raion', 3),
 ('mosque_count_3000', 3),
 ('mosque_count_5000', 3),
 ('university_top_20_raion', 4),
 ('build_count_foam', 4),
 ('cafe_count_500_price_high', 4),
 ('state', 5),
 ('ecology', 5),
 ('market_count_500', 5),
 ('culture_objects_top_25_raion', 6),
 ('material', 7),
 ('healthcare_centers_raion', 7),
 ('market_count_1000', 7),
 ('ID_railroad_terminal', 8),
 ('cafe_count_1000_price_high', 8),
 ('market_count_1500', 8),
 ('build_count_mix', 9),
 ('trc_count_500', 9),
 ('market_count_2

In [24]:
# Frequency of each distinct value of `build_count_mix`, most common first.
train["build_count_mix"].value_counts()

0.0    20422
1.0     2066
2.0      999
5.0      872
4.0      518
9.0      237
3.0      173
8.0      149
6.0       44
Name: build_count_mix, dtype: int64

In [22]:
# Number of rows in the training frame.
len(train)

30471

In [62]:
from sklearn.utils import shuffle

# Fixed seed so the fold assignment is identical after a kernel restart.
FOLD_SEED = 42
N_FOLDS = 3

shuffled_train = shuffle(train, random_state=FOLD_SEED)

# Split row positions rather than the frame itself: calling np.array_split
# directly on a DataFrame triggers a deprecation warning in recent pandas.
# The result is still a list of N_FOLDS DataFrames of near-equal size.
folds = [
    shuffled_train.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(shuffled_train)), N_FOLDS)
]

def smoothed_likelihood(targ_mean, nrows, globalmean, alpha=10):
    """Blend a per-category target mean with the global mean.

    Computes ``(targ_mean * nrows + globalmean * alpha) / (nrows + alpha)``:
    the category mean weighted by its row count, pulled towards the global
    mean with a weight of ``alpha`` pseudo-observations.

    Parameters
    ----------
    targ_mean : float or None
        Mean of the target within the category; ``None`` when the category
        has no statistics.
    nrows : int or None
        Number of rows the category mean was computed from; ``None`` when
        the category has no statistics.
    globalmean : float
        Mean of the target over all rows.
    alpha : number, default 10
        Smoothing strength (weight given to ``globalmean``).

    Returns
    -------
    float
        The smoothed value, or NaN when the statistics are missing or the
        expression cannot be evaluated (non-numeric input, zero denominator).
    """
    # Missing statistics are the expected "unknown category" case; handle it
    # explicitly instead of relying on an exception.
    if targ_mean is None or nrows is None:
        return float("NaN")
    try:
        return (targ_mean * nrows + globalmean * alpha) / (nrows + alpha)
    except (TypeError, ArithmeticError):
        # Only the anticipated failures are mapped to NaN (bad operand types,
        # division by zero, overflow); anything else is a real bug and
        # should surface.
        return float("NaN")
    

def mess_y_categorial(fold_raw, other_fold, cols=None, y_col="price_doc", alpha=10):
    """Add smoothed target-mean features for categorical columns.

    For every column ``c`` in ``cols`` a new column ``c + "_sll"`` is added to
    a copy of ``fold_raw``. Its value for a row is
    ``(mean * n + globalmean * alpha) / (n + alpha)``, where ``mean`` and ``n``
    are the mean of ``y_col`` and the row count for that row's category, both
    measured on ``other_fold`` only, and ``globalmean`` is the mean of
    ``y_col`` over ``other_fold``. Categories that do not occur in
    ``other_fold`` get NaN.

    Parameters
    ----------
    fold_raw : pandas.DataFrame
        Rows to encode; not modified.
    other_fold : pandas.DataFrame
        Rows the category statistics are computed from.
    cols : list of str, optional
        Columns to encode. When empty or ``None``, every object-dtype column
        of ``fold_raw`` is used.
    y_col : str, default "price_doc"
        Name of the target column in ``other_fold``.
    alpha : number, default 10
        Smoothing strength (weight of the global mean).

    Returns
    -------
    pandas.DataFrame
        Copy of ``fold_raw`` with one extra ``*_sll`` column per encoded column.
    """
    fold = fold_raw.copy()
    if not cols:
        cols = list(fold.select_dtypes(include=["object"]).columns)
    globalmean = other_fold[y_col].mean()
    for c in cols:
        # Progress output; the parenthesised form prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print(c)

        target_mean = other_fold[[c, y_col]].groupby(c).mean().to_dict()[y_col]
        nrows = other_fold[c].value_counts().to_dict()

        # Series.map yields NaN for categories missing from the dict, and NaN
        # propagates through the arithmetic, so unseen categories end up NaN.
        cat_mean = fold[c].map(target_mean)
        cat_rows = fold[c].map(nrows)
        fold[c + "_sll"] = (cat_mean * cat_rows + globalmean * alpha) / (cat_rows + alpha)
    return fold

# Encode the first fold with statistics taken from the remaining folds, then
# show the district column next to its smoothed-likelihood encoding.
held_out_rows = folds[0]
fitting_rows = pd.concat(folds[1:])
mess_y_categorial(held_out_rows, fitting_rows)[["sub_area", "sub_area_sll"]]

timestamp
material
product_type
sub_area
culture_objects_top_25
thermal_power_plant_raion
incineration_raion
oil_chemistry_raion
radiation_raion
railroad_terminal_raion
big_market_raion
nuclear_reactor_raion
detention_facility_raion
water_1line
big_road1_1line
railroad_1line
ecology


Unnamed: 0,sub_area,sub_area_sll
12280,Poselenie Krasnopahorskoe,4.653535e+06
11161,Sokolinaja Gora,7.391695e+06
12415,Golovinskoe,6.294542e+06
26337,Poselenie Desjonovskoe,4.458138e+06
19729,Juzhnoe Butovo,6.730091e+06
14234,Severnoe Butovo,7.753506e+06
9771,Vostochnoe Izmajlovo,5.770759e+06
16199,Nekrasovka,5.489649e+06
1218,Birjulevo Vostochnoe,5.543824e+06
19990,Poselenie Filimonkovskoe,3.466832e+06


In [36]:
# Number of folds currently held in `folds`.
len(folds)

3