In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
train = pd.read_csv('../data/raw/train.csv')

In [3]:
test = pd.read_csv('../data/raw/test.csv')

In [5]:
train["dt"] = pd.to_datetime(train.timestamp)

In [5]:
sample_submission = pd.read_csv("../data/raw/sample_submission.csv")

In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [8]:
for i in train.columns:
    print(i)

id
timestamp
full_sq
life_sq
floor
max_floor
material
build_year
num_room
kitch_sq
state
product_type
sub_area
area_m
raion_popul
green_zone_part
indust_part
children_preschool
preschool_quota
preschool_education_centers_raion
children_school
school_quota
school_education_centers_raion
school_education_centers_top_20_raion
hospital_beds_raion
healthcare_centers_raion
university_top_20_raion
sport_objects_raion
additional_education_raion
culture_objects_top_25
culture_objects_top_25_raion
shopping_centers_raion
office_raion
thermal_power_plant_raion
incineration_raion
oil_chemistry_raion
radiation_raion
railroad_terminal_raion
big_market_raion
nuclear_reactor_raion
detention_facility_raion
full_all
male_f
female_f
young_all
young_male
young_female
work_all
work_male
work_female
ekder_all
ekder_male
ekder_female
0_6_all
0_6_male
0_6_female
7_14_all
7_14_male
7_14_female
0_17_all
0_17_male
0_17_female
16_29_all
16_29_male
16_29_female
0_13_all
0_13_male
0_13_female
raion_build_count_with_

In [6]:
from sklearn.linear_model import SGDRegressor

In [7]:
def reload_data():
    global train
    train = pd.read_csv('../data/raw/train.csv')
    global test
    test = pd.read_csv('../data/raw/test.csv')
    
def prepare_data(x):
    global train
    train = x(train)
    global test
    test = x(test)

In [8]:
prepare_data(lambda df: df[df['full_sq'] > df['full_sq'].quantile(0.0009)])

In [9]:
train['price_per_m2'] = train['price_doc'] / train['full_sq']

In [10]:

def comp_mse(df, features):
    kf = KFold(n_splits=5, shuffle=True) #random_state=42)
    target = "price_per_m2"

    msle_list = []
    mse_list = []
    for i, (train_index, val_index) in enumerate(kf.split(df)):    
        train_part = df.iloc[train_index, : ]
        val_part = df.iloc[val_index, :]
        model = SGDRegressor()
        model.fit(X=train_part[features].fillna(0), y = train_part[target])

        val_predication = model.predict(val_part[features].fillna(0)).clip(0, 100000000000)

        mse = mean_squared_error(val_predication, val_part[target])
        msle = mean_squared_log_error(val_predication, val_part[target])

        pirce_mse = mean_squared_error(val_predication * val_part['full_sq'], val_part['price_doc'])
        pirce_msle = mean_squared_log_error(val_predication * val_part['full_sq'], val_part['price_doc'])

        print(f'Fold full {i}: msle {pirce_msle}, mse {pirce_mse}')
        msle_list.append(pirce_msle)
        mse_list.append(pirce_mse)
        
    print(f'MSLE average = {np.mean(msle_list)}, std = {np.std(msle_list)}')

In [16]:
def predict_n_write_csv(features, path="predict1.csv"):
    target = "price_per_m2"
    
    model = SGDRegressor()
    model.fit(X=train[features].fillna(0.0), y = train[target])
    
    test[target] = model.predict(test[features].fillna(0))
    sample_submission = pd.read_csv("../data/raw/sample_submission.csv")
    sample_submission['price_doc'] = test[target] * test['full_sq']
    sample_submission.to_csv(path, index= False)

In [17]:
features = ["metro_km_walk"]
comp_mse(train, features)
predict_n_write_csv(features, "metro_km.csv")

Fold full 0: msle 0.5133199167295346, mse 15343898849725.553
Fold full 1: msle 1.6808024898150056, mse 12132674283675.207
Fold full 2: msle 1.597078854144389, mse 14207078234990.148
Fold full 3: msle 1.4558627630099745, mse 97490917835864.81
Fold full 4: msle 0.2980139424419969, mse 14631071274107.31
MSLE average = 1.1090155932281802, std = 0.582756549088432


In [18]:
features = ["kremlin_km"]
comp_mse(train, features)
predict_n_write_csv(features, "kremlin_km.csv")

Fold full 0: msle 0.48045524695548636, mse 10981955515967.537
Fold full 1: msle 0.2953508025836219, mse 15275392229570.418
Fold full 2: msle 0.8179578963501654, mse 14532533459117.771
Fold full 3: msle 0.581592315536502, mse 93161389718802.27
Fold full 4: msle 0.5497635399492442, mse 11939023647764.74
MSLE average = 0.545023960275004, std = 0.16873556003096896


In [20]:
features = ["metro_km_walk", "kremlin_km", "leisure_count_500"]
comp_mse(train, features)
predict_n_write_csv(features, "m_k_l.csv")

Fold full 0: msle 0.5991397060433135, mse 84444561436064.69
Fold full 1: msle 0.2764084020280069, mse 11534389626244.963
Fold full 2: msle 0.6680249197541068, mse 13713295464702.57
Fold full 3: msle 0.5055671192779019, mse 11759626613602.838
Fold full 4: msle 0.5144854689405217, mse 14348773751269.271
MSLE average = 0.5127251232087702, std = 0.1322765553531595


делаем финт ушами, чтобы для разных км были разные веса

In [21]:
import math
def preparator(df):
    min = 5
    max = math.ceil(df.kremlin_km.max())
    rez = df.copy()
    for i in range(min, max, 10):
        j = i - 10
        cp = df[df['kremlin_km'] < i]
        cp = cp[cp['kremlin_km'] > j]
        rez = rez.join(other=cp,rsuffix="_"+str(i))
    return rez

In [51]:
features = ['kremlin_km', 'leisure_count_500', 'metro_km_walk', 'product_type', 'green_zone_part', 'indust_part', 'trc_count_3000', 'children_preschool', 'preschool_quota', 'price_per_m2', 'price_doc', 'full_sq']
bdf = preparator(train[train["price_per_m2"] > 30000][features])

In [23]:
def get_features(df, a):
    lst = []
    for i in df:
        col = str(i)
        for st in a:
            if st in i and i != st:
                lst.append(i)
    return lst

In [64]:
features2 = get_features(bdf, ["kremlin_km", "trc_count_3000"])
features2+= ["leisure_count_500", "metro_km_walk","indust_part"]

comp_mse(bdf.fillna(0), features2)

Fold full 0: msle 0.14097921103406308, mse 7765917466283.686
Fold full 1: msle 0.12836416767264128, mse 9791094058611.018
Fold full 2: msle 0.13478689258429724, mse 8539109513027.845
Fold full 3: msle 0.14480024517788398, mse 6947524467894.512
Fold full 4: msle 0.15505091635811852, mse 9029325823979.844
MSLE average = 0.14079628656540083, std = 0.009054612224087578
