In [23]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [24]:
# Load the training CSV; the path is relative, so it is resolved against the current working directory.
data = pd.read_csv('train.csv')

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; the fixed random_state keeps the split reproducible.
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [26]:
# List the columns available in the training split.
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price'],
      dtype='object')

In [43]:
# Lookup table: for every district, the share of all train flats located in it.
# rename_axis + reset_index(name=...) label both columns explicitly, so the result
# has the columns ['DistrictId', 'flat_qty_distr'] regardless of how the installed
# pandas version names the output of value_counts().
distr_info1 = (
    train['DistrictId']
    .value_counts(normalize=True)
    .rename_axis('DistrictId')
    .reset_index(name='flat_qty_distr')
)

In [44]:
# Smallest district share in train (a fraction of rows, not a count, because value_counts was normalized):
distr_info1['flat_qty_distr'].min()

0.00014285714285714287

In [45]:
def add_district_info1(df, distr_info1):
    """Attach each row's district share (``flat_qty_distr``) to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to enrich; must contain a ``DistrictId`` column. Not modified.
    distr_info1 : pandas.DataFrame
        Lookup table with ``DistrictId`` and ``flat_qty_distr`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame produced by a left merge (row order of ``df`` is kept,
        the index is renumbered) with ``flat_qty_distr`` added. Districts
        absent from the lookup table receive the smallest share found in it.
    """
    # Always use the lookup table passed as an argument. The earlier body
    # referenced the undefined name `dist_info1`, which fails on a fresh kernel
    # and otherwise silently reads whatever global happens to carry that name.
    df = pd.merge(df, distr_info1, on='DistrictId', how='left')
    smallest_share = distr_info1['flat_qty_distr'].min()
    df['flat_qty_distr'] = df['flat_qty_distr'].fillna(smallest_share)
    return df

In [46]:
# Price aggregates computed from the training split, from most to least specific.
# NOTE(review): the means include every training row's own Price, so these
# features are likely more informative on train than on unseen data - confirm
# whether out-of-fold statistics are wanted.

# Mean price per (district, room count) pair.
distr_stat_dr = (
    train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_dr'})
)

# Mean price per room count across all districts.
distr_stat_r = (
    train.groupby(['Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_r'})
)

# Overall mean price, used as the last-resort fallback.
mean_price = train['Price'].mean()

In [47]:
def add_stats(df, distr_stat_dr, distr_stat_r, mean_price):
    """Add the mean-price features ``mean_price_dr`` and ``mean_price_r``.

    Both lookup tables are left-merged onto ``df``; gaps are then filled from
    the more general level:

    * ``mean_price_r`` (per room count) falls back to ``mean_price``;
    * ``mean_price_dr`` (per district and room count) falls back to the
      already filled ``mean_price_r``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with ``DistrictId`` and ``Rooms`` columns. Not modified.
    distr_stat_dr : pandas.DataFrame
        Columns ``DistrictId``, ``Rooms``, ``mean_price_dr``.
    distr_stat_r : pandas.DataFrame
        Columns ``Rooms``, ``mean_price_r``.
    mean_price : float
        Global fallback value.

    Returns
    -------
    pandas.DataFrame
        The merged frame with both columns free of merge-induced gaps.
    """
    merged = df.merge(distr_stat_dr, on=['DistrictId', 'Rooms'], how='left')
    merged = merged.merge(distr_stat_r, on='Rooms', how='left')

    # Fill the coarser level first so it can back up the finer one.
    by_rooms = merged['mean_price_r'].fillna(mean_price)
    merged['mean_price_r'] = by_rooms
    merged['mean_price_dr'] = merged['mean_price_dr'].fillna(by_rooms)
    return merged

In [48]:
def add_cat_fts(df, cat_fts=('Ecology_2', 'Ecology_3', 'Shops_2')):
    """Replace each listed column with a 0/1 flag: 1 where the value is ``'B'``.

    The columns are overwritten in ``df`` itself and the same object is
    returned. Because the result no longer contains ``'B'``, applying the
    function a second time to the same frame turns every flag into 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cat_fts``.
    cat_fts : iterable of str
        Names of the columns to encode.
    """
    for name in cat_fts:
        is_b = df[name].eq('B')
        df[name] = is_b.astype(int)
    return df

In [49]:
def fillna_healthcare_1(df):
    """Fill missing ``Healthcare_1`` values with 0.

    The column is overwritten in ``df`` itself and the same object is returned.
    """
    column = 'Healthcare_1'
    filled = df[column].fillna(0)
    df[column] = filled
    return df

In [50]:
def prepare_rooms(df, source_df):
    """Replace room counts above 5 with the median of ``source_df['Rooms']``.

    The replacement is written into ``df`` itself and the same object is
    returned. Values of 5 or less are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``Rooms`` column is corrected.
    source_df : pandas.DataFrame
        Frame that supplies the replacement median.
    """
    replacement = source_df['Rooms'].median()
    too_many = df['Rooms'] > 5
    df.loc[too_many, 'Rooms'] = replacement
    return df

In [51]:
def fillna_life_square(df, source_df):
    """Fill missing ``LifeSquare`` values with the mean of ``source_df['LifeSquare']``.

    The column is overwritten in ``df`` itself and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``LifeSquare`` gaps are filled.
    source_df : pandas.DataFrame
        Frame that supplies the fill value.
    """
    fill_value = source_df['LifeSquare'].mean()
    df['LifeSquare'] = df['LifeSquare'].fillna(fill_value)
    return df

In [52]:
def prepare_data(df, distr_info1, distr_stat_dr, distr_stat_r, mean_price, source_df):
    """Run the whole feature-preparation pipeline on ``df``.

    Steps, in order:

    1. add the district share (``add_district_info1``);
    2. add the mean-price features (``add_stats``);
    3. encode the categorical columns as 0/1 (``add_cat_fts``);
    4. fill ``Healthcare_1`` gaps (``fillna_healthcare_1``);
    5. replace large ``Rooms`` values (``prepare_rooms``);
    6. fill ``LifeSquare`` gaps (``fillna_life_square``).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows to prepare.
    distr_info1, distr_stat_dr, distr_stat_r : pandas.DataFrame
        Lookup tables passed to steps 1 and 2.
    mean_price : float
        Fallback price passed to step 2.
    source_df : pandas.DataFrame
        Frame that supplies the statistics for steps 5 and 6.

    Returns
    -------
    pandas.DataFrame
        The prepared frame.
    """
    # NOTE(review): the mean-price merge (step 2) uses Rooms before step 5
    # corrects large values - confirm that this ordering is intended.
    with_district = add_district_info1(df, distr_info1)
    with_prices = add_stats(with_district, distr_stat_dr, distr_stat_r, mean_price)
    encoded = add_cat_fts(with_prices)
    healthcare_filled = fillna_healthcare_1(encoded)
    rooms_fixed = prepare_rooms(healthcare_filled, source_df)
    return fillna_life_square(rooms_fixed, source_df)

In [53]:
# Prepare the training split; the raw train frame also supplies the Rooms median and LifeSquare mean.
# NOTE(review): `train` is rebound here, so re-running this cell feeds already-prepared data back in.
train = prepare_data(train, distr_info1, distr_stat_dr, distr_stat_r, mean_price, train)

In [55]:
# Preview the first rows to check the added feature columns.
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,flat_qty_distr,mean_price_dr,mean_price_r
0,14604,23,1.0,41.68138,22.796166,8.0,14,17.0,2015,0.075779,...,1437,3,0.0,0,2,1,88504.384965,0.056286,102427.030975,160134.810901
1,5621,23,3.0,163.495333,161.504222,12.0,5,3.0,1977,0.014073,...,475,0,0.0,0,0,1,207007.956663,0.056286,165911.1297,290867.452543
2,235,87,1.0,39.710131,19.538663,8.0,4,17.0,1986,0.100456,...,7227,0,0.0,1,6,0,182126.280899,0.003,169596.630515,160134.810901
3,16258,48,3.0,96.056784,98.152802,1.0,15,1.0,2017,0.041125,...,9515,5,0.0,1,10,1,524365.550705,0.008857,382424.639356,290867.452543
4,10773,77,3.0,79.195825,44.421062,10.0,16,17.0,1984,0.298205,...,4048,3,0.0,1,3,1,322048.43399,0.004,251751.766701,290867.452543


In [56]:
# Prepare the validation split with the lookup tables built from train.
# `train` is the already-prepared frame at this point; it supplies the Rooms median and LifeSquare mean.
valid = prepare_data(valid, distr_info1, distr_stat_dr, distr_stat_r, mean_price, train)

In [57]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RF

In [152]:
# Feature columns passed to every model fitted below.
feats = ['Rooms', 'Square', 'flat_qty_distr', 'mean_price_dr', 'Healthcare_1','HouseYear', 'LifeSquare']

In [153]:
# Linear regression with default settings.
model = LR()

In [154]:
# Fit on the prepared training split, with Price as the target.
model.fit(train.loc[:, feats], train['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [155]:
# Predictions on the rows the model was fitted on.
pred_train = model.predict(train.loc[:, feats])

In [156]:
# Predictions on the held-out validation rows.
pred_valid = model.predict(valid.loc[:, feats]) 

In [157]:
# Check the number of predictions produced for each split.
pred_train.shape, pred_valid.shape

((7000,), (3000,))

In [158]:
from sklearn.metrics import r2_score as r2, mean_squared_error as mse

In [159]:
# (R^2, MSE) of the linear model on the training split.
r2(train['Price'], pred_train), mse(train['Price'], pred_train)

(0.7023312924223353, 2570007520.1083436)

In [160]:
# (R^2, MSE) of the linear model on the validation split.
r2(valid['Price'], pred_valid), mse(valid['Price'], pred_valid)

(0.5599407347451324, 3785281985.404885)

In [161]:
# Load the test CSV to be scored; the path is relative to the current working directory.
test = pd.read_csv('test.csv')

In [162]:
# Check the row and column counts of the test set.
test.shape

(5000, 19)

In [163]:
# Prepare the test set with the lookup tables and statistics derived from train.
test = prepare_data(test, distr_info1, distr_stat_dr, distr_stat_r, mean_price, train)

In [164]:
# Store the linear model's predictions as the Price column of the test frame.
test['Price'] = model.predict(test.loc[:, feats])

In [165]:
# Summary statistics of the predicted prices, as a plausibility check.
test['Price'].describe()

count      5000.000000
mean     214822.089634
std       74508.794280
min       66573.067117
25%      165919.557072
50%      197474.017041
75%      249536.223390
max      629754.291586
Name: Price, dtype: float64

In [166]:
# Write the predictions file (Id and predicted Price) without the row index.
# `index` is a boolean flag, so pass False explicitly instead of relying on None being falsy.
# The Price column here holds the linear model's predictions.
test.loc[:, ['Id', 'Price']].to_csv('AKonysheva_predictions.csv', index=False)

In [167]:
# Random forest: 1000 trees, depth capped at 12, fixed seed for reproducibility.
model_RF = RF(n_estimators=1000, max_depth=12, random_state=42)

In [168]:
# Fit the forest on the same features and target as the linear model.
model_RF.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [169]:
# Forest predictions for the training and validation splits.
pred_train_RF = model_RF.predict(train.loc[:, feats])
pred_valid_RF = model_RF.predict(valid.loc[:, feats]) 

In [170]:
# (R^2, MSE) of the random forest on the training split.
r2(train['Price'], pred_train_RF), mse(train['Price'], pred_train_RF)

(0.9213885442729893, 678714380.2556721)

In [171]:
# (R^2, MSE) of the random forest on the validation split.
r2(valid['Price'], pred_valid_RF), mse(valid['Price'], pred_valid_RF)

(0.6832733555578353, 2724404997.60936)