In [28]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 100

In [29]:
data = pd.read_csv('train.csv')

In [30]:
def add_pseudos(data, feature_list, y_name):
    for feature in feature_list:
        print(feature)
        feature_pseudo = data.groupby([feature], as_index=False)[[y_name]].mean()
        feature_pseudo.rename(columns={y_name: 'pseudovar'}, inplace=True)
        
        data[f'{feature}_pseudo'] = np.nan
        data.loc[:, f'{feature}_pseudo'] = data.merge(feature_pseudo, how='left', on=feature)['pseudovar']
        
        print(f'{feature}_pseudo was added')
    return data

In [31]:
def add_test_pseudos(test, data, feature_list, y_name):
    for feature in feature_list:
        print(feature)
        feature_pseudo = data.groupby([feature], as_index=False)[[y_name]].mean()
        feature_pseudo.rename(columns={y_name: 'pseudovar'}, inplace=True)
        
        test[f'{feature}_pseudo'] = np.nan
        test.loc[:, f'{feature}_pseudo'] = test.merge(feature_pseudo, how='left', on=feature)['pseudovar']
        
        print(f'{feature}_pseudo was added')
    return test

In [32]:
def show_cm(data, features_list):
    cm = np.corrcoef(pd.DataFrame(np.hstack([data[features_list], data[['Price',]]]), columns=features_list + ['Price']).values.T)

    fig, ax = plt.subplots(figsize=(9,9))  
    sns.set(font_scale=1)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', 
                     annot_kws={'size' : 9.5}, 
                     yticklabels=features_list + ['Price'], 
                     xticklabels=features_list + ['Price'])
    plt.show()

In [33]:
def prepare_train_data(data):
    data.loc[data['Rooms'] == 19, 'Rooms'] = 1
    data.loc[data['Rooms'] == 6, 'Rooms'] = 2
    data.loc[data['Rooms'] == 10, 'Rooms'] = 2

    data.loc[data['Id'].isin([28, 2307, 11602]), 'Square'] = data['Square'] / 10
    data.loc[data['Id'].isin([28, 2307, 11602]), 'LifeSquare'] = data['LifeSquare'] / 10
    data.loc[data['LifeSquare'] > 1000, 'LifeSquare'] = data['LifeSquare'] / 100

    data.loc[data['Id'].isin([14990, 15886]), 'LifeSquare'] = np.NaN
    data.loc[data['LifeSquare'] > data['Square'], 'LifeSquare'] = np.NaN

    sup_df = data[['Rooms', 'Square', 'LifeSquare']]
    sup_df['ls_percent'] = data['LifeSquare'] / data['Square']
    sup_df = sup_df.groupby(['Rooms'], as_index=False)['ls_percent'].median()
    data.loc[data['LifeSquare'].isna(), 'new_LifeSquare'] = data.loc[data['LifeSquare'].isna(), ['Rooms', 'Square']].merge(sup_df, how='left', on='Rooms')['ls_percent'].values * data.loc[data['LifeSquare'].isna(), ['Rooms', 'Square']].merge(sup_df, how='left', on='Rooms')['Square'].values
    data.loc[~data['LifeSquare'].isna(), 'new_LifeSquare'] = data.loc[~data['LifeSquare'].isna(), 'LifeSquare']

    data.loc[data['HouseYear'] == 20052011, 'HouseYear'] = (2005 + 2011)/2
    data.loc[data['HouseYear'] == 4968, 'HouseYear'] = data['HouseYear'].median()
    
    # data.loc[:, 'Price_log'] = np.log(data['Price'])

    data = add_pseudos(data, ['Ecology_2', 'Ecology_3', 'DistrictId', 'Helthcare_2', 'Social_2', 'Shops_2'], 'Price')  # можно удалить "data = "
    
    return data

In [34]:
data_prepared = prepare_train_data(data)

Ecology_2
Ecology_2_pseudo was added
Ecology_3
Ecology_3_pseudo was added
DistrictId
DistrictId_pseudo was added
Helthcare_2
Helthcare_2_pseudo was added
Social_2
Social_2_pseudo was added
Shops_2
Shops_2_pseudo was added


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
features_list = ['Rooms', 'Square', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'new_LifeSquare',
       'DistrictId_pseudo', 'Ecology_2_pseudo', 'Ecology_3_pseudo', 'Helthcare_2_pseudo', 'Social_2', 'Social_3', 'Shops_1', 'Shops_2_pseudo']

In [36]:
# show_cm(data, features_list)

In [37]:
# data_prepared.info()

In [38]:
# data_prepared[features_list].head()

#### Test Dataset

In [39]:
test = pd.read_csv('test.csv')

In [40]:
def prepare_test(test, data):

    test.loc[test['Rooms'] == 17, 'Rooms'] = 2
    test.loc[test['Id'] == 170, 'Square'] = test.loc[test['Id'] == 170, 'LifeSquare'] / 0.628714

    test.loc[test['Square'] < 10, 'Square'] = test.loc[test['Square'] <10 , 'Square'] * 10

    test.loc[test['Square'] < 10, 'Square'] = test.loc[test['Square'] <10 , 'Square'] * 10
    test.loc[(test['Square'] < 50) & (test['Rooms'] > 2), 'Rooms'] = 2
    test.loc[test['Id'] == 6060, 'Rooms'] = 2

    test.loc[(test['LifeSquare'] < test['Square'] / 4), 'LifeSquare'] = np.NaN
    test.loc[test['LifeSquare'] > test['Square'], 'LifeSquare'] = np.NaN

    sup_df = test[['Rooms', 'Square', 'LifeSquare']]
    sup_df['ls_percent'] = test['LifeSquare'] / test['Square']
    sup_df = sup_df.groupby(['Rooms'], as_index=False)['ls_percent'].median()
    test.loc[test['LifeSquare'].isna(), 'new_LifeSquare'] = test.loc[test['LifeSquare'].isna(), ['Rooms', 'Square']].merge(sup_df, how='left', on='Rooms')['ls_percent'].values * test.loc[test['LifeSquare'].isna(), ['Rooms', 'Square']].merge(sup_df, how='left', on='Rooms')['Square'].values
    test.loc[~test['LifeSquare'].isna(), 'new_LifeSquare'] = test.loc[~test['LifeSquare'].isna(), 'LifeSquare']

    test = add_test_pseudos(test, data, ['Ecology_2', 'Ecology_3', 'DistrictId', 'Helthcare_2', 'Social_2', 'Shops_2'], 'Price')  # можно удалить "data = "
    test.loc[test['DistrictId_pseudo'].isna(), 'DistrictId_pseudo'] = data['DistrictId_pseudo'].mean()
    
    return test

In [41]:
test = prepare_test(test, data)

Ecology_2
Ecology_2_pseudo was added
Ecology_3
Ecology_3_pseudo was added
DistrictId
DistrictId_pseudo was added
Helthcare_2
Helthcare_2_pseudo was added
Social_2
Social_2_pseudo was added
Shops_2
Shops_2_pseudo was added


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [42]:
test[features_list].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
Rooms                 5000 non-null float64
Square                5000 non-null float64
KitchenSquare         5000 non-null float64
Floor                 5000 non-null int64
HouseFloor            5000 non-null float64
HouseYear             5000 non-null int64
Ecology_1             5000 non-null float64
new_LifeSquare        5000 non-null float64
DistrictId_pseudo     5000 non-null float64
Ecology_2_pseudo      5000 non-null float64
Ecology_3_pseudo      5000 non-null float64
Helthcare_2_pseudo    5000 non-null float64
Social_2              5000 non-null int64
Social_3              5000 non-null int64
Shops_1               5000 non-null int64
Shops_2_pseudo        5000 non-null float64
dtypes: float64(11), int64(5)
memory usage: 625.1 KB


#### Model fitting

In [43]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [44]:
features_list = ['Rooms', 'Square', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'new_LifeSquare',
       'DistrictId_pseudo', 'Ecology_2_pseudo', 'Ecology_3_pseudo', 'Helthcare_2_pseudo', 'Social_2', 'Social_3', 'Shops_1', 'Shops_2_pseudo']

In [45]:
estimator = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=400, n_jobs=None, oob_score=False,
           random_state=100, verbose=0, warm_start=False)

In [46]:
estimator.fit(data_prepared[features_list], data_prepared['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=400, n_jobs=None, oob_score=False,
           random_state=100, verbose=0, warm_start=False)

In [20]:
# cross_val_score(estimator, data_prepared[features_list], data_prepared['Price'], cv=15, scoring='r2')

#### Predictions

In [21]:
test['Price'] = estimator.predict(test[features_list])

In [22]:
test.loc[:, ['Id', 'Price']].to_csv('ANikitina_predictions.csv', sep=',', index=False)

In [23]:
# from sklearn.base import clone 

# def drop_col_feat_imp(model, X_train, y_train, random_state = 12):
    
#     # clone the model to have the exact same specification as the one initially trained
#     model_clone = clone(model)
#     # set random_state for comparability
#     model_clone.random_state = random_state
#     # training and scoring the benchmark model
#     model_clone.fit(X_train, y_train)
#     benchmark_score = model_clone.score(X_train, y_train)
#     # list for storing feature importances
#     importances = []
    
#     # iterating over all columns and storing feature importance (difference between benchmark and new model)
#     for col in X_train.columns:
#         model_clone = clone(model)
#         model_clone.random_state = random_state
#         model_clone.fit(X_train.drop(col, axis = 1), y_train)
#         drop_col_score = model_clone.score(X_train.drop(col, axis = 1), y_train)
#         importances.append(benchmark_score - drop_col_score)
    
#     importances_df = pd.DataFrame(importances, index=X_train.columns)
#     return importances_df

In [24]:
# imp_df = drop_col_feat_imp(estimator, data[features_list], data['Price'])

In [26]:
# estimator.feature_importances_

array([0.11154013, 0.2368304 , 0.04308136, 0.02607578, 0.02727534,
       0.04033581, 0.02755906, 0.11462304, 0.19562722, 0.00024704,
       0.00106414, 0.02410976, 0.0736775 , 0.04907381, 0.02717336,
       0.00170626])