In [34]:
import numpy as np
import pandas as pd

In [35]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

Cleaning data functions

In [36]:
#filling values instead of NaN
def fill_none(df):
    df['LifeSquare'] = df['LifeSquare'].fillna(df['Square'].mean())
    df['Healthcare_1'] = df['Healthcare_1'].fillna(df['Healthcare_1'].mean())
    return df

In [37]:
#changing HouseYear
def clean_year(df):
    df.loc[(~df['HouseYear'].between(1850, 2020)),'HouseYear'] = 1969
    return df

In [38]:
#changing Rooms from 0 
def zero_rooms_change(df):
    df.loc[df['Rooms'] == 0 , 'Rooms'] = df['Square']//20
    df.loc[df['Rooms'] == 0 , 'Rooms'] = 1
    return df

In [39]:
#Kitchensquare cleaning
def clean_kitchensquare(df):
    df.loc[df['KitchenSquare'] > 25,'KitchenSquare'] = 8
    return df

In [40]:
def clean_data(df):
    df = fill_none(df)
    df = clean_year(df)
    df = zero_rooms_change(df)
    df = clean_kitchensquare(df)
    df = df.drop('Id', axis = 1)
    return df    

In [41]:
def price_for_meter(a):
    #a - object DataFrame with grouped values
    a['PriceForMeter'] = a['Price']/a['Square']
    return a

In [42]:
#download

In [43]:
flats_db = pd.read_csv('E:/python/Kurs_project_task/train.csv')

In [44]:
flats_db_test = pd.read_csv('E:/python/Kurs_project_task/test.csv')

In [45]:
test = flats_db_test.copy()

In [46]:
train, valid = train_test_split(flats_db,  test_size=0.3, random_state=42)

In [47]:
y_train = pd.DataFrame(train['Price'])
y_valid = pd.DataFrame(valid['Price'])

In [48]:
# PriceForMeter column added
flats_prices = price_for_meter(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
flats_prices_clean = clean_data(flats_prices)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [50]:
flats_prices_valid_clean = valid.drop('Price', axis = 1)

In [51]:
flats_prices_valid_clean = clean_data(flats_prices_valid_clean)

In [52]:
flats_db_test = clean_data(flats_db_test)

In [53]:
#adding a column with mean price for square meter in each district to train and valid datasets
district_mean_priceformeter = flats_prices_clean.groupby('DistrictId', as_index=False)['PriceForMeter'].mean().rename\
            (columns={'PriceForMeter': 'MeanPricePerMeterDist'})
flats_prices_clean = pd.merge(flats_prices_clean, district_mean_priceformeter, on='DistrictId', how='left')
flats_prices_valid_clean = pd.merge(flats_prices_valid_clean, district_mean_priceformeter, on='DistrictId', how='left')

In [54]:
#adding a column with mean price for square meter in each district to test dataset
flats_db_test = pd.merge(flats_db_test, district_mean_priceformeter, on='DistrictId', how='left')

In [55]:
flats_prices_clean = flats_prices_clean.drop('Price', axis = 1)

In [56]:
flats_prices_clean = flats_prices_clean.drop('PriceForMeter', axis = 1)

In [57]:
flats_prices_clean = pd.get_dummies(flats_prices_clean)
feats = flats_prices_clean.columns

In [58]:
flats_prices_valid_clean['MeanPricePerMeterDist'] =  flats_prices_valid_clean['MeanPricePerMeterDist'].fillna(flats_prices_valid_clean['MeanPricePerMeterDist'].mean())

In [59]:
flats_db_test['MeanPricePerMeterDist'] =  flats_db_test['MeanPricePerMeterDist'].fillna(flats_db_test['MeanPricePerMeterDist'].mean())

In [60]:
flats_prices_valid_clean = pd.get_dummies(flats_prices_valid_clean)

In [61]:
flats_db_test = pd.get_dummies(flats_db_test)

In [62]:
rfr_price = RandomForestRegressor(max_depth=21, max_features=4, n_estimators=250, random_state=100)

In [63]:
rfr_price.fit(flats_prices_clean, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=21,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=None, oob_score=False,
           random_state=100, verbose=0, warm_start=False)

In [64]:
y_pred_clean = rfr_price.predict(flats_prices_valid_clean)

In [65]:
r2_score(y_valid, y_pred_clean)

0.7344163802077505

In [66]:
test['Price'] = rfr_price.predict(flats_db_test)

In [67]:
test.loc[:, ['Id', 'Price']].to_csv('E:/python/Kurs_project_task/AMoroz_predictions_idfixed.csv', index=None)