In [3]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
import xgboost as xgb

import datetime
from scipy.stats import norm
    
#load files
train = pd.read_csv('../input/train.csv', parse_dates=['timestamp'])
test = pd.read_csv('../input/test.csv', parse_dates=['timestamp'])
macro = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])
train_loc = pd.read_csv("../input/train_lat_lon.csv")
test_loc = pd.read_csv("../input/test_lat_lon.csv")

id_test = test.id
train.drop(train[train["life_sq"] > 7000].index, inplace=True)

test['lat']= test_loc['lat']
test['lon']= test_loc['lon']
train['lat']= train_loc['lat']
train['lon']= train_loc['lon']

#clean data
print('Data Clean...')
bad_index = train[train.life_sq > train.full_sq].index
train.loc[bad_index, "life_sq"] = np.NaN
equal_index = [601,1896,2791]
test.loc[equal_index, "life_sq"] = test.loc[equal_index, "full_sq"]
bad_index = test[test.life_sq > test.full_sq].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.life_sq < 5].index
train.loc[bad_index, "life_sq"] = np.NaN
bad_index = test[test.life_sq < 5].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.full_sq < 5].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[test.full_sq < 5].index
test.loc[bad_index, "full_sq"] = np.NaN
kitch_is_build_year = [13117]
train.loc[kitch_is_build_year, "build_year"] = train.loc[kitch_is_build_year, "kitch_sq"]
bad_index = train[train.kitch_sq >= train.life_sq].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[test.kitch_sq >= test.life_sq].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.kitch_sq == 0).values + (train.kitch_sq == 1).values].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq == 0).values + (test.kitch_sq == 1).values].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3)].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3)].index
test.loc[bad_index, "full_sq"] = np.NaN
bad_index = train[train.life_sq > 300].index
train.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
bad_index = test[test.life_sq > 200].index
test.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
train.product_type.value_counts(normalize= True)
test.product_type.value_counts(normalize= True)
bad_index = train[train.build_year < 1500].index
train.loc[bad_index, "build_year"] = np.NaN
bad_index = test[test.build_year < 1500].index
test.loc[bad_index, "build_year"] = np.NaN
bad_index = train[train.num_room == 0].index
train.loc[bad_index, "num_room"] = np.NaN
bad_index = test[test.num_room == 0].index
test.loc[bad_index, "num_room"] = np.NaN
bad_index = [10076, 11621, 17764, 19390, 24007, 26713, 29172]
train.loc[bad_index, "num_room"] = np.NaN
bad_index = [3174, 7313]
test.loc[bad_index, "num_room"] = np.NaN
bad_index = train[(train.floor == 0).values * (train.max_floor == 0).values].index
train.loc[bad_index, ["max_floor", "floor"]] = np.NaN
bad_index = train[train.floor == 0].index
train.loc[bad_index, "floor"] = np.NaN
bad_index = train[train.max_floor == 0].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.max_floor == 0].index
test.loc[bad_index, "max_floor"] = np.NaN
bad_index = train[train.floor > train.max_floor].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.floor > test.max_floor].index
test.loc[bad_index, "max_floor"] = np.NaN
train.floor.describe(percentiles= [0.9999])
bad_index = [23584]
train.loc[bad_index, "floor"] = np.NaN
train.material.value_counts()
test.material.value_counts()
train.state.value_counts()
bad_index = train[train.state == 33].index
train.loc[bad_index, "state"] = np.NaN
test.state.value_counts()

# brings error down a lot by removing extreme price per sqm
train.loc[train.full_sq == 0, 'full_sq'] = 50
train = train[train.price_doc/train.full_sq <= 600000]
train = train[train.price_doc/train.full_sq >= 10000]

print('Feature Engineering...')
# Add month-year
month_year = (train.timestamp.dt.month*30 + train.timestamp.dt.year * 365)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)

month_year = (test.timestamp.dt.month*30 + test.timestamp.dt.year * 365)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (train.timestamp.dt.weekofyear*7 + train.timestamp.dt.year * 365)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)

week_year = (test.timestamp.dt.weekofyear*7 + test.timestamp.dt.year * 365)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek

test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek

# Other feature engineering
train['rel_floor'] = 0.05+train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = 0.05+train['kitch_sq'] / train['full_sq'].astype(float)

test['rel_floor'] = 0.05+test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = 0.05+test['kitch_sq'] / test['full_sq'].astype(float)

train.apartment_name=train.sub_area + train['metro_km_avto'].astype(str)
test.apartment_name=test.sub_area + train['metro_km_avto'].astype(str)

train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

train['area_per_room'] = train['life_sq'] / train['num_room'].astype(float) #rough area per room
train['livArea_ratio'] = train['life_sq'] / train['full_sq'].astype(float) #rough living area
train['yrs_old'] = 2017 - train['build_year'].astype(float) #years old from 2017
train['avgfloor_sq'] = train['life_sq']/train['max_floor'].astype(float) #living area per floor
train['pts_floor_ratio'] = train['public_transport_station_km']/train['max_floor'].astype(float)
# looking for significance of apartment buildings near public t 
train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
# doubled a var by accident
# when removing one score did not improve...
train['gender_ratio'] = train['male_f']/train['female_f'].astype(float)
train['kg_park_ratio'] = train['kindergarten_km']/train['park_km'].astype(float) #significance of children?
train['high_ed_extent'] = train['school_km'] / train['kindergarten_km'] #schooling
train['pts_x_state'] = train['public_transport_station_km'] * train['state'].astype(float) #public trans * state of listing
train['lifesq_x_state'] = train['life_sq'] * train['state'].astype(float) #life_sq times the state of the place
train['floor_x_state'] = train['floor'] * train['state'].astype(float) #relative floor * the state of the place

test['area_per_room'] = test['life_sq'] / test['num_room'].astype(float)
test['livArea_ratio'] = test['life_sq'] / test['full_sq'].astype(float)
test['yrs_old'] = 2017 - test['build_year'].astype(float)
test['avgfloor_sq'] = test['life_sq']/test['max_floor'].astype(float) #living area per floor
test['pts_floor_ratio'] = test['public_transport_station_km']/test['max_floor'].astype(float) #apartments near public t?
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)
test['gender_ratio'] = test['male_f']/test['female_f'].astype(float)
test['kg_park_ratio'] = test['kindergarten_km']/test['park_km'].astype(float)
test['high_ed_extent'] = test['school_km'] / test['kindergarten_km']
test['pts_x_state'] = test['public_transport_station_km'] * test['state'].astype(float) #public trans * state of listing
test['lifesq_x_state'] = test['life_sq'] * test['state'].astype(float)
test['floor_x_state'] = test['floor'] * test['state'].astype(float)

#########################################################################
print('Rate Mults...')
# Aggreagte house price data derived from 
# http://www.globalpropertyguide.com/real-estate-house-prices/R#russia
# by luckyzhou
# See https://www.kaggle.com/luckyzhou/lzhou-test/comments

rate_2015_q2 = 1
rate_2015_q1 = rate_2015_q2 / 0.9932
rate_2014_q4 = rate_2015_q1 / 1.0112
rate_2014_q3 = rate_2014_q4 / 1.0169
rate_2014_q2 = rate_2014_q3 / 1.0086
rate_2014_q1 = rate_2014_q2 / 1.0126
rate_2013_q4 = rate_2014_q1 / 0.9902
rate_2013_q3 = rate_2013_q4 / 1.0041
rate_2013_q2 = rate_2013_q3 / 1.0044
rate_2013_q1 = rate_2013_q2 / 1.0104  # This is 1.002 (relative to mult), close to 1:
rate_2012_q4 = rate_2013_q1 / 0.9832  #     maybe use 2013q1 as a base quarter and get rid of mult?
rate_2012_q3 = rate_2012_q4 / 1.0277
rate_2012_q2 = rate_2012_q3 / 1.0279
rate_2012_q1 = rate_2012_q2 / 1.0279
rate_2011_q4 = rate_2012_q1 / 1.076
rate_2011_q3 = rate_2011_q4 / 1.0236
rate_2011_q2 = rate_2011_q3 / 1
rate_2011_q1 = rate_2011_q2 / 1.011


# train 2015
train['average_q_price'] = 1

train_2015_q2_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2015_q2_index, 'average_q_price'] = rate_2015_q2

train_2015_q1_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2015_q1_index, 'average_q_price'] = rate_2015_q1


# train 2014
train_2014_q4_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2014_q4_index, 'average_q_price'] = rate_2014_q4

train_2014_q3_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2014_q3_index, 'average_q_price'] = rate_2014_q3

train_2014_q2_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2014_q2_index, 'average_q_price'] = rate_2014_q2

train_2014_q1_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2014_q1_index, 'average_q_price'] = rate_2014_q1


# train 2013
train_2013_q4_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2013_q4_index, 'average_q_price'] = rate_2013_q4

train_2013_q3_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2013_q3_index, 'average_q_price'] = rate_2013_q3

train_2013_q2_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2013_q2_index, 'average_q_price'] = rate_2013_q2

train_2013_q1_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2013_q1_index, 'average_q_price'] = rate_2013_q1


# train 2012
train_2012_q4_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2012_q4_index, 'average_q_price'] = rate_2012_q4

train_2012_q3_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2012_q3_index, 'average_q_price'] = rate_2012_q3

train_2012_q2_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2012_q2_index, 'average_q_price'] = rate_2012_q2

train_2012_q1_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2012_q1_index, 'average_q_price'] = rate_2012_q1


# train 2011
train_2011_q4_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2011_q4_index, 'average_q_price'] = rate_2011_q4

train_2011_q3_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2011_q3_index, 'average_q_price'] = rate_2011_q3

train_2011_q2_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2011_q2_index, 'average_q_price'] = rate_2011_q2

train_2011_q1_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2011_q1_index, 'average_q_price'] = rate_2011_q1

train['price_doc'] = train['price_doc'] * train['average_q_price']

mult = 1.054880504
train['price_doc'] = train['price_doc'] * mult
y_train = train["price_doc"]

#########################################################################################################
print('Running Model 1...')
x_train = train.drop(["id", "timestamp", "price_doc", "average_q_price"], axis=1)
#x_test = test.drop(["id", "timestamp", "average_q_price"], axis=1)
x_test = test.drop(["id", "timestamp"], axis=1)

num_train = len(x_train)
x_all = pd.concat([x_train, x_test])

for c in x_all.columns:
    if x_all[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_all[c].values))
        x_all[c] = lbl.transform(list(x_all[c].values))
        


Data Clean...
Feature Engineering...
Rate Mults...
Running Model 1...


In [4]:
train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,yrs_old,avgfloor_sq,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,average_q_price
0,1,2011-08-20,43.0,27.0,4.0,,,,,,...,,,,0.885149,0.067498,1.221523,,,,0.808051
1,2,2011-08-23,34.0,19.0,3.0,,,,,,...,,,,0.81266,0.268308,1.849999,,,,0.808051
2,3,2011-08-27,43.0,29.0,2.0,,,,,,...,,,,0.824169,0.130991,3.219286,,,,0.808051
3,4,2011-09-01,89.0,50.0,9.0,,,,,,...,,,,0.867661,2.297865,1.317732,,,,0.808051
4,5,2011-09-05,77.0,77.0,4.0,,,,,,...,,,,0.922563,0.959783,1.520114,,,,0.808051


In [5]:
test.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,livArea_ratio,yrs_old,avgfloor_sq,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state
0,30474,2015-07-01,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,...,0.530769,19.0,2.3,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0
1,30475,2015-07-01,79.2,,8,17.0,1,,3.0,,...,,,,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0
2,30476,2015-07-01,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,...,0.619753,57.0,5.02,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0
3,30477,2015-07-01,62.8,36.0,17,17.0,1,2016.0,2.0,,...,0.573248,1.0,2.117647,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0
4,30478,2015-07-01,40.0,40.0,17,17.0,1,,1.0,,...,1.0,,2.352941,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0


In [6]:
x_train.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,livArea_ratio,yrs_old,avgfloor_sq,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state
0,43.0,27.0,4.0,,,,,,,Investment,...,0.627907,,,,0.885149,0.067498,1.221523,,,
1,34.0,19.0,3.0,,,,,,,Investment,...,0.558824,,,,0.81266,0.268308,1.849999,,,
2,43.0,29.0,2.0,,,,,,,Investment,...,0.674419,,,,0.824169,0.130991,3.219286,,,
3,89.0,50.0,9.0,,,,,,,Investment,...,0.561798,,,,0.867661,2.297865,1.317732,,,
4,77.0,77.0,4.0,,,,,,,Investment,...,1.0,,,,0.922563,0.959783,1.520114,,,


In [7]:
all.shape()

AttributeError: 'builtin_function_or_method' object has no attribute 'shape'

In [8]:
x_all.shape()

TypeError: 'tuple' object is not callable

In [9]:
train['building_age'] = train['timestamp'].dt.year - train['build_year']
train.loc[df_all['building_age'] < 0, 'building_age'] = np.nan

train['room_sq'] = (train['life_sq']-train['kitch_sq'])/train['num_room']
train.loc[df_all['room_sq']<1,'room_sq']=np.nan

train['floor_inverse'] = train['max_floor']-train['floor']



test['building_age'] = test['timestamp'].dt.year - test['build_year']
test.loc[df_all['building_age'] < 0, 'building_age'] = np.nan

test['room_sq'] = (train['life_sq']-test['kitch_sq'])/test['num_room']
test.loc[df_all['room_sq']<1,'room_sq']=np.nan

test['floor_inverse'] = test['max_floor']-test['floor']

NameError: name 'df_all' is not defined

In [10]:
train['building_age'] = train['timestamp'].dt.year - train['build_year']
train.loc[train['building_age'] < 0, 'building_age'] = np.nan

train['room_sq'] = (train['life_sq']-train['kitch_sq'])/train['num_room']
train.loc[train['room_sq']<1,'room_sq']=np.nan

train['floor_inverse'] = train['max_floor']-train['floor']



test['building_age'] = test['timestamp'].dt.year - test['build_year']
test.loc[train['building_age'] < 0, 'building_age'] = np.nan

test['room_sq'] = (train['life_sq']-test['kitch_sq'])/test['num_room']
test.loc[train['room_sq']<1,'room_sq']=np.nan

test['floor_inverse'] = test['max_floor']-test['floor']

IndexingError: Unalignable boolean Series key provided

In [11]:
train['building_age'] = train['timestamp'].dt.year - train['build_year']
train.loc[train['building_age'] < 0, 'building_age'] = np.nan

train['room_sq'] = (train['life_sq']-train['kitch_sq'])/train['num_room']
train.loc[train['room_sq']<1,'room_sq']=np.nan

train['floor_inverse'] = train['max_floor']-train['floor']



test['building_age'] = test['timestamp'].dt.year - test['build_year']
test.loc[test['building_age'] < 0, 'building_age'] = np.nan

test['room_sq'] = (test['life_sq']-test['kitch_sq'])/test['num_room']
test.loc[test['room_sq']<1,'room_sq']=np.nan

test['floor_inverse'] = test['max_floor']-test['floor']

In [12]:
test.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,30474,2015-07-01,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,...,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0,17.0,11.8,7.0
1,30475,2015-07-01,79.2,,8,17.0,1,,3.0,,...,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0,,,9.0
2,30476,2015-07-01,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,...,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0,55.0,10.15,2.0
3,30477,2015-07-01,62.8,36.0,17,17.0,1,2016.0,2.0,,...,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0,,,0.0
4,30478,2015-07-01,40.0,40.0,17,17.0,1,,1.0,,...,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0,,,0.0


In [13]:
x_train = train.drop(["average_q_price"], axis=1)

In [14]:
x_train.to_scv('new_train.csv',index=False)
x_test.to_scv('new_test.csv',index=False)

AttributeError: 'DataFrame' object has no attribute 'to_scv'

In [15]:
x_train.to_csv('new_train.csv',index=False)
x_test.to_csv('new_test.csv',index=False)

In [16]:
train.drop(['id', 'price_doc'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

num_train = len(train)
df_all = pd.concat([train, test])

In [17]:
df_all = df_all.join(macro, on='timestamp', rsuffix='_macro')

In [18]:
print(df_all.shape)


(38057, 414)


In [19]:
X_train = X_all[:num_train]
X_test = X_all[num_train:]

NameError: name 'X_all' is not defined

In [20]:
X_train = X_all[:num_train]
X_test = X_all[num_train:]

NameError: name 'X_all' is not defined

In [21]:
df_all.drop(['timestamp', 'timestamp_macro'], axis=1, inplace=True)

In [22]:
factorize = lambda t: pd.factorize(t[1])[0]

df_obj = df_all.select_dtypes(include=['object'])

X_all = np.c_[
    df_all.select_dtypes(exclude=['object']).values,
    np.array(list(map(factorize, df_obj.iteritems()))).T
]
print(X_all.shape)

X_train = X_all[:num_train]
X_test = X_all[num_train:]


# Deal with categorical values
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()

for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]

df_values = pd.concat([df_numeric, df_obj], axis=1)


# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

X_train = X_all[:num_train]
X_test = X_all[num_train:]

(38057L, 412L)
(38057L, 412L)


In [23]:
X_train.to_csv('new_train.csv',index=False)


AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [24]:
df_all.shape()

TypeError: 'tuple' object is not callable

In [25]:
df_all

Unnamed: 0,0_13_all,0_13_female,0_13_male,0_17_all,0_17_female,0_17_male,0_6_all,0_6_female,0_6_male,16_29_all,...,provision_retail_space_modern_sqm,turnover_catering_per_cap,theaters_viewers_per_1000_cap,seats_theather_rfmin_per_100000_cap,museum_visitis_per_100_cap,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm
0,18654,8945,9709,23603,11317,12286,9576,4677,4899,17508,...,,,,,,,,,,
1,13729,6800,6929,17700,8702,8998,6880,3414,3466,15164,...,,,,,,,,,,
2,11252,5336,5916,14884,7063,7821,5879,2784,3095,19401,...,,,,,,,,,,
3,24934,12152,12782,32063,15550,16513,13087,6442,6645,3292,...,,,,,,,,,,
4,11631,5408,6223,15237,7124,8113,5706,2724,2982,5164,...,,,,,,,,,,
5,4632,2233,2399,5866,2831,3035,2418,1194,1224,4851,...,,,,,,,,,,
6,4884,2377,2507,6510,3165,3345,2459,1218,1241,19445,...,,,,,,,,,,
7,18654,8945,9709,23603,11317,12286,9576,4677,4899,17508,...,,,,,,,,,,
8,12269,5782,6487,15510,7309,8201,6507,3051,3456,17662,...,,,,,,,,,,
9,17469,8387,9082,22071,10587,11484,9347,4541,4806,15929,...,,,,,,,,,,


In [26]:
class Ensemble(object):
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models
    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True, random_state=2016))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                # y_holdout = y[test_idx]
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            S_test[:, i] = S_test_i.mean(1)
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)[:]
        return y_pred

In [27]:
import pickle
import xgboost as xgb

import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
from xgboost import XGBRegressor

In [45]:
xgb1 = XGBRegressor(
    learning_rate=0.1,
    max_depth=6,
    subsample= 0.6,
    colsample_bytree= 1,
    objective='reg:linear',
    nthread =4,
    seed=317
)

In [44]:
xgb2 = XGBRegressor(
    learning_rate=0.2,
    max_depth=6,
    subsample= 0.7,
    colsample_bytree= 0.7,
    objective= 'reg:linear',
    nthread =4,
    seed=624
)

## xgb2 = XGBRegressor(
    learning_rate=0.05,
    max_depth=5,
    subsample= 0.7,
    colsample_bytree= 0.7,
    objective= 'reg:linear',
    nthread =4,
    seed=2017
)

In [43]:
xgb3 = XGBRegressor(
learning_rate=0.05,
max_depth=5,
subsample= 0.7,
colsample_bytree= 0.7,
objective= 'reg:linear',
nthread =4,
seed=2017
)

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
from sklearn.ensemble import GradientBoostingRegressor

In [48]:
clf0 = GradientBoostingRegressor()

In [49]:
clf1 = RandomForestRegressor()

In [50]:
stacker = XGBRegressor()

In [51]:
base_models = [clf0,clf1,xgb1,xgb2,xgb3]

In [52]:
ems = Emsemble(5,stacker,base_models)

NameError: name 'Emsemble' is not defined

In [53]:
ens = Ensemble(5,stacker,base_models)

In [54]:
train_x


NameError: name 'train_x' is not defined

In [55]:
train

Unnamed: 0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,average_q_price,building_age,room_sq,floor_inverse
0,2011-08-20,43.0,27.0,4.0,,,,,,,...,0.885149,0.067498,1.221523,,,,0.808051,,,
1,2011-08-23,34.0,19.0,3.0,,,,,,,...,0.812660,0.268308,1.849999,,,,0.808051,,,
2,2011-08-27,43.0,29.0,2.0,,,,,,,...,0.824169,0.130991,3.219286,,,,0.808051,,,
3,2011-09-01,89.0,50.0,9.0,,,,,,,...,0.867661,2.297865,1.317732,,,,0.808051,,,
4,2011-09-05,77.0,77.0,4.0,,,,,,,...,0.922563,0.959783,1.520114,,,,0.808051,,,
5,2011-09-06,67.0,46.0,14.0,,,,,,,...,0.891841,0.135962,0.777406,,,,0.808051,,,
6,2011-09-08,25.0,14.0,10.0,,,,,,,...,0.900336,0.209409,1.031554,,,,0.808051,,,
7,2011-09-09,44.0,44.0,5.0,,,,,,,...,0.885149,0.083870,4.327839,,,,0.808051,,,
8,2011-09-10,42.0,27.0,5.0,,,,,,,...,0.844847,0.120164,2.272622,,,,0.808051,,,
9,2011-09-13,36.0,21.0,9.0,,,,,,,...,0.893788,0.157212,0.486598,,,,0.808051,,,


In [56]:
train.head()

Unnamed: 0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,average_q_price,building_age,room_sq,floor_inverse
0,2011-08-20,43.0,27.0,4.0,,,,,,,...,0.885149,0.067498,1.221523,,,,0.808051,,,
1,2011-08-23,34.0,19.0,3.0,,,,,,,...,0.81266,0.268308,1.849999,,,,0.808051,,,
2,2011-08-27,43.0,29.0,2.0,,,,,,,...,0.824169,0.130991,3.219286,,,,0.808051,,,
3,2011-09-01,89.0,50.0,9.0,,,,,,,...,0.867661,2.297865,1.317732,,,,0.808051,,,
4,2011-09-05,77.0,77.0,4.0,,,,,,,...,0.922563,0.959783,1.520114,,,,0.808051,,,


In [57]:
train['price_doc']

KeyError: 'price_doc'

In [58]:
test.head()

Unnamed: 0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,2015-07-01,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,...,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0,17.0,11.8,7.0
1,2015-07-01,79.2,,8,17.0,1,,3.0,,1.0,...,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0,,,9.0
2,2015-07-01,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,...,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0,55.0,10.15,2.0
3,2015-07-01,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,...,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0,,,0.0
4,2015-07-01,40.0,40.0,17,17.0,1,,1.0,,1.0,...,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0,,,0.0


In [59]:
train['price_doc']

KeyError: 'price_doc'

In [60]:
x_train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,1,2011-08-20,43.0,27.0,4.0,,,,,,...,,0.885149,0.067498,1.221523,,,,,,
1,2,2011-08-23,34.0,19.0,3.0,,,,,,...,,0.81266,0.268308,1.849999,,,,,,
2,3,2011-08-27,43.0,29.0,2.0,,,,,,...,,0.824169,0.130991,3.219286,,,,,,
3,4,2011-09-01,89.0,50.0,9.0,,,,,,...,,0.867661,2.297865,1.317732,,,,,,
4,5,2011-09-05,77.0,77.0,4.0,,,,,,...,,0.922563,0.959783,1.520114,,,,,,


In [61]:
y_train.head()

0    4.986523e+06
1    5.114382e+06
2    4.858663e+06
3    1.116640e+07
4    1.392088e+07
Name: price_doc, dtype: float64

In [62]:
x_test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,livArea_ratio,yrs_old,avgfloor_sq,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state
0,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,Investment,...,0.530769,19.0,2.3,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0
1,79.2,,8,17.0,1,,3.0,,1.0,OwnerOccupier,...,,,,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0
2,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,Investment,...,0.619753,57.0,5.02,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0
3,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,OwnerOccupier,...,0.573248,1.0,2.117647,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0
4,40.0,40.0,17,17.0,1,,1.0,,1.0,OwnerOccupier,...,1.0,,2.352941,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0


In [63]:
import pandas as pd


In [64]:
train = pd.read_csv('new_train.csv')

In [65]:
train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,1,2011-08-20,43.0,27.0,4.0,,,,,,...,,0.885149,0.067498,1.221523,,,,,,
1,2,2011-08-23,34.0,19.0,3.0,,,,,,...,,0.81266,0.268308,1.849999,,,,,,
2,3,2011-08-27,43.0,29.0,2.0,,,,,,...,,0.824169,0.130991,3.219286,,,,,,
3,4,2011-09-01,89.0,50.0,9.0,,,,,,...,,0.867661,2.297865,1.317732,,,,,,
4,5,2011-09-05,77.0,77.0,4.0,,,,,,...,,0.922563,0.959783,1.520114,,,,,,


In [66]:
test = pd.read_csv('new_test.csv')

In [67]:
test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,livArea_ratio,yrs_old,avgfloor_sq,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state
0,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,Investment,...,0.530769,19.0,2.3,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0
1,79.2,,8,17.0,1,,3.0,,1.0,OwnerOccupier,...,,,,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0
2,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,Investment,...,0.619753,57.0,5.02,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0
3,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,OwnerOccupier,...,0.573248,1.0,2.117647,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0
4,40.0,40.0,17,17.0,1,,1.0,,1.0,OwnerOccupier,...,1.0,,2.352941,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0


In [74]:
test['building_age'] = old_test['timestamp'].dt.year - test['build_year']
test.loc[test['building_age'] < 0, 'building_age'] = np.nan

test['room_sq'] = (test['life_sq']-test['kitch_sq'])/test['num_room']
test.loc[test['room_sq']<1,'room_sq']=np.nan

test['floor_inverse'] = test['max_floor']-test['floor']

In [73]:
old_test = pd.read_csv('../input/test.csv', parse_dates=['timestamp'])

In [75]:
test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,Investment,...,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0,17.0,11.8,7.0
1,79.2,,8,17.0,1,,3.0,,1.0,OwnerOccupier,...,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0,,,9.0
2,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,Investment,...,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0,55.0,10.15,2.0
3,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,OwnerOccupier,...,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0,,,0.0
4,40.0,40.0,17,17.0,1,,1.0,,1.0,OwnerOccupier,...,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0,,,0.0


In [76]:
test['lat']= test_loc['lat']
test['lon']= test_loc['lon']

In [77]:
test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,Investment,...,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0,17.0,11.8,7.0
1,79.2,,8,17.0,1,,3.0,,1.0,OwnerOccupier,...,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0,,,9.0
2,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,Investment,...,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0,55.0,10.15,2.0
3,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,OwnerOccupier,...,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0,,,0.0
4,40.0,40.0,17,17.0,1,,1.0,,1.0,OwnerOccupier,...,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0,,,0.0


In [78]:
test['lat']= old_test['id']
test['lon']= old_test['timestamp']

In [79]:
test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,Investment,...,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0,17.0,11.8,7.0
1,79.2,,8,17.0,1,,3.0,,1.0,OwnerOccupier,...,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0,,,9.0
2,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,Investment,...,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0,55.0,10.15,2.0
3,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,OwnerOccupier,...,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0,,,0.0
4,40.0,40.0,17,17.0,1,,1.0,,1.0,OwnerOccupier,...,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0,,,0.0


In [80]:
test['lat']= test_loc['lat']
test['lon']= test_loc['lon']
test['id']= old_test['id']
test['timestamp']= old_test['timestamp']

In [81]:
test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse,id,timestamp
0,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,Investment,...,0.038322,9.515225,0.342401,62.1,6.0,17.0,11.8,7.0,30474,2015-07-01
1,79.2,,8,17.0,1,,3.0,,1.0,OwnerOccupier,...,0.270927,1.117747,0.826083,,8.0,,,9.0,30475,2015-07-01
2,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,Investment,...,0.025994,2.979114,0.233371,50.2,6.0,55.0,10.15,2.0,30476,2015-07-01
3,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,OwnerOccupier,...,0.568177,1.11007,9.279626,108.0,51.0,,,0.0,30477,2015-07-01
4,40.0,40.0,17,17.0,1,,1.0,,1.0,OwnerOccupier,...,0.196621,1.374597,0.630014,40.0,17.0,,,0.0,30478,2015-07-01


In [82]:
test.to_csv('new_test.csv')

In [83]:
test.to_csv('new_test.csv',index=False)

In [84]:
train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,1,2011-08-20,43.0,27.0,4.0,,,,,,...,,0.885149,0.067498,1.221523,,,,,,
1,2,2011-08-23,34.0,19.0,3.0,,,,,,...,,0.81266,0.268308,1.849999,,,,,,
2,3,2011-08-27,43.0,29.0,2.0,,,,,,...,,0.824169,0.130991,3.219286,,,,,,
3,4,2011-09-01,89.0,50.0,9.0,,,,,,...,,0.867661,2.297865,1.317732,,,,,,
4,5,2011-09-05,77.0,77.0,4.0,,,,,,...,,0.922563,0.959783,1.520114,,,,,,


In [85]:
train.drop(['id','timestamp'],axis=1,inplace=True)
test.drop(['id','timestamp'],axis=1,inplace=True)

In [86]:
train.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,43.0,27.0,4.0,,,,,,,Investment,...,,0.885149,0.067498,1.221523,,,,,,
1,34.0,19.0,3.0,,,,,,,Investment,...,,0.81266,0.268308,1.849999,,,,,,
2,43.0,29.0,2.0,,,,,,,Investment,...,,0.824169,0.130991,3.219286,,,,,,
3,89.0,50.0,9.0,,,,,,,Investment,...,,0.867661,2.297865,1.317732,,,,,,
4,77.0,77.0,4.0,,,,,,,Investment,...,,0.922563,0.959783,1.520114,,,,,,


In [87]:
train_y = train['price_doc']

In [88]:
train_x = train.drop(['price_doc'],axis=1)

In [89]:
train_x.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,43.0,27.0,4.0,,,,,,,Investment,...,,0.885149,0.067498,1.221523,,,,,,
1,34.0,19.0,3.0,,,,,,,Investment,...,,0.81266,0.268308,1.849999,,,,,,
2,43.0,29.0,2.0,,,,,,,Investment,...,,0.824169,0.130991,3.219286,,,,,,
3,89.0,50.0,9.0,,,,,,,Investment,...,,0.867661,2.297865,1.317732,,,,,,
4,77.0,77.0,4.0,,,,,,,Investment,...,,0.922563,0.959783,1.520114,,,,,,


In [90]:
test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,39.0,20.7,2,9.0,1,1998.0,1.0,8.9,3.0,Investment,...,0.012682,0.867921,0.038322,9.515225,0.342401,62.1,6.0,17.0,11.8,7.0
1,79.2,,8,17.0,1,,3.0,,1.0,OwnerOccupier,...,0.048593,0.884253,0.270927,1.117747,0.826083,,8.0,,,9.0
2,40.5,25.1,3,5.0,2,1960.0,2.0,4.8,2.0,Investment,...,0.023337,0.813867,0.025994,2.979114,0.233371,50.2,6.0,55.0,10.15,2.0
3,62.8,36.0,17,17.0,1,2016.0,2.0,,3.0,OwnerOccupier,...,0.181953,0.901473,0.568177,1.11007,9.279626,108.0,51.0,,,0.0
4,40.0,40.0,17,17.0,1,,1.0,,1.0,OwnerOccupier,...,0.03706,0.884253,0.196621,1.374597,0.630014,40.0,17.0,,,0.0


In [94]:
from sklearn.cross_validation import KFold
ens.fit_predict(train_x,train_y,test)

ValueError: could not convert string to float: poor

In [92]:
ens

<__main__.Ensemble at 0xcca1898>

In [93]:
ens.n_folds

5

In [95]:
train.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,pts_floor_ratio,gender_ratio,kg_park_ratio,high_ed_extent,pts_x_state,lifesq_x_state,floor_x_state,building_age,room_sq,floor_inverse
0,43.0,27.0,4.0,,,,,,,Investment,...,,0.885149,0.067498,1.221523,,,,,,
1,34.0,19.0,3.0,,,,,,,Investment,...,,0.81266,0.268308,1.849999,,,,,,
2,43.0,29.0,2.0,,,,,,,Investment,...,,0.824169,0.130991,3.219286,,,,,,
3,89.0,50.0,9.0,,,,,,,Investment,...,,0.867661,2.297865,1.317732,,,,,,
4,77.0,77.0,4.0,,,,,,,Investment,...,,0.922563,0.959783,1.520114,,,,,,


In [97]:
for c in train_x.columns:
    if train_x[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_x[c].values))
        train_x[c] = lbl.transform(list(train_x[c].values))

for c in test.columns:
    if test[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [98]:
ens.fit_predict(train_x,train_y,test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [99]:
base_models2 = [xgb1,xgb2,xgb3]
ens2 = Ensemble(5,stacker,base_models2)

In [None]:
ens2.fit_predict(train_x,train_y,test)

In [1]:
xgb1._Booster.save_model('hh')

NameError: name 'xgb1' is not defined