In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pickle

In [2]:
# Load the raw training table; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Column names that the correlation filter further down is never allowed to drop.
inputs = [
    'public_transport_station_km',
    'trc_count_2000',
    'leisure_count_500',
    'full_sq',
    'sub_area',
    'university_top_20_raion',
]

In [4]:
# Remove the 'id' column from the working frame.
data = data.drop(columns=['id'])

In [5]:
# Any column containing at least one missing value is removed outright (no imputation).
null_cols = data.columns[data.isna().any(axis=0)]
new_data = data.drop(columns=null_cols)
# Of the remaining columns, keep every one that is not object-typed.
num_data = new_data.select_dtypes(exclude='object')

In [6]:
# Columns that are neither int64 nor float64, taken from `data` (not `new_data`);
# copied so that later edits to cat_data do not touch `data`.
cat_data = (
    data
    .select_dtypes(exclude=['int64', 'float64'])
    .copy()
)

In [7]:
# The timestamp column is removed from the categorical frame.
cat_data = cat_data.drop(columns='timestamp')

In [8]:
# One-hot encode sub_area; the 'Ajeroport' indicator is removed, leaving it as the
# implicit all-zeros level.
dummies_sub = pd.get_dummies(cat_data['sub_area']).drop(columns='Ajeroport')
# The raw column is no longer needed once its dummies exist.
cat_data = cat_data.drop(columns='sub_area')


In [9]:
# One-hot encode ecology; the 'no data' indicator is removed, leaving it as the
# implicit all-zeros level.
dummies_ec = pd.get_dummies(cat_data['ecology']).drop(columns='no data')
# The raw column is no longer needed once its dummies exist.
cat_data = cat_data.drop(columns='ecology')


In [10]:
# One shared LabelEncoder instance; the encoding loop in the following cell refits
# it for every remaining column of cat_data.
labelencoder = LabelEncoder()

In [11]:
# Integer-encode every remaining categorical column.
# Plain column assignment (instead of `.loc[:, column] = ...`) replaces the column,
# so it takes the encoder's integer dtype. On pandas >= 2.0 the `.loc` form writes
# in place and leaves the column as object dtype.
# The same encoder object is refit on each pass, so after the loop it only holds
# the mapping of the last column processed.
for column in cat_data.columns:
    cat_data[column] = labelencoder.fit_transform(cat_data[column])

In [12]:
# Side-by-side layout: encoded categoricals, then ecology dummies, then sub_area dummies.
final_cat = pd.concat(objs=[cat_data, dummies_ec, dummies_sub], axis='columns')

In [13]:
# Full modelling frame: categorical block followed by the numeric block.
final1 = pd.concat(objs=[final_cat, num_data], axis='columns')

In [14]:
# Absolute pairwise correlations between all columns of the modelling frame.
corr_matrix = final1.corr().abs()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal is ignored. The builtin `bool` is used because the `np.bool` alias
# was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when it correlates above 0.9 with any column that precedes
# it, unless it is one of the protected names in `inputs`.
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > 0.9).any() and column not in inputs
]

In [15]:
# Apply the correlation filter: remove every column listed in to_drop.
final_nocorr = final1.drop(columns=to_drop)

In [16]:
# Build a one-row frame of default feature values and persist it.

# Numeric defaults: the column means of num_data, reshaped into a single row
# (index label 0) whose column labels are the original column names.
nums = num_data.mean(axis=0).reset_index().T
nums.columns = nums.iloc[0]
nums = nums.reindex(nums.index.drop('index'))

# Dummy defaults: every indicator is 0. The ecology dummies come before the
# sub_area dummies so the layout matches final_cat / final1 (the frame the model
# is trained on); previously the two groups were emitted in the opposite order.
cats = pd.DataFrame()
for column in [*dummies_ec.columns, *dummies_sub.columns]:
    cats.loc[0,column]=0

# Categorical defaults: the most frequent value of each encoded column.
# NOTE(review): mode() returns more than one row when a column has tied modes,
# which would make `inp` multi-row — confirm that cannot happen for this data.
b = cat_data.mode(axis=0)

inp = pd.concat([b,cats,nums],axis=1)
# Mirror the training frame: remove the correlation-filtered columns and the target.
inp = inp.drop(to_drop,axis=1)
inp = inp.drop('price_doc',axis=1)

# Save the default input row to disk. Despite the .csv extension the file is a
# pickle; the name is kept unchanged for whatever loads it.
filename_inp = 'input.csv'
with open(filename_inp, 'wb') as inp_file:
    pickle.dump(inp, inp_file)

In [17]:
# Scaler instance; it is fitted on the training split and pickled in later cells.
sc = StandardScaler()

In [18]:
# Separate the features from the price_doc target, then hold out 25% of the rows
# for evaluation with a fixed seed.
x = final_nocorr.drop(columns="price_doc")
y = final_nocorr["price_doc"]
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [19]:
# Fit the scaler on the training split only, then scale that split.
# Note: this rebinds X_train, so re-running the cell would refit on scaled data.
X_train = sc.fit(X_train).transform(X_train)

In [20]:
# Scale the held-out split with the statistics fitted on the training split.
X_test = sc.transform(X=X_test)

In [21]:
# Persist the fitted scaler. The context manager guarantees the file is flushed
# and closed even if pickling raises.
filename_sc = 'scalar.sav'
with open(filename_sc, 'wb') as sc_file:
    pickle.dump(sc, sc_file)


In [22]:
# Gradient-boosted regressor with library-default hyperparameters and a fixed seed.
model_gb = GradientBoostingRegressor(random_state=42)
# `gb` is bound to whatever fit() returns and is the name used by later cells.
gb = model_gb.fit(X=X_train, y=y_train)


In [23]:
# Persist the fitted gradient-boosting model. The context manager guarantees the
# file is flushed and closed even if pickling raises.
filename_gb = 'gb_model.sav'
with open(filename_gb, 'wb') as gb_file:
    pickle.dump(gb, gb_file)
 

In [24]:
# R^2 of the gradient-boosting model on the held-out split (shown as the cell output).
r2_score(y_true=y_test, y_pred=gb.predict(X_test))

0.6729041677502741

# Keras 

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Empty Sequential container; its layers are added in the next cell.
regressor = Sequential()

In [None]:
# Two hidden layers of 800 units (tanh, then sigmoid) followed by a single ReLU
# output unit. The input width is read from the scaled training matrix instead of
# being hard-coded (previously 257), so the network keeps matching the feature
# count when the preprocessing above yields a different number of columns.
n_features = X_train.shape[1]
regressor.add(Dense(units=800, kernel_initializer='uniform', activation='tanh', input_dim=n_features))
regressor.add(Dense(units=800, kernel_initializer='uniform', activation='sigmoid'))
regressor.add(Dense(units=1, kernel_initializer='uniform', activation='relu'))

In [None]:
from keras import backend as K
from keras.optimizers import Adam,SGD

In [None]:
def coeff_determination(y_true, y_pred):
    """Coefficient of determination (R^2) expressed with Keras backend ops.

    Returns 1 - SS_res / (SS_tot + epsilon), where SS_res is the sum of squared
    differences between y_true and y_pred, and SS_tot is the sum of squared
    deviations of y_true from its mean. The backend epsilon keeps the division
    finite when SS_tot is zero.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    centered_true = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(centered_true))
    return 1 - residual_ss / (total_ss + K.epsilon())

# Train with Adam (default settings) on mean squared error, and report R^2 via
# the custom metric defined above.
regressor.compile(
    optimizer=Adam(),
    loss='mean_squared_error',
    metrics=[coeff_determination],
)

In [None]:
# Fit the network on the scaled training split: 100 epochs, mini-batches of 100 rows.
regressor.fit(x=X_train, y=y_train, batch_size=100, epochs=100)