# Notebook for Regression Modelling

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

### Import Data

In [2]:
docs = pd.read_csv('Transformed_train.csv')
docs.head()

Unnamed: 0,area_type_Super built-up Area,area_type_Plot Area,area_type_Built-up Area,area_type_Carpet Area,availability,location_0,location_1,location_2,location_3,location_4,...,location_7,location_8,location_9,location_10,location_11,size,total_sqft,bath,balcony,price
0,1.0,0.0,0.0,0.0,12,0,0,0,0,0,...,0,0,0,0,1,2,1056.0,2.0,1.0,39.07
1,0.0,1.0,0.0,0.0,0,0,0,0,0,0,...,0,0,0,1,0,4,2600.0,5.0,3.0,120.0
2,0.0,0.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,1,1,3,1440.0,2.0,3.0,62.0
3,1.0,0.0,0.0,0.0,0,0,0,0,0,0,...,0,0,1,0,0,3,1521.0,3.0,1.0,95.0
4,1.0,0.0,0.0,0.0,0,0,0,0,0,0,...,0,0,1,0,1,2,1200.0,2.0,1.0,51.0


### Scaling Data

In [3]:
# Optional Step
# Standard Scaling Price
scaled_features = docs.copy()
col_names = ['price']
features = scaled_features[col_names]
price_scaler = StandardScaler().fit(features.values)
features = price_scaler.transform(features.values)
scaled_features[col_names] = features
docs = scaled_features

In [4]:
X = docs.iloc[:, :-1].values
y = docs.iloc[:, -1].values

In [5]:
# Optional Step
# Scaling whole data
sc_X = StandardScaler()
X = sc_X.fit_transform(X)

### Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Modelling

In [7]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
LRresults = 1 - np.sqrt(np.square(np.log10(y_pred_lr +1) - np.log10(y_test +1)).mean())
print(LRresults)

0.8180441287670885


In [8]:
import xgboost as xgb
xg =xgb.XGBRegressor()
xg.fit(X_train, y_train)
y_pred_xg = xg.predict(X_test)
XGresults = 1 - np.sqrt(np.square(np.log10(y_pred_xg +1) - np.log10(y_test +1)).mean())
print(XGresults)

0.8724076504460967


In [9]:
import lightgbm as lgb
train_data=lgb.Dataset(X_train,label=y_train)
params = {'learning_rate':0.001}
lg= lgb.train(params, train_data, 100)
y_pred_lg = lg.predict(X_test)
LGresults = 1 - np.sqrt(np.square(np.log10(y_pred_lg +1) - np.log10(y_test +1)).mean())
print(LGresults)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 348
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 20
[LightGBM] [Info] Start training from score 0.005515
0.7766253303824164


In [10]:
from sklearn.ensemble import GradientBoostingRegressor
gb= GradientBoostingRegressor()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
GBresults = 1 - np.sqrt(np.square(np.log10(y_pred_gb +1) - np.log10(y_test +1)).mean())
print(GBresults)

0.8729817005295868


In [11]:
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor()
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
ADAresults = 1 - np.sqrt(np.square(np.log10(y_pred_ada +1) - np.log10(y_test +1)).mean())
print(ADAresults)

0.8259258708426329


In [12]:
from sklearn.ensemble import BaggingRegressor
from sklearn import tree
br = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))
br.fit(X_train, y_train)
y_pred_br = br.predict(X_test)
BRresults = 1 - np.sqrt(np.square(np.log10(y_pred_br +1) - np.log10(y_test +1)).mean())
print(BRresults)

0.8725353456444195


In [13]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 50, random_state = 0)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
RFresults = 1 - np.sqrt(np.square(np.log10(y_pred_rf +1) - np.log10(y_test +1)).mean())
print(RFresults)

0.8781290532363268


In [14]:
from sklearn.svm import SVR
svr = SVR(kernel = 'rbf')
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)
SVRresults = 1 - np.sqrt(np.square(np.log10(y_pred_svr +1) - np.log10(y_test +1)).mean())
print(SVRresults)

0.8641751159899212


In [15]:
from sklearn.linear_model import RidgeCV
rcv = RidgeCV()
rcv.fit(X_train, y_train)
y_pred_rcv = rcv.predict(X_test)
RCVresults = 1 - np.sqrt(np.square(np.log10(y_pred_rcv +1) - np.log10(y_test +1)).mean())
print(RCVresults)

0.8182501645877711


In [16]:
from sklearn.ensemble import VotingRegressor
estimatorsvr = [('xg',xgb.XGBRegressor()),
              ('rf',RandomForestRegressor(n_estimators = 50, random_state = 0)),
              ('br',BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))),
              ('gb',GradientBoostingRegressor()),
              ('svr',SVR(kernel = 'rbf')),
              ('rcv',RidgeCV()),
              ('lr',LinearRegression())
              ]
vr = VotingRegressor(estimatorsvr, weights=[4,11,10,4,4,1,1])
vr.fit(X_train, y_train)    
y_pred_vr = vr.predict(X_test)
VRresults = 1 - np.sqrt(np.square(np.log10(y_pred_vr +1) - np.log10(y_test +1)).mean())
print(VRresults)

0.8827350180011972


In [17]:
from sklearn.ensemble import StackingRegressor
estimatorssr = [('xg',xgb.XGBRegressor()),
              ('rf',RandomForestRegressor(n_estimators = 50, random_state = 0)),
              ('br',BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))),
              ('gb',GradientBoostingRegressor()),
              ('svr',SVR(kernel = 'rbf'))
              ]
sr = StackingRegressor(estimators=estimatorssr,final_estimator=RandomForestRegressor(n_estimators=50,random_state=42))
sr.fit(X_train, y_train)    
y_pred_sr = sr.predict(X_test)
SRresults = 1 - np.sqrt(np.square(np.log10(y_pred_sr +1) - np.log10(y_test +1)).mean())
print(SRresults)

0.8767766105460618


In [18]:
# Artificial Neural Networks
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential

# Initialising the ANN
model = Sequential()
# Adding the input layer and the first hidden layer
model.add(Dense(64, activation = 'relu', input_dim = 21))
# Adding the second hidden layer
model.add(Dense(units = 128, activation = 'relu'))
model.add(Dropout(0.4))
# Adding the third hidden layer
model.add(Dense(units = 64, activation = 'relu'))
model.add(Dense(units = 128, activation = 'relu'))
model.add(Dropout(0.6))
# Adding the output layer
model.add(Dense(units = 1))
# Compiling the ANN
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
model.summary()
# Fitting the ANN to the Training set
model.fit(X_train, y_train, batch_size = 32, epochs = 200)

y_pred_ann = model.predict(X_test)
ANNresults = 1 - np.sqrt(np.square(np.log10(y_pred_ann +1) - np.log10(y_test +1)).mean())
print(ANNresults)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                1408      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1

Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 1

Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
0.7049657933268092


## Predicting on Test Data

In [19]:
docs_test = pd.read_csv('Transformed_test.csv')
docs_test.head()

Unnamed: 0,area_type_Super built-up Area,area_type_Plot Area,area_type_Built-up Area,area_type_Carpet Area,availability,location_0,location_1,location_2,location_3,location_4,...,location_7,location_8,location_9,location_10,location_11,size,total_sqft,bath,balcony,price
0,1.0,0.0,0.0,0.0,0,0,0,0,0,1,...,0,1,0,1,0,2,1225.0,2.0,2.0,
1,0.0,1.0,0.0,0.0,0,0,0,0,0,0,...,1,0,0,1,1,9,2400.0,9.0,2.0,
2,0.0,1.0,0.0,0.0,4,0,0,0,0,0,...,1,0,1,1,1,4,1650.0,5.0,2.0,
3,1.0,0.0,0.0,0.0,0,0,0,0,1,0,...,0,0,0,0,0,3,1322.0,3.0,1.0,
4,1.0,0.0,0.0,0.0,0,0,0,0,1,0,...,0,1,0,0,0,2,1161.0,2.0,1.0,


In [20]:
X_new = docs_test.iloc[:, :-1].values
y_new = docs_test.iloc[:, -1].values

In [21]:
#Only if scaled training data
#Scaling X_new
X_new = sc_X.transform(X_new)

In [22]:
# If scaled
y_sr = list(price_scaler.inverse_transform(sr.predict(X_new)))

In [None]:
# If not scaled
y_xg = list(xg.predict(X_new))

In [None]:
# Create Submission CSV
df_T = pd.DataFrame(y_sr,columns = ['price'])
df_T.to_csv("SRResult_inverseScaled.csv",index=None)