In [None]:
import pandas as pd
from IPython.display import display
import math
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

In [None]:
# Load the preprocessed data and discard the columns that are not used below.
dataset = pd.read_csv("../data/assignment_preprocessed.csv").drop(
    columns=['id', 'price_per_sqm', 'agent_id']
)

**Categorical to numerical data**
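Categorical features can be converted to numbers either by mapping each category to a numeric code or by one-hot encoding. We use the mapping for the feature-importance analysis (correlation and tree-based importances) and one-hot encoding for the model training.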

In [None]:
# Numeric codes for each categorical label; the keys must match the raw
# values in the data exactly.
ad_type_mapping = {
    'simple': 1,
    'up': 2,
    'premium': 3,
    'star': 4,
}
geo_mapping = {
    'northern sub': 1,
    'south beach': 2,
    'gentrification area': 3,
    'beesy neighborhood': 4,
}
# Only the named floor levels are listed here; other floor values are not remapped.
floor_mapping = {
    'ground-floor': 0,
    'basement': -1,
    'mezzanine': 0.5,
    'semi-basement': -0.5,
}
subtype_mapping = {
    'apartment': 1,
    'detached': 2,
    'maisonette': 3,
    'building': 4,
    'villa': 5,
    'studio': 6,
    'other residential': 7,
    'loft': 8,
    'apartment complex': 9,
    'bungalow': 10,
}

In [None]:
# Replace each categorical label with its numeric code. Values that are absent
# from a mapping keep their original value (via fillna) before the dtype cast.
for _column, _codes, _dtype in (
    ('geography_name', geo_mapping, int),
    ('ad_type', ad_type_mapping, int),
    ('floor', floor_mapping, float),
    ('subtype', subtype_mapping, int),
):
    _mapped = dataset[_column].map(_codes)
    dataset[_column] = _mapped.fillna(dataset[_column]).astype(_dtype)

## Ways to identify the most important attributes in predicting the price of a property

### Correlation Analysis

In [None]:
def correlation_analysis(dataset, target_variable, top_n=10):
    """Rank features by the strength of their correlation with the target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the target and the candidate feature columns. Only
        numeric and boolean columns take part in the correlation.
    target_variable : str
        Name of the column every other column is correlated against.
    top_n : int, default 10
        Maximum number of feature names to return.

    Returns
    -------
    list
        Up to ``top_n`` column names ordered by decreasing absolute
        correlation with ``target_variable``. The target itself is never
        part of the result.

    Raises
    ------
    KeyError
        If ``target_variable`` is not a numeric column of ``dataset``.
    """
    # Restrict to numeric data explicitly: recent pandas versions raise on
    # non-numeric columns instead of silently skipping them.
    numeric_data = dataset.select_dtypes(include=['number', 'bool'])
    corr_with_target = numeric_data.corr()[target_variable]

    # Remove the target by name rather than by position: a feature perfectly
    # correlated with the target ties with it at 1.0, so skipping "the first
    # row" could discard that feature and return the target instead.
    feature_corr = corr_with_target.drop(labels=[target_variable])

    # Undefined correlations (e.g. constant columns) cannot be ranked; the
    # stable sort keeps tied features in their original column order.
    ranked = feature_corr.abs().dropna().sort_values(ascending=False, kind='mergesort')

    return list(ranked.index[:top_n])

In [None]:
# Features ranked by absolute correlation with the 'price' column.
correlation_analysis_list = correlation_analysis(dataset, 'price')

### Feature Importance using Decision Trees 

In [None]:
# Separate the target column from the candidate features.
y = dataset['price']
X = dataset.drop(columns=['price'])

In [None]:
# Create the decision tree model. The seed is fixed because the tree picks
# among equally good splits at random, which would otherwise let the feature
# importances (and the attributes selected from them) change between runs.
model = DecisionTreeRegressor(random_state=42)

# fit model to data
model.fit(X, y)

In [None]:
# Importance score of every feature, in the column order of X.
importances = model.feature_importances_

# Feature positions ordered from most to least important.
sorted_importances = np.argsort(importances)[::-1]

# Names of the 10 most important features.
decision_tree_list = X.columns[sorted_importances[:10]].tolist()

### Feature Importance using Random Forest

In [None]:
# Seeded so the bootstrap samples, and therefore the importance ranking used
# for attribute selection, are reproducible across runs.
rf = RandomForestRegressor(n_estimators=150, random_state=42)
rf.fit(X, y)

In [None]:
# One row per feature, ordered from most to least important in the forest.
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Names of the 10 most important features according to the random forest.
random_forest_list = feature_importance['Feature'].head(10).tolist()

## Price Prediction

In [None]:
# Attributes that at least two of the three methods ranked as important:
# the union of all pairwise intersections. Sorting gives a deterministic
# feature order (plain set iteration order for strings varies between runs).
important_attributes = sorted(
    set(correlation_analysis_list).intersection(decision_tree_list)
    | set(correlation_analysis_list).intersection(random_forest_list)
    | set(decision_tree_list).intersection(random_forest_list)
)

In [None]:
# Reload the data so the categorical columns hold their original labels again
# (they are one-hot encoded below instead of being mapped to numeric codes).
dataset = pd.read_csv("../data/assignment_preprocessed.csv").drop(
    columns=['id', 'price_per_sqm', 'agent_id']
)

In [None]:
def oneHotEncode(df, colNames):
    """Return ``df`` with the object-dtype columns in ``colNames`` one-hot encoded.

    Each listed column whose dtype is ``object`` is replaced by indicator
    columns named ``<column>_<value>``, appended at the end of the frame.
    Listed columns of any other dtype are left untouched. The frame passed
    in is not modified.
    """
    object_dtype = np.dtype('object')
    for name in colNames:
        if df[name].dtype != object_dtype:
            continue
        indicator_columns = pd.get_dummies(df[name], prefix=name)
        # Append the indicators, then remove the column they were derived from.
        df = pd.concat([df, indicator_columns], axis=1).drop(columns=[name])
    return df

In [None]:

def get_results(y_test, y_pred): # Evaluate model performance
    """Print regression metrics comparing predictions with the true targets.

    Prints the mean squared error, its square root, the root mean squared
    logarithmic error (RMSLE) and the R2 score.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted values; a column vector of shape (n, 1) is accepted.

    Returns
    -------
    None
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print('Mean squared error:', round(mse))
    print('Root MSE:', round(math.sqrt(mse)))

    # RMSLE is sqrt(mean((log(1 + pred) - log(1 + true))^2)), not log(RMSE).
    # Flatten both inputs first so (n, 1) predictions do not broadcast
    # against 1-D targets into an (n, n) matrix.
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if (actual < 0).any() or (predicted < 0).any():
        # The logarithmic error is not defined for negative values.
        print("RMSLE", "undefined (negative target or prediction)")
    else:
        log_error = np.log1p(predicted) - np.log1p(actual)
        print("RMSLE", np.sqrt(np.mean(log_error ** 2)))
    print('R2 score:', r2)

In [None]:
# One-hot encode every object-dtype column and report the change in width.
print(f'There were {dataset.shape[1]} columns before encoding categorical features')
encoded_dataset = oneHotEncode(
    dataset,
    dataset.select_dtypes(include=['object']).columns.tolist(),
)
print(f'There are {encoded_dataset.shape[1]} columns after encoding categorical features')

In [None]:
# Non-object important attributes keep their column name after encoding.
encoded_important_attributes = list(dataset[important_attributes].select_dtypes(exclude=['object']).columns)

# Object-dtype attributes were expanded into '<column>_<value>' indicator columns.
for obj in list(dataset[important_attributes].select_dtypes(include=['object']).columns):
    prefix = obj + '_'
    # Skip missing values: pd.get_dummies creates no indicator column for NaN,
    # so a '<column>_nan' name would raise a KeyError when selecting features.
    encoded_important_attributes += [prefix + str(val) for val in dataset[obj].dropna().unique()]

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the attribute selection above was computed on the full dataset,
# including the rows that end up in the test split here.
y = encoded_dataset['price']
X = encoded_dataset[encoded_important_attributes]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

#### RandomForestRegressor

In [None]:
# Train a seeded 200-tree random forest on the training split.
rf = RandomForestRegressor(random_state=42, n_estimators=200).fit(X_train, y_train)

# Score on the training data itself (displayed as the cell output).
rf.score(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows with the random forest.
y_pred = rf.predict(X_test)

In [None]:
# Report the random forest's error metrics on the test split.
get_results(y_test, y_pred)

#### GradientBoostingRegressor

In [None]:
# Train a seeded gradient boosting model (200 stages, learning rate 0.1).
gb = GradientBoostingRegressor(
    random_state=42,
    learning_rate=0.1,
    n_estimators=200,
).fit(X_train, y_train)

# Score on the training data itself (displayed as the cell output).
gb.score(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows with the gradient boosting model.
y_pred = gb.predict(X_test)

In [None]:
# Report the gradient boosting model's error metrics on the test split.
get_results(y_test, y_pred)

#### XGBRegressor

In [None]:
# XGBoost regressor with the library's default hyperparameters, fitted on
# the training split without per-iteration logging.
XGBModel = XGBRegressor()
XGBModel.fit(X_train, y_train, verbose=False)


In [None]:
# Predict prices for the held-out test rows with the XGBoost model
# (the metrics themselves are computed in the next cell).
y_pred = XGBModel.predict(X_test)

In [None]:
# Report the XGBoost model's error metrics on the test split.
get_results(y_test, y_pred)

#### Neural Network

In [None]:
# Fully connected regressor: a 128-unit first layer sized to the feature
# count, three 256-unit hidden layers (all ReLU) and one linear output unit.
model = Sequential([
    Dense(128, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear'),
])

In [None]:
# Configure training: optimise the mean squared logarithmic error with Adam
# and additionally track MSE and MAE, then print the layer overview.
model.compile(
    optimizer='adam',
    loss='mean_squared_logarithmic_error',
    metrics=['mse', 'mae'],
)
model.summary()

In [None]:
# Convert the inputs used by the network to float32 NumPy arrays.
X_train, y_train, X_test = (
    np.asarray(part).astype(np.float32)
    for part in (X_train, y_train, X_test)
)

In [None]:
# Train for 250 epochs in batches of 32, holding out 20% of the training rows
# for validation; verbose=0 silences the per-epoch log.
model.fit(X_train, y_train, epochs=250, batch_size=32, validation_split = 0.2, verbose=0)

In [None]:
# Predict prices for the held-out test rows with the trained network.
# NOTE(review): Keras presumably returns these as a 2-D (n, 1) array rather
# than a 1-D vector - confirm the metric code handles that shape.
y_pred = model.predict(X_test)

In [None]:
# Report the neural network's error metrics on the test split.
get_results(y_test, y_pred)