In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import scipy.signal as signal
from scipy import stats

# Reading the data

In [None]:
# Load the training set; every later preprocessing cell starts from `data`.
train_csv_path = "realestate_train-1.csv"
data = pd.read_csv(train_csv_path)

## PREPROCESSING

<img src="prep.png" width="400"/>


#### p.s.: Since there is no missing or improper data, we may skip the integration and reduction steps.



#### Converting date object to a numerical value.
- Since our models cannot interpret string or date values, we have to convert them to numbers.

In [None]:
# Keep a handle on the raw date values, then overwrite the column with
# integer day ordinals (datetime.toordinal) so it becomes a numeric feature.
date_column = data["date"]
parsed_dates = pd.to_datetime(date_column)
data["date"] = parsed_dates.map(dt.datetime.toordinal)
data

- One-hot-encoding the type, in order to have numerically meaningful results, i.e. the types house and unit are going to be represented via 1s and 0s


In [None]:
# Expand the categorical "type" column into indicator columns and swap
# them in for the original column.
type_one_hot_encoded = pd.get_dummies(data[["type"]])
data = data.drop(columns=["type"]).join(type_one_hot_encoded)


- Assigning target vector and feature matrix to the variables y and X respectively.


In [None]:
# Target: the price column, kept as a one-column DataFrame.
y = data.loc[:, ["price"]]
# Features: every remaining column.
X = data.drop(columns=["price"])
X

## Data Cleaning
- Here, we clear the dataset of outliers. The z-score method is used to remove the house samples whose price has a z-score with an absolute value greater than 3. The threshold of 3 is arbitrary, but it is a commonly used value.

In [None]:
# Absolute z-score of every price; rows above the threshold count as outliers.
z = np.abs(stats.zscore(y))
threshold = 3
# np.where returns row *positions*. The rows are later removed with
# DataFrame.drop, which works on index *labels*, so translate positions to
# labels. With the default RangeIndex the two coincide, so results are
# unchanged; with any other index this avoids dropping the wrong rows.
outlier_positions = np.where(z > threshold)[0]
outlier_indices = y.index[outlier_positions]

In [None]:
# Remove the flagged outlier rows from both the features and the target so
# the two stay aligned row-for-row.
clean_X = X.drop(index=outlier_indices)
clean_y = y.drop(index=outlier_indices)
clean_X

## Data Transformation
- Normalization of the training dataset.
- For normalization, `MinMaxScaler` from `sklearn.preprocessing` is used; the aim is to improve the accuracy and efficiency of the algorithms we are going to use in the further steps.
- The result is, in fact, the same; the data is merely scaled to the range between 0 and 1 for the sake of efficiency.
- p.s: We do not normalize the one-hot-encoded variables

In [None]:
scaler = MinMaxScaler()

# Work on a plain array copy of the cleaned features; only the first two
# columns (Date and Bedrooms) are rescaled.
features_array = np.copy(clean_X)
scaled_columns = scaler.fit_transform(features_array[:, :2])
fitted = scaler  # fit() returns the scaler itself, so this is the fitted object

# The one-hot encoded columns are passed through unscaled.
house_column = clean_X[["type_house"]]
unit_column = clean_X[["type_unit"]]

X_normalized = np.concatenate((scaled_columns, house_column, unit_column), axis=1)

X_normalized

# Training the model
### Using linear regression method from sklearn, the preprocessed data is fit.
- For our linear regression we use the feature matrix clean_X, and target vector i.e. price column of the dataset.

### Conclusion
- Model's coefficients can be interpreted as follows: If the weight corresponding to the "bedrooms" column is 1.17 x 10^5, an extra bedroom will lead to an increase of €117000 in the price.

- As it can be seen from the coefficients (model.coef_), not all of the features have a huge impact on the change of the price.  

In [82]:
# Ordinary least squares on the cleaned (unscaled) features; fit() returns
# the estimator, so construction and fitting are chained.
model = LinearRegression().fit(clean_X, clean_y)
model.coef_

array([[ 4.07138504e+01,  1.16848175e+05, -9.55526730e+03,
         9.55526730e+03]])

## Training a Recurrent Neural Network using the LSTM model from Keras

#### Long short-term memory structure

<img src="lstm_a.png" width="400"/>

#### LSTM network diagram

<img src="lstm_d.png" width="400"/>


- We can change the hyperparameter learning_rate and number of epochs. Learning rate is the most crucial hyperparameter of the algorithm. It must be chosen wisely.
- For optimization algorithm, we used Adam instead of Stochastic Gradient Descent as it is faster to converge.
- In order to avoid overfitting, we use a Dropout layer.

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Layer stack: LSTM(64) -> Dense(64, relu) -> Dropout(0.4) -> Dense(64, relu) -> Dense(1).
# The input shape is declared as (4, 1) — presumably the four feature columns
# with one value each; confirm against clean_X.
lstm_layers = [
    layers.Input((4, 1)),
    layers.LSTM(64),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
]
model_lstm = Sequential(lstm_layers)

# Mean squared error loss, Adam optimizer with learning rate 0.01; mean
# absolute error is reported as an additional metric.
model_lstm.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='mse',
    metrics=['mean_absolute_error'],
)

model_lstm.fit(clean_X, clean_y, epochs=10)

# Testing
#### Test data is imported. The same numerializing steps above are applied to the test data.  
- We are going to use **hold-out validation**. As the test data and the training data are given in separate files, we do not need to split the training data once again. The split ratio is 33%.

In [None]:
# Load the held-out test set and apply the same numeric conversions that
# were applied to the training data.
test_data = pd.read_csv("realestate_test.csv")

# Date column is converted into a numerical value (day ordinal).
# The test set's own dates must be parsed here, not the training dates.
test_date_column = test_data["date"]
test_col = pd.to_datetime(test_date_column)
test_date_ordinal = test_col.map(dt.datetime.toordinal)
test_data["date"] = test_date_ordinal

# Since the feature type is a string value, we one-hot-encode it.
# The indicator columns joined back must be the ones computed from the test
# rows, not the ones computed from the training frame.
test_type_one_hot_encoded = pd.get_dummies(test_data[["type"]])
test_data = test_data.drop(["type"], axis=1)
test_data = test_data.join(test_type_one_hot_encoded)

# Splitting test data into feature matrix and target vector
test_y = test_data[["price"]]
test_X = test_data.drop(["price"], axis=1)
test_X

- To test model A, we used the R² score (`r2_score`) from sklearn.
- Model A performs better than model B in this case since it is faster. 

In [None]:
from sklearn.metrics import r2_score

# r2_score expects (y_true, y_pred) and is not symmetric, so the ground-truth
# prices must be passed first and the model predictions second.
y_pred = model.predict(test_X)
print(r2_score(test_y, y_pred))
y_pred_lstm = model_lstm.predict(test_X)
print(r2_score(test_y, y_pred_lstm))

- As can be concluded from the heatmap below, there is not much correlation between the features. This means no feature has a direct impact on the price, which, in turn, means the information provided is not very useful for interpreting the price data. Therefore, I would recommend that the real estate company provide more detailed and better-correlated data.

In [None]:
# Pairwise correlations between all columns of `data`, drawn as a heatmap
# with the coefficient printed in each cell.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True)