https://www.youtube.com/watch?v=Wqmtf9SA_kk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from utils import *

In [None]:
# Load the scraped listings from the sibling scrape directory.
path = "../nybolig-scrape/"
data = pd.read_csv(f"{path}nybolig_data.csv")

# Keep only rows whose postal code lies in the inclusive range below.
postal_codes = (1000, 2900)
lowest_code, highest_code = postal_codes
in_postal_range = data['postal_code'].between(lowest_code, highest_code)
data = data[in_postal_range]

# Keep only rows whose type is 'ejerlejlighed'.
is_selected_type = data['type'] == 'ejerlejlighed'
data = data[is_selected_type]

# Overview of what is left: type counts, first rows, column summary.
type_counts = data['type'].value_counts()
display(type_counts)
first_rows = data.head()
display(first_rows)
# DataFrame.info() prints its summary itself; its return value is what gets displayed here.
display(data.info())

# Data Exploration

In [None]:
from sklearn.model_selection import train_test_split
# Target is the 'price' column; the features are every other column.
y = data['price']
X = data.drop(columns=['price'])

In [None]:
# Rows must satisfy both conditions: postal code within 1000..2920 (inclusive)
# and type equal to "ejerlejlighed".
postal_mask = data['postal_code'].between(1000, 2920)
type_mask = data['type'].eq('ejerlejlighed')
data = data.loc[postal_mask & type_mask]

# Number of remaining rows (shown as the cell result).
len(data)

In [None]:
# One histogram per column that DataFrame.hist can plot, on a 15x8 figure.
hist_figsize = (15, 8)
data.hist(figsize=hist_figsize)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(15, 8))
numeric_columns = data.select_dtypes(include=np.number)
correlations = numeric_columns.corr()
sns.heatmap(correlations, annot=True, cmap="YlGnBu")

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
from geopy.geocoders import Nominatim
import re

# Module-level Nominatim geocoder client; get_lat_long below reads this name.
loc = Nominatim(user_agent = "GetLoc")
def get_lat_long(addresses, geocoder=None):
    """Geocode each address and return its latitude and longitude.

    Only the part of an address that precedes its first digit is sent to the
    geocoder. Among the candidates returned, the one closest to the point
    (55, 12) is chosen, and it is accepted only if it lies inside the box
    54 <= latitude <= 56 and 12 <= longitude <= 13.

    Args:
        addresses: Iterable of address strings.
        geocoder: Optional object exposing ``geocode(query, exactly_one=False)``
            that returns a sequence of objects with ``latitude`` and
            ``longitude`` attributes, or ``None``. Defaults to the
            module-level ``loc`` client.

    Returns:
        A ``(lat, long)`` tuple of two lists aligned with ``addresses``. An
        entry is ``None`` when the address is not a string, has nothing in
        front of its first digit, produced no geocoding result, or resolved
        to a point outside the accepted box.
    """
    geocoder = loc if geocoder is None else geocoder
    street_pattern = re.compile(r"(.*?)\d")

    lat, long = [], []
    for address in addresses:
        best = None
        if isinstance(address, str):
            match = street_pattern.search(address)
            # Without any digit there is nothing to cut off: query the full text
            # instead of failing on ``match.group`` of ``None``.
            query = match.group(1) if match else address
            # An address starting with a digit leaves an empty query; skip the lookup.
            candidates = geocoder.geocode(query, exactly_one=False) if query.strip() else None
            # The geocoder may return None or an empty result; min() would fail on either.
            if candidates:
                # Select the candidate that is closest to (55, 12)
                best = min(
                    candidates,
                    key=lambda place: (place.latitude - 55) ** 2 + (place.longitude - 12) ** 2,
                )

        # Verify that the location is within (54, 12) and (56, 13)
        if best is not None and 54 <= best.latitude <= 56 and 12 <= best.longitude <= 13:
            lat.append(best.latitude)
            long.append(best.longitude)
        else:
            lat.append(None)
            long.append(None)
    return lat, long

def transformation(data):
    """Apply the fixed per-column transforms to ``data`` in place and return it.

    'postal_code', 'year_built' and 'year_rebuilt' become ``(x + 1) ** 2``;
    'rooms', 'size' and 'basement_size' become ``log(x + 1)``.
    """
    def shifted_square(values):
        return (values + 1) ** 2

    def shifted_log(values):
        return np.log(values + 1)

    # Column -> transform, listed in the order the columns are rewritten.
    plan = {
        'postal_code': shifted_square,
        'rooms': shifted_log,
        'size': shifted_log,
        'basement_size': shifted_log,
        'year_built': shifted_square,
        'year_rebuilt': shifted_square,
    }
    for column_name, transform in plan.items():
        data[column_name] = transform(data[column_name])
    return data

def encode(data, encoding = 'normal'):
    """Encode the 'type', 'energy_label' and 'postal_code' columns.

    With ``encoding='normal'`` each column is replaced, on ``data`` itself, by
    its integer category codes. With ``encoding='onehot'`` a new frame is
    built by ``pd.get_dummies`` (first level dropped, integer dummies).

    Raises:
        ValueError: If ``encoding`` is neither 'normal' nor 'onehot'.
    """
    categorical_columns = ['type', 'energy_label', "postal_code"]

    if encoding not in ('normal', 'onehot'):
        raise ValueError("The encoding parameter must be either 'normal' or 'onehot'")

    if encoding == 'onehot':
        return pd.get_dummies(data, columns=categorical_columns, drop_first=True, dtype=int)

    for column_name in categorical_columns:
        data[column_name] = data[column_name].astype('category').cat.codes
    return data

def drop_low_corr_(data, threshold=0.1, target='price'):
    """Return ``data`` without the numeric columns weakly correlated with ``target``.

    A numeric column is removed when the absolute value of its correlation
    with ``data[target]`` is strictly below ``threshold``. Columns whose
    correlation is NaN are kept, because the comparison with NaN is false.
    Non-numeric columns and the target column itself are never removed.

    Args:
        data: DataFrame containing the ``target`` column.
        threshold: Minimum absolute correlation a column needs to be kept.
        target: Name of the column the correlations are computed against.

    Returns:
        A new DataFrame; ``data`` itself is left untouched.
    """
    target_values = data[target]
    weak_columns = [
        column
        for column in data.select_dtypes(include=np.number).columns
        if column != target and abs(data[column].corr(target_values)) < threshold
    ]
    return data.drop(columns=weak_columns)

def preprocess_data(data, 
                    transformations: bool = False, 
                    encoding: str = 'normal', 
                    drop_low_corr: bool = False,
                    long_lat: bool = False):
    """Build the modelling frame from the raw listings.

    Steps, in order: fill missing 'year_rebuilt' with 'year_built' and missing
    'basement_size' with 0; optionally geocode 'address' into 'latitude' and
    'longitude'; optionally apply ``transformation``; add the binary
    'above_mean' label (1 when 'price' is above the mean price, else 0);
    encode the categorical columns with ``encode``; optionally drop the
    numeric columns weakly correlated with 'price'; finally drop 'url' and
    'address'.

    Args:
        data: Raw listings DataFrame. It is not modified; all work happens on
            a copy.
        transformations: Apply ``transformation`` when true.
        encoding: Passed to ``encode`` ('normal' or 'onehot').
        drop_low_corr: Apply ``drop_low_corr_`` when true.
        long_lat: Add 'latitude' and 'longitude' via ``get_lat_long`` when true.

    Returns:
        The preprocessed DataFrame.
    """
    # Work on a copy: the caller's frame is a filtered selection, and writing
    # columns into it would both alter it between runs and trigger pandas'
    # chained-assignment warning.
    data = data.copy()

    # Fill the missing values
    data['year_rebuilt'] = data['year_rebuilt'].fillna(data['year_built']).astype(int)
    data['basement_size'] = data['basement_size'].fillna(0)

    # Set the longitude and latitude
    if long_lat:
        data['latitude'], data['longitude'] = get_lat_long(data['address'])

    # Apply the transformations
    if transformations:
        data = transformation(data)

    # Set label as above or below mean (vectorised comparison, NaN counts as 0)
    mean = data['price'].mean()
    data['above_mean'] = (data['price'] > mean).astype(int)

    # Apply the encoding
    data = encode(data, encoding)

    # Drop the columns that have low correlation with the target variable.
    # drop_low_corr_ returns the reduced frame, so the result must be kept.
    if drop_low_corr:
        data = drop_low_corr_(data)

    # Drop the columns that are not needed
    data = data.drop(['url', 'address'], axis = 1)

    return data

In [None]:
# Integer-coded categoricals, low-correlation filter requested, no transforms, no geocoding.
preprocessed_data = preprocess_data(
    data,
    transformations=False,
    encoding='normal',
    drop_low_corr=True,
    long_lat=False,
)

In [None]:
# Histograms of the preprocessed columns.
preprocessed_data.hist(figsize=(15, 8))

# Correlation heatmap of the preprocessed numeric columns, on its own figure.
plt.figure(figsize=(15, 8))
preprocessed_corr = preprocessed_data.select_dtypes(include=np.number).corr()
sns.heatmap(preprocessed_corr, annot=True, cmap="YlGnBu")

#  Price Regression 

In [None]:
# Features exclude both targets ('price' and the derived 'above_mean' label);
# the regression target is 'price'. 80/20 split with a fixed seed.
regression_features = preprocessed_data.drop(columns=['price', 'above_mean'])
regression_target = preprocessed_data['price']
X_train, X_test, y_train, y_test = train_test_split(
    regression_features,
    regression_target,
    test_size=0.2,
    random_state=0,
)
display(X_train)
display(y_train)

## Linear, Lasso and Ridge Regression 

In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Hide FutureWarning messages and print small floats without scientific notation.
warnings.simplefilter('ignore', FutureWarning)
np.set_printoptions(suppress=True)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def regression(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` and report how it performs.

    Prints the 5-fold cross-validation scores on the training data, the
    model's score on the test data, one coefficient per training column, the
    intercept, and the MSE / MAE / R2 of the test predictions. Finally hands
    the predictions to ``plot_regression_results``.
    """
    model.fit(x_train, y_train)

    fold_scores = cross_val_score(model, x_train, y_train, cv=5)
    print("Scores", fold_scores)
    print("Test score: ", model.score(x_test, y_test))

    print("\nCoefficients: ")
    for column_name, weight in zip(x_train.columns, model.coef_):
        print(f"{column_name}: {weight}")
    print("\nIntercept: ", model.intercept_)

    # Predictions on the held-out data, shared by all metrics and the plot
    predictions = model.predict(x_test)

    print("\nEvaluation")
    metrics = (
        ("Mean squared error: ", mean_squared_error),
        ("Mean absolute error: ", mean_absolute_error),
        ("R2 score: ", r2_score),
    )
    for label, metric in metrics:
        print(label, metric(y_test, predictions))

    plot_regression_results(type(model).__name__, y_test, predictions)

# Plain, L2-regularised and L1-regularised linear models (alpha = 10 for both penalties).
linear_model_ = linear_model.LinearRegression()
ridge_model = linear_model.Ridge(alpha=10)
lasso_model = linear_model.Lasso(alpha=10)

# Evaluate them one after another: linear, then ridge, then lasso.
for estimator in (linear_model_, ridge_model, lasso_model):
    regression(estimator, X_train, y_train, X_test, y_test)

For each model, the output above contains the following:
  1. Scores: the cross-validation scores obtained with 5-fold cross-validation. Each value is the R-squared achieved by the model on one fold of the training data.
  2. Test score: the R-squared score of the model on the held-out test set.
  3. Coefficients: the weights assigned to each feature by the model.
  4. Intercept: the bias term of the model.

## Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings apart from the fixed seed.
forest_settings = {'random_state': 0}
forest = RandomForestRegressor(**forest_settings)
forest.fit(X_train, y_train)

In [None]:
# Test-set predictions, computed once and shared by the metrics and the plot.
forest_predictions = forest.predict(X_test)

print("Feauture Importance")
for column_name, weight in zip(X_train.columns, forest.feature_importances_):
    print(column_name, round(weight, 4))

print("R2 score: ", forest.score(X_test, y_test))
print("Mean squared error: ", mean_squared_error(y_test, forest_predictions))
print("Mean absolute error: ", mean_absolute_error(y_test, forest_predictions))

plot_regression_results(type(forest).__name__, y_test, forest_predictions)

## Hyperparameter Tuning

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [500, 1000, 1500],
#     'max_features': [3, 4, 5],
# }

# grid_search = GridSearchCV(forest, param_grid, cv = 5, scoring='neg_mean_squared_error', return_train_score=True)
# grid_search.fit(X_train, y_train)
# best_forest = grid_search.best_estimator_
# best_forest

In [None]:
# best_forest.score(X_test, y_test)

In [None]:
# print("Best parameters: ", grid_search.best_params_)
# print("R2 score: ", best_forest.score(X_test, y_test))
# print("Mean squared error: ", mean_squared_error(y_test, best_forest.predict(X_test)))
# print("Mean absolute error: ", mean_absolute_error(y_test, best_forest.predict(X_test)))

# plot_regression_results(best_forest.__class__.__name__, y_test, best_forest.predict(X_test))

## Extreme Gradient Boosting 

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted trees: 300 estimators of depth 3, learning rate 0.1, squared-error objective.
xgb_settings = {
    'objective': 'reg:squarederror',
    'n_estimators': 300,
    'learning_rate': 0.1,
    'max_depth': 3,
}
xgb_model = xgb.XGBRegressor(**xgb_settings)
xgb_model.fit(X_train, y_train)
pred = xgb_model.predict(X_test)

# Same three metrics as for the other regressors.
for label, metric in (
    ("R2 score: ", r2_score),
    ("Mean squared error: ", mean_squared_error),
    ("Mean absolute error: ", mean_absolute_error),
):
    print(label, metric(y_test, pred))

plot_regression_results(type(xgb_model).__name__, y_test, pred)


## Neural Network Approach

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
def neural_network(x_train, y_train, x_test, y_test):
    """Train a small dense network on the training data and report test metrics.

    The network has four ReLU hidden layers (64, 32, 16 and 8 units) and one
    linear output unit. It is trained with Adam on mean squared error for 400
    epochs with batch size 32. The function prints the value returned by
    ``model.evaluate`` on the test data (the compiled loss), followed by the
    R2, MSE and MAE of the test predictions, and plots the predictions with
    ``plot_regression_results``.

    Returns:
        None.
    """
    model = Sequential()

    # Adding layers
    num_features = x_train.shape[1]
    model.add(Dense(64, input_dim = num_features, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(16, activation = 'relu'))
    model.add(Dense(8, activation = 'relu'))
    model.add(Dense(1, activation = 'linear'))

    # Compiling and fitting
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    model.fit(x_train, y_train, epochs = 400, batch_size = 32, verbose = 0)
    print("Test score: ", model.evaluate(x_test, y_test))

    # Predicting the test set results; flatten the (n, 1) output to one value per row
    y_pred = model.predict(x_test).flatten()

    # Evaluating the model
    print("\nEvaluation")
    print("R2 score: ", r2_score(y_test, y_pred))
    print("Mean squared error: ", mean_squared_error(y_test, y_pred))
    print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))

    # Label the plot with this network's class, not with the unrelated global
    # xgb_model (which also made the function depend on an earlier cell).
    plot_regression_results(model.__class__.__name__, y_test, y_pred)
    return None

In [None]:
# Train and evaluate the dense network on the regression split.
neural_network(X_train, y_train, X_test, y_test)

# NOTE(review): identical second call — it builds and trains a fresh network again; confirm this repetition is intentional.
neural_network(X_train, y_train, X_test, y_test)

# Binary Classification

In [None]:
# Same features as for the regression task, but the target is the binary 'above_mean' label.
binary_features = preprocessed_data.drop(columns=['price', 'above_mean'])
binary_target = preprocessed_data['above_mean']
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    binary_features,
    binary_target,
    test_size=0.2,
    random_state=0,
)
display(X_train_binary)

## Random Forest Binary Classification

In [None]:
# Do the same for the binary classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

# Candidate classifiers for the above/below-mean label.
logistic_regression = LogisticRegression()
# Seed the tree ensembles so repeated runs give the same scores, matching the
# random_state = 0 used for the splits and the random forest regressor.
gradient_boosting = GradientBoostingClassifier(random_state = 0)
random_forest = RandomForestClassifier(random_state = 0)

def classification(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` and print its class name, CV scores, test score and accuracy.

    The cross-validation uses 5 folds of the training data; the test score is
    ``model.score`` on the test data and the accuracy is ``accuracy_score`` of
    the test predictions. Returns None.
    """
    print(type(model).__name__)
    model.fit(x_train, y_train)

    fold_scores = cross_val_score(model, x_train, y_train, cv=5)
    print("Cross Val Scores", fold_scores)

    print("Test score: ", model.score(x_test, y_test))
    predicted_labels = model.predict(x_test)
    print("Accuracy: ", accuracy_score(y_test, predicted_labels))

# Evaluate gradient boosting first, then the random forest.
for classifier in (gradient_boosting, random_forest):
    classification(classifier, X_train_binary, y_train_binary, X_test_binary, y_test_binary)
#classification(logistic_regression, X_train_binary, y_train_binary, X_test_binary, y_test_binary)