https://www.youtube.com/watch?v=Wqmtf9SA_kk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# Load the listings from the CSV file and preview the first rows.
data = pd.read_csv("nybolig_data.csv")
display(data.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
data.info()

In [1488]:
# Echo the loaded frame as the cell output (the notebook shows it truncated).
data

Unnamed: 0,url,address,postal_code,type,price,size,basement_size,rooms,year_built,year_rebuilt,energy_label
0,https://www.nybolig.dk/villa/3100/arfriisvej/2...,A R Friis Vej 9 3100 Hornbæk,3100,villa,9975000,269,12.0,9,1895,1999.0,D
1,https://www.nybolig.dk/fritidshus/7990/akoldin...,A. Koldings Vej 8 Sillerslev 7990 Øster Assels,7990,fritidsbolig,1495000,96,0.0,4,2008,,
2,https://www.nybolig.dk/villa/6400/aabenraavej/...,Aabenraavej 103 Ragebøl 6400 Sønderborg,6400,villa,1295000,219,0.0,7,1924,,E
3,https://www.nybolig.dk/villa/6100/aabenraavej/...,Aabenraavej 59 6100 Haderslev,6100,villa,2495000,226,35.0,6,1947,2010.0,C
4,https://www.nybolig.dk/villa/9240/aagade/27020...,Aagade 4 Vegger 9240 Nibe,9240,villa,495000,159,59.0,4,1958,,D
...,...,...,...,...,...,...,...,...,...,...,...
5420,https://www.nybolig.dk/villa/8210/oestrevej/25...,Østrevej 5 Hasle 8210 Aarhus V,8210,villa,5248000,111,90.0,6,1928,2012.0,C
5421,https://www.nybolig.dk/villa/8930/oestrupvej/2...,Østrupvej 11 Albæk 8930 Randers NØ,8930,villa,995000,231,0.0,6,1850,2006.0,E
5422,https://www.nybolig.dk/villa/7130/oestrupvej/2...,Østrupvej 26 Glud 7130 Juelsminde,7130,villa,895000,83,6.0,4,1952,,D
5423,https://www.nybolig.dk/villa/7130/oestrupvej/2...,Østrupvej 49 Glud 7130 Juelsminde,7130,villa,3995000,241,0.0,7,1986,2008.0,A2010


In [1489]:
# Per-column summary: non-null counts and dtypes, used to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5425 entries, 0 to 5424
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   url            5425 non-null   object 
 1   address        5425 non-null   object 
 2   postal_code    5425 non-null   int64  
 3   type           5425 non-null   object 
 4   price          5425 non-null   int64  
 5   size           5425 non-null   int64  
 6   basement_size  4528 non-null   float64
 7   rooms          5425 non-null   int64  
 8   year_built     5425 non-null   int64  
 9   year_rebuilt   2049 non-null   float64
 10  energy_label   4789 non-null   object 
dtypes: float64(2), int64(5), object(4)
memory usage: 466.3+ KB


In [None]:
from sklearn.model_selection import train_test_split
# Target is the listing price; every other column is a candidate predictor.
y = data['price']
X = data.drop(columns=['price'])

In [None]:
# Per-column histograms of the raw data. The trailing semicolon stops the cell
# from echoing the returned array of Axes objects above the figure.
data.hist(figsize=(15, 8));

In [None]:
# Annotated correlation heatmap of the numeric columns of the raw data.
plt.figure(figsize=(15, 8))
# Trailing semicolon: show only the figure, not the returned Axes repr.
sns.heatmap(data.select_dtypes(include=np.number).corr(), annot=True, cmap="YlGnBu");

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
def preprocess_data(data, transformations: bool = False, encoding: str = 'normal', drop_low_corr: bool = False,
                    corr_threshold: float = 0.1):
    """Return a cleaned and encoded copy of the listings frame.

    The frame passed in is never modified, so the function can be called
    repeatedly on the same raw data with different options.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``url``, ``address``, ``year_built``, ``year_rebuilt``,
        ``basement_size``, ``type`` and ``energy_label``. ``postal_code``,
        ``rooms`` and ``size`` are also needed when ``transformations`` is
        True, and ``price`` when ``drop_low_corr`` is True.
    transformations : bool
        When True, replace ``postal_code``, ``year_built`` and ``year_rebuilt``
        by ``(x + 1) ** 2`` and ``rooms``, ``size`` and ``basement_size`` by
        ``log(x + 1)``.
    encoding : str
        ``'normal'`` replaces ``type`` and ``energy_label`` by integer category
        codes; ``'onehot'`` expands them into 0/1 dummy columns, dropping the
        first level of each.
    drop_low_corr : bool
        When True, drop every numeric column other than ``price`` whose
        absolute correlation with ``price`` is below ``corr_threshold``.
    corr_threshold : float
        Cut-off used by ``drop_low_corr``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``url`` and ``address``, with missing values
        filled and the categorical columns encoded.

    Raises
    ------
    ValueError
        If ``encoding`` is neither ``'normal'`` nor ``'onehot'``.
    """
    # Validate up front so a bad argument fails before any work is done.
    if encoding not in ('normal', 'onehot'):
        raise ValueError("The encoding parameter must be either 'normal' or 'onehot'")

    # Drop the columns that are not needed. drop() returns a new frame, so all
    # assignments below land on that copy and not on the caller's DataFrame.
    data = data.drop(['url', 'address'], axis = 1)

    # Fill the missing values: a listing with no rebuild year gets its build
    # year, and a missing basement size is treated as "no basement".
    data['year_rebuilt'] = data['year_rebuilt'].where(~data['year_rebuilt'].isna(), data['year_built']).astype(int)
    data['basement_size'] = data['basement_size'].fillna(0)

    # Apply the transformations
    if transformations:
        for column in ('postal_code', 'year_built', 'year_rebuilt'):
            data[column] = (data[column] + 1) ** 2
        for column in ('rooms', 'size', 'basement_size'):
            data[column] = np.log(data[column] + 1)

    # Encode the categorical variables
    if encoding == 'normal':
        # Integer category codes; a missing value is encoded as -1.
        for column in ('type', 'energy_label'):
            data[column] = data[column].astype('category').cat.codes
    else:
        data = pd.get_dummies(data, columns = ['type', 'energy_label'], drop_first = True, dtype=int)

    # Drop the columns that have low correlation with the target variable
    if drop_low_corr:
        target = data['price']
        # Collect first, drop once: the frame is not modified while it is being
        # scanned. A NaN correlation (e.g. a constant column) compares False
        # and is therefore kept.
        weak_columns = [
            column
            for column in data.select_dtypes(include = np.number).columns
            if column != 'price' and abs(data[column].corr(target)) < corr_threshold
        ]
        data = data.drop(columns = weak_columns)

    return data

In [None]:
# Build the modelling frame: untransformed values, one-hot encoded categories,
# and weakly correlated numeric columns removed.
preprocessed_data = preprocess_data(
    data,
    transformations=False,
    encoding='onehot',
    drop_low_corr=True,
)

In [None]:
# Per-column histograms after preprocessing. The trailing semicolon stops the
# cell from echoing the returned array of Axes objects above the figure.
preprocessed_data.hist(figsize=(15, 8));

In [None]:
# Annotated correlation heatmap of the numeric columns after preprocessing.
plt.figure(figsize=(15, 8))
# Trailing semicolon: show only the figure, not the returned Axes repr.
sns.heatmap(preprocessed_data.select_dtypes(include=np.number).corr(), annot=True, cmap="YlGnBu");

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_data.drop(columns=['price']),
    preprocessed_data['price'],
    test_size=0.2,
    random_state=0,
)
X_train

# Regression 

## Lasso and Ridge Regression 

In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
# Notebook-wide display settings: print floats without scientific notation
# and hide FutureWarning messages.
np.set_printoptions(suppress=True)
warnings.simplefilter('ignore', FutureWarning)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def regression(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training data, report its scores, and plot its test errors.

    Parameters
    ----------
    model
        Estimator providing ``fit``, ``score``, ``predict``, ``coef_`` and
        ``intercept_``. It is fitted in place.
    x_train, y_train
        Training features and target. ``x_train`` must expose ``.columns``
        because each coefficient is printed next to its column name.
    x_test, y_test
        Held-out features and target. ``y_test`` must support ``.min()`` and
        ``.max()``.

    Returns
    -------
    None
        Results are reported through ``print`` (5-fold cross-validation scores
        on the training data, test score, coefficients, intercept) and two
        matplotlib figures: true values against predictions, and residuals
        against predictions.
    """
    model.fit(x_train, y_train)
    cv_scores = cross_val_score(model, x_train, y_train, cv=5)
    print("Scores", cv_scores)
    print("Test score: ", model.score(x_test, y_test))
    print("\nCoefficients: ")
    for feature, coef in zip(x_train.columns, model.coef_):
        print(f"{feature}: {coef}")
    print("\nIntercept: ", model.intercept_)

    # Predicting the test set results
    y_pred = model.predict(x_test)

    # Signed prediction error, and its magnitude (the distance of each point
    # from the perfect-fit diagonal).
    residuals = y_pred - y_test
    distances = np.abs(residuals)

    # Normalise the distances to [0, 1] for colouring. If every prediction is
    # exact the largest distance is 0, so fall back to all-zero colours
    # instead of dividing 0 by 0.
    largest_distance = np.max(distances)
    colors = distances / largest_distance if largest_distance > 0 else distances * 0.0

    # Plot true values vs predictions with color gradient. Labels are attached
    # to the artists themselves so the legend text cannot be paired with the
    # wrong artist, whatever order matplotlib enumerates them in.
    plt.scatter(y_test, y_pred, c=colors, label='Test values')
    plt.xlabel('True values')
    plt.ylabel('Predictions')
    # Plot the perfect fit line
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], c='r', label='Perfect fit')
    plt.legend()
    plt.title(f'True values vs Predictions ({model.__class__.__name__})')
    plt.colorbar(label='Distance from Diagonal')
    plt.show()

    # Plot residuals. The x-axis holds predicted values, so the zero line spans
    # the range of the predictions rather than the range of the true values.
    plt.scatter(y_pred, residuals, c=colors, label='Residuals')
    plt.hlines(y=0, xmin=np.min(y_pred), xmax=np.max(y_pred), colors='r', label='Perfect fit')
    plt.title(f'Residual plot ({model.__class__.__name__})')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.colorbar(label='Distance from Diagonal')
    plt.legend()
    plt.show()


#linear_model_ = linear_model.LinearRegression()
# Both penalised linear models use the same regularisation strength; Ridge is
# evaluated first, then Lasso.
ridge_model = linear_model.Ridge(alpha=10)
lasso_model = linear_model.Lasso(alpha=10)
for estimator in (ridge_model, lasso_model):
    regression(estimator, X_train, y_train, X_test, y_test)

For these results, we have the following:
  1. Lasso Scores: These are cross-validation scores obtained using 5-fold cross-validation. They represent the R-squared values achieved by the Lasso model on different folds of the training data. Each score corresponds to one fold.
  2. Lasso test score: This is the R-squared score of the model on the held-out test set.
  3. Lasso coefficients: These are the weights assigned to each feature by the Lasso model. 
  4. Lasso intercept: This is the bias term of the model.

# Extreme Gradient Boosting 