### 1. Data cleaning

#### 1.1. Data import

In [1]:
import pandas as pd

# Load the property dataset that was scraped, preprocessed and analyzed in earlier steps.
file_path = "../data/property_data.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

#### 1.2. Duplicates and garbage

In [2]:
# Remove rows that are exact copies of an earlier row.
# NOTE(review): this runs while 'Raw num:', 'URL' and 'ID number' are still
# present; if those are per-row identifiers no row can match another — confirm
# that this is the intended notion of "duplicate".
data = data.drop_duplicates()

# Discard the columns that are not used further on.
irrelevant_columns = [
    'Raw num:',
    'URL',
    'ID number',
    'Type of Sale',
    'Locality',
    'Zip code',
]
data = data.drop(columns=irrelevant_columns)

#### 1.3. NaNs

In [3]:
# Columns a row must have a value for; rows missing any of them are removed.
essential_columns = ['Price of property in euro', 'Number of bedrooms', 'Living area']

# Columns where a missing value is treated as 0.
clean_values = dict.fromkeys(['Kitchen', 'Terrace', 'Garden', 'Swimming pool'], 0)

data = (
    data
    .dropna(subset=essential_columns)  # 1) rows lacking an essential value
    .fillna(clean_values)              # 2) fill the listed columns with 0
    .dropna(axis='columns')            # 3) any column that still contains a NaN
)

#### [intermezzo: check that each property subtype belongs to only one property type]

In [4]:
# Distinct values of the two categorical columns.
unique_values_property = data['Type of property'].unique()
unique_values_subtype = data['Subtype of property'].unique()
print("The types of property:", unique_values_property)
print("The subtypes of property:", unique_values_subtype)

# Distinct subtypes seen for each of the two property types.
is_house = data['Type of property'] == 'house'
is_apartment = data['Type of property'] == 'apartment'
houses = data.loc[is_house, 'Subtype of property'].unique()
apartments = data.loc[is_apartment, 'Subtype of property'].unique()

print("The unique subtypes of houses:", houses)
print("The unique subtypes of apartments:", apartments)

# Subtypes that occur under both property types.
subtype_overlap = set(houses).intersection(apartments)

if not subtype_overlap:
    print("There is no overlap between subtypes of houses and apartments.")
else:
    print("There is an overlap between subtypes of houses and apartments.")
    print("Overlapping subtypes:", subtype_overlap)

# With no overlap, the subtype determines the type, so the 'property type' column can be dropped.

The types of property: ['house' 'apartment']
The subtypes of property: ['house' 'exceptional property' 'villa' 'mansion' 'apartment' 'penthouse'
 'duplex' 'loft' 'mixed use building' 'flat studio' 'apartment block'
 'town house' 'country cottage' 'service flat' 'bungalow' 'ground floor'
 'triplex' 'kot' 'castle' 'other property' 'chalet' 'manor house'
 'farmhouse']
The unique subtypes of houses: ['house' 'exceptional property' 'villa' 'mansion' 'mixed use building'
 'apartment block' 'town house' 'country cottage' 'bungalow' 'castle'
 'other property' 'chalet' 'manor house' 'farmhouse']
The unique subtypes of apartments: ['apartment' 'penthouse' 'duplex' 'loft' 'flat studio' 'service flat'
 'ground floor' 'triplex' 'kot']
There is no overlap between subtypes of houses and apartments.


#### 1.4. Categorical and numerical data

In [5]:
# The property type is already covered by the property subtype, so drop it.
data = data.drop(columns=['Type of property'])

# Take the subtype column out of the frame and expand it into one 0/1 indicator
# column per subtype, named 'Subtype_<value>'.
# NOTE(review): every subtype level is kept (no drop_first); combined with a
# linear model that fits an intercept the indicators are collinear — confirm
# this is acceptable before interpreting coefficients.
subtype_column = data.pop('Subtype of property')
one_hot_encoding = pd.get_dummies(subtype_column, prefix='Subtype', dtype=int)

# Append the indicator columns to the remaining columns and cast everything to int.
# NOTE(review): astype(int) truncates fractional values — confirm the remaining
# columns only hold whole numbers.
data = pd.concat([data, one_hot_encoding], axis=1).astype(int)

n_rows, n_columns = data.shape
print(f"{n_rows} rows \n{n_columns} columns")
display(data)

11013 rows 
30 columns


Unnamed: 0,Kitchen,Price of property in euro,Number of bedrooms,Living area,Terrace,Garden,Swimming pool,Subtype_apartment,Subtype_apartment block,Subtype_bungalow,...,Subtype_loft,Subtype_manor house,Subtype_mansion,Subtype_mixed use building,Subtype_other property,Subtype_penthouse,Subtype_service flat,Subtype_town house,Subtype_triplex,Subtype_villa
0,1,149000,2,105,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,319000,3,154,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,320000,3,88,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,960000,6,400,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,498000,5,140,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11191,1,379000,2,104,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11192,1,359000,2,98,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11193,1,139500,2,90,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11194,0,448000,1,46,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#select features and preprocess them as needed

In [7]:
#remove features that have too strong correlation between them

### 2. Data formatting

#### 2.1. Divide dataset for training and testing

In [9]:
from sklearn.model_selection import train_test_split

# Target is the price column; the features are every other column.
y = data['Price of property in euro']
X = data.drop(columns='Price of property in euro')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of both splits as a sanity check.
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)

Training data shape: (8810, 29) (8810,)
Testing data shape: (2203, 29) (2203,)


In [11]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only.
min_max_scaler = MinMaxScaler().fit(X_train)

# Rescale both splits using the ranges learned from the training split.
X_train_scaled = min_max_scaler.transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

### 3. Model selection

#### 3.1. Linear regression model

In [13]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor; the intercept setting is spelled out (it is the default).
model = LinearRegression(fit_intercept=True)

[ 259212.  633624.  386708. ... 1321508.  363448.  478796.]


In [None]:
#once your pipeline is fully ready, explore at least one more regression model

### 4. Train your model

#### 4.1. Train model on data

In [14]:
# Fit on the scaled training split, then predict prices for the scaled test split.
y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

### 5. Model evaluation

In [None]:
'''
Let's evaluate your model with an appropriate metric. 
What is a good measure of performance for your model?

Try to answer following questions:
- How could you improve this result?
- Which part of the process has the most impact on the results?
- How should you divide your time working on this kind of project?
'''

# still to test

# Coefficient of determination (R^2) of the fitted model on the held-out test split.
r2_score = model.score(X=X_test_scaled, y=y_test)
print("R2 Score:", r2_score)

"\nLet's evaluate your model with an appropriate metric. \nWhat is a good measure of performance for your model?\n\nTry to answer following questions:\n- How could you improve this result?\n- Which part of the process has the most impact on the results?\n- How should you divide your time working on this kind of project?\n"

In [None]:
# still to test

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict prices for the scaled test split.
y_pred = model.predict(X_test_scaled)

# Mean squared error between the true and predicted prices.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Root mean squared error, derived from the MSE computed above instead of
# calling mean_squared_error(..., squared=False): the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises
# a TypeError. Taking the square root works on every version.
rmse = mse ** 0.5
print("Root Mean Squared Error:", rmse)

# Mean absolute error between the true and predicted prices.
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

### 6. Model explainability

In [None]:
'''
Next to the overall performance of the model, it is also important to be able to explain to some degree how a model forms its predictions, or what contributes to its performance. Think about these questions:
- Which are the most important features to predict the price? How do you know?
- How do the insights from your modeling relate to those from you data exploration? (i.e. is there any evidence from your visuals that support your modeling results?)

'''

'\nNext to the overall performance of the model, it is also important to be able to explain to some degree how a model forms its predictions, or what contributes to its performance. Think about these questions:\n- Which are the most important features to predict the price? How do you know?\n- How do the insights from your modeling relate to those from you data exploration? (i.e. is there any evidence from your visuals that support your modeling results?)\n\n'