### 1. Data cleaning

#### 1.1. Data import

In [5]:
import pandas as pd

def read_dataset(file_path):
    """
    Load a CSV file into a Pandas DataFrame.

    Parameters:
    file_path (str): Location of the CSV file to load.

    Returns:
    DataFrame: The contents of the CSV file, parsed with pandas' default
    ``read_csv`` settings.
    """
    return pd.read_csv(file_path)
    

#### 1.2. Duplicates, garbage and NaNs

In [2]:
import pandas as pd

def clean_data(data):
    """
    Clean the dataset in place and return it.

    The steps run in this order:
    1. Remove duplicate rows.
    2. Remove columns that are not used further on.
    3. Remove rows that lack a price, bedroom count or living area.
    4. Replace missing values with 0 in the kitchen, terrace, garden and
       swimming pool columns.
    5. Remove every column that still contains a NaN.

    Parameters:
    data (DataFrame): The dataset to clean. It is modified in place.

    Returns:
    DataFrame: The same DataFrame object that was passed in, cleaned.
    """
    irrelevant_columns = ['Raw num:', 'URL', 'ID number', 'Type of Sale', 'Locality', 'Zip code']
    essential_columns = ['Price of property in euro', 'Number of bedrooms', 'Living area']
    zero_fill = dict.fromkeys(['Kitchen', 'Terrace', 'Garden', 'Swimming pool'], 0)

    # Rows first: exact duplicates go before anything else is touched
    data.drop_duplicates(inplace=True)

    # Columns that carry no information for the later steps
    data.drop(columns=irrelevant_columns, inplace=True)

    # A row without any of the essential values is unusable
    data.dropna(subset=essential_columns, inplace=True)

    # For these columns a missing value is replaced by 0
    data.fillna(value=zero_fill, inplace=True)

    # Whatever column still has gaps is discarded entirely
    data.dropna(axis=1, inplace=True)

    return data

#### [intermezzo: check for uniqueness of property types and subtypes]

In [3]:
'''
# this code makes sure there is no overlap between the subtypes of houses and apartments

unique_values_property = data['Type of property'].unique()
print("The types of property:", unique_values_property)

unique_values_subtype = data['Subtype of property'].unique()
print("The subtypes of property:", unique_values_subtype)

# extract unique values from the 'Subtype of property' column for houses and apartments
houses = data[data['Type of property'] == 'house']['Subtype of property'].unique()
apartments = data[data['Type of property'] == 'apartment']['Subtype of property'].unique()

print("The unique subtypes of houses:", houses)
print("The unique subtypes of apartments:", apartments)

# find the intersection of unique subtypes between houses and apartments
subtype_overlap = set(houses) & set(apartments)

if subtype_overlap:
    print("There is an overlap between subtypes of houses and apartments.")
    print("Overlapping subtypes:", subtype_overlap)
else:
    print("There is no overlap between subtypes of houses and apartments.")

# so we can drop the 'property type' column

'''

'\n# this code makes sure there is no overlap between the subtypes of houses and apartments\n\nunique_values_property = data[\'Type of property\'].unique()\nprint("The types of property:", unique_values_property)\n\nunique_values_subtype = data[\'Subtype of property\'].unique()\nprint("The subtypes of property:", unique_values_subtype)\n\n# extract unique values from the \'Subtype of property\' column for houses and apartments\nhouses = data[data[\'Type of property\'] == \'house\'][\'Subtype of property\'].unique()\napartments = data[data[\'Type of property\'] == \'apartment\'][\'Subtype of property\'].unique()\n\nprint("The unique subtypes of houses:", houses)\nprint("The unique subtypes of apartments:", apartments)\n\n# find the intersection of unique subtypes between houses and apartments\nsubtype_overlap = set(houses) & set(apartments)\n\nif subtype_overlap:\n    print("There is an overlap between subtypes of houses and apartments.")\n    print("Overlapping subtypes:", subtype_over

#### 1.3. Categorical and numerical data

In [4]:
import pandas as pd

def preprocess_data(data):
    """
    Turn the cleaned dataset into an all-integer feature table.

    The following operations are performed:
    1. One-hot encode the 'Subtype of property' column (columns are
       prefixed with 'Subtype').
    2. Drop the 'Type of property' and 'Subtype of property' columns.
    3. Append the one-hot columns to the remaining columns.
    4. Cast the whole result to integer type.

    The input DataFrame is left untouched; a new DataFrame is returned.
    (Previously the two columns were dropped from the caller's frame in
    place while a different object was returned, so the caller's frame was
    left half-processed and a second call on it raised KeyError.)

    Parameters:
    data (DataFrame): The dataset to preprocess. Must contain the columns
        'Type of property' and 'Subtype of property'; every other column
        must be castable to int.

    Returns:
    DataFrame: A new, preprocessed DataFrame.

    Raises:
    KeyError: If one of the two required columns is missing.
    """
    # Build the indicator columns from the subtype before it is removed
    one_hot_encoding = pd.get_dummies(data['Subtype of property'], prefix='Subtype', dtype=int)

    # Non-mutating drop: returns a new frame, the caller's data keeps its columns
    remaining = data.drop(columns=['Type of property', 'Subtype of property'])

    # Original columns first, one-hot columns after, everything as int
    return pd.concat([remaining, one_hot_encoding], axis=1).astype(int)

In [15]:
# TODO: remove features that are too strongly correlated with each other

### 2. Data formatting

#### 2.1. Divide dataset for training and testing

In [6]:
def train_test_split_data(data, target_col='Price of property in euro', test_size=0.2, random_state=42):
    """
    Split the dataset into training and testing features and targets.

    Parameters:
    data (DataFrame): The full dataset, including the target column.
    target_col (str): Name of the column to predict.
    test_size (float): Passed through to ``train_test_split`` as ``test_size``.
    random_state (int): Passed through to ``train_test_split`` as ``random_state``.

    Returns:
    tuple: (X_train, X_test, y_train, y_test)
    """
    # The target is taken out of the frame; everything else is a feature
    target = data[target_col]
    features = data.drop(columns=target_col)

    # train_test_split yields the four parts in exactly the order we return them
    return tuple(
        train_test_split(features, target, test_size=test_size, random_state=random_state)
    )

In [7]:
def scale_data(X_train, X_test):
    """
    Min-Max scale the training and testing features.

    The scaler is fitted on the training data only and then applied to both
    sets, so the test data does not influence the fitted ranges.

    Parameters:
    X_train: Training features.
    X_test: Testing features.

    Returns:
    tuple: (X_train_scaled, X_test_scaled)
    """
    # Learn the per-feature ranges from the training set alone
    fitted_scaler = MinMaxScaler().fit(X_train)

    # Apply the same fitted transformation to both sets
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

### 3. Model selection, model training, model scoring

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score

def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, num_folds=5):
    """
    Fit a model, print its test-set and cross-validation metrics, and return it.

    Printed metrics:
    - the value of ``model.score`` on the test data (labelled "R2 Score";
      for scikit-learn regressors ``score`` is R2),
    - MSE, RMSE and MAE on the test data,
    - mean and standard deviation of the per-fold RMSE from cross-validation
      on the training data.

    Parameters:
    model: An estimator exposing ``fit``, ``predict`` and ``score``.
    X_train, X_test: Training and testing features.
    y_train, y_test: Training and testing targets.
    num_folds (int): Number of cross-validation folds (default 5).

    Returns:
    tuple: (model, score) where ``model`` is the fitted estimator and
    ``score`` is the value of ``model.score(X_test, y_test)``.
    """
    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Score on the test data (local name avoids shadowing sklearn's r2_score)
    test_score = model.score(X_test, y_test)
    print("R2 Score:", test_score)

    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", int(mse))

    # RMSE is the square root of the MSE. Deriving it here avoids the
    # `squared=False` argument, which newer scikit-learn releases no longer accept.
    rmse = mse ** 0.5
    print("Root Mean Squared Error:", int(rmse))

    # Calculate MAE
    mae = mean_absolute_error(y_test, y_pred)
    print("Mean Absolute Error:", int(mae))

    # Perform n-fold cross-validation on the training data
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error')

    # The scorer returns negated MSE per fold: flip the sign AND take the
    # square root to obtain RMSE (the sign flip alone only yields MSE).
    rmse_scores = (-scores) ** 0.5

    # Calculate the mean and standard deviation of the per-fold RMSE
    mean_rmse = rmse_scores.mean()
    std_rmse = rmse_scores.std()

    print("Mean RMSE:", int(mean_rmse))
    print("Standard Deviation of RMSE:", int(std_rmse))

    # Return the fitted model and its test score
    return model, test_score

### 4. Model evaluation

#### 4.1. Errors

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# NOTE(review): this cell expects `y_test` and `y_pred` to already exist in the
# session; nothing at module level in this notebook defines `y_pred`.

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", int(mse))

# Calculate RMSE as the square root of the MSE (the `squared=False` argument
# is no longer accepted by newer scikit-learn releases)
rmse = mse ** 0.5
print("Root Mean Squared Error:", int(rmse))

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", int(mae))

#### 4.2. Crossvalidation

In [None]:
from sklearn.model_selection import cross_val_score

# NOTE(review): this cell expects `model`, `X_train` and `y_train` to already
# exist in the session.

# Perform cross-validation (with num_folds deciding n-fold cross-validation)
num_folds = 5
scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error')

# Convert the scores from negative MSE to positive RMSE: flip the sign and
# take the square root (the sign flip alone would only give MSE)
rmse_scores = (-scores) ** 0.5

# Calculate the mean and standard deviation of RMSE scores
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print("Mean RMSE:", int(mean_rmse))
print("Standard Deviation of RMSE:", int(std_rmse))

### 5. Main

In [None]:
# Location of the scraped property listings
file_path = "../data/property_data.csv"

# Load the .csv, clean it (duplicates, irrelevant columns, NaN's) and encode
# the categorical data -- each stage feeds straight into the next
data = preprocess_data(clean_data(read_dataset(file_path)))

# Hold out a test set, then scale the features
X_train, X_test, y_train, y_test = train_test_split_data(data)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

# Linear regression: train, print metrics, keep the fitted model and its score
linear_regression_model, linear_regression_score = train_and_evaluate_model(
    LinearRegression(), X_train_scaled, X_test_scaled, y_train, y_test
)

# XGBoost: same procedure on the same split
xgboost_model, xgboost_score = train_and_evaluate_model(
    XGBRegressor(), X_train_scaled, X_test_scaled, y_train, y_test
)

# Summary of both test scores
print("Linear Regression Score:", linear_regression_score)
print("XGBoost Score:", xgboost_score)