## Importing the dataset

In [None]:
# Load the cleaned Rome rental listings (rome_rents_clean.csv) into a pandas DataFrame.
import numpy as np
import pandas as pd

rents_csv_path = 'italy-house-prices/rome_rents_clean.csv'
df = pd.read_csv(rents_csv_path)
# Preview the first rows (rendered as the cell output).
df.head()

## Data Preprocessing

In [None]:
# Drop the "datetime" column, turn placeholder strings into real missing
# values, then discard every row that still has at least one missing field.
placeholder_values = ['invalid', 'None', 'nan', ',']
df = (
    df.drop(columns=['datetime'])
    .replace(placeholder_values, np.nan)
    .dropna(how='any')
)

In [None]:
# Split the "stato" column into two columns: "condizioni" and "ristrutturato".
df[['condizioni', 'ristrutturato']] = df['stato'].str.split(' / ', expand=True)
df = df.drop(columns=['stato'])
print(df['condizioni'].unique())
# Ordinal encoding for "condizioni" (worst = 0, best = 3).
# Values missing from this dictionary become NaN when mapped.
traslations_dict = {
    'da ristrutturare': 0,
    'buono': 1,
    'ottimo': 2,
    'nuovo': 3
}
# Replace "ristrutturato" with the integer flag "in costruzione":
# 1 if "ristrutturato" is "in costruzione", 0 otherwise.
# The explicit cast stores 0/1 integers instead of booleans, like the other
# indicator columns built later in the notebook.
df['in costruzione'] = (df['ristrutturato'] == 'in costruzione').astype(int)
df = df.drop(columns=['ristrutturato'])
df['condizioni'] = df['condizioni'].map(traslations_dict)
# Translate the "classe energetica" column into numerical values.
# First print all the different values found in the column.
print("Different values in 'classe energetica' column:")
print(df['classe energetica'].unique())
# Ordinal encoding for the energy class (G = 0 ... A+ = 7).
# Values missing from this dictionary become NaN when mapped.
classe_energetica_dict = {
    'A+': 7,
    'A': 6,
    'B': 5,
    'C': 4,
    'D': 3,
    'E': 2,
    'F': 1,
    'G': 0
}
df['classe energetica'] = df['classe energetica'].map(classe_energetica_dict)
# Store the small count columns as 8-bit integers.
df['posti auto'] = df['posti auto'].astype('int8')
df['bagni'] = df['bagni'].astype('int8')
df['stanze'] = df['stanze'].astype('int8')
df.info()

In [None]:
# Discard listings whose "quartiere" value appears exactly once in the data.
quartiere_counts = df['quartiere'].value_counts()
rare_quartieri = quartiere_counts.loc[quartiere_counts.eq(1)].index
in_rare_quartiere = df['quartiere'].isin(rare_quartieri)
df = df.loc[~in_rare_quartiere]

In [None]:
# Load quartieri_mapping.csv, which contains more info about each quartiere.
quartieri_df = pd.read_csv('italy-house-prices/quartieri_mapping.csv')
# Left-join the mapping onto the listings via the quartiere name, then drop
# the duplicated key column coming from the mapping file.
df = df.merge(quartieri_df, left_on='quartiere', right_on='Quartiere', how='left')
df = df.drop(columns=['Quartiere'])

# Remove rows where any value is NaN *before* encoding. Doing it afterwards
# (once "Zone" has been replaced by its dummies) would let a listing whose
# quartiere has no Zone in the mapping survive with all-zero Zone_* columns.
df = df.dropna(how='any')

# One-hot encode "Zone" as integer 0/1 columns prefixed with "Zone_".
zone_dummies = pd.get_dummies(df['Zone'], prefix='Zone', dtype=int)
df = pd.concat([df.drop(columns=['Zone']), zone_dummies], axis=1)

# One-hot encode "quartiere" as integer 0/1 columns prefixed with "quartiere_".
quartiere_dummies = pd.get_dummies(df['quartiere'], prefix='quartiere', dtype=int)
df = pd.concat([df.drop(columns=['quartiere']), quartiere_dummies], axis=1)

df.info()

## Data Augmentation

In [None]:
# Derived features, all added in a single step:
#   superficie_squared - surface squared
#   mq_per_stanza      - surface divided by the number of rooms
#   stanze_totali      - rooms plus bathrooms
#   prezzo_al_mq       - price divided by surface
df = df.assign(
    superficie_squared=df['superficie'].pow(2),
    mq_per_stanza=df['superficie'].div(df['stanze']),
    stanze_totali=df['stanze'].add(df['bagni']),
    prezzo_al_mq=df['prezzo'].div(df['superficie']),
)

In [None]:
# Build the binary classification target "y_classificazione":
# 1 when a listing's prezzo_al_mq is above the dataset mean, 0 otherwise.
mean_prezzo_al_mq = df['prezzo_al_mq'].mean()
above_mean = df['prezzo_al_mq'].gt(mean_prezzo_al_mq)
df['y_classificazione'] = above_mean.astype(int)

# Report the mean price per square metre used as the threshold.
print(f"Mean prezzo al mq of Rome rents: {mean_prezzo_al_mq}")
df.head()

### Outlier Handling

In [None]:
# Remove outliers on 'superficie' and 'prezzo' with an IQR-style fence.
# NOTE: the bounds use the 20th and 80th percentiles, not the classic 25th/75th
# quartiles, so "IQR" here is the 20-80 inter-quantile range.
outlier_cols = ['superficie', 'prezzo']
Q1 = df[outlier_cols].quantile(0.20)
Q3 = df[outlier_cols].quantile(0.80)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# A row is an outlier if either column falls outside its fences.
is_outlier = ((df[outlier_cols] < lower_fence) | (df[outlier_cols] > upper_fence)).any(axis=1)
# Named `inlier_mask` rather than `filter` so the builtin filter() is not
# shadowed for the rest of the notebook session.
inlier_mask = ~is_outlier
cleaned_data = df.loc[inlier_mask].reset_index(drop=True)
cleaned_data.info()

## Data Normalization

In [None]:
# Split the data into training, validation and test sets and separate the
# features from the regression / classification targets.
# (Scaling itself happens in the next cell.)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from random import randint

# A fresh random seed is drawn on every run; print it so a run can be reproduced.
random_state = randint(0, 10000)
print(f"random_state used for the splits: {random_state}")

# Split the outlier-filtered frame (`cleaned_data`) rather than the raw `df`,
# otherwise the outlier-handling step above has no effect on the models.
# 10% is held out for testing; 20% of the remainder is used for validation.
train_val_df, test_df = train_test_split(cleaned_data, test_size=0.1, random_state=random_state)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=random_state)

# Separate features and target variables for regression and classification.
target_cols = ['prezzo', 'y_classificazione']
X_train = train_df.drop(columns=target_cols)
y_train_regression = train_df['prezzo']
y_train_classification = train_df['y_classificazione']
X_val = val_df.drop(columns=target_cols)
y_val_regression = val_df['prezzo']
y_val_classification = val_df['y_classificazione']
X_test = test_df.drop(columns=target_cols)
y_test_regression = test_df['prezzo']
y_test_classification = test_df['y_classificazione']

In [None]:
# 'prezzo_al_mq' is prezzo / superficie, and y_classificazione is a threshold
# on it, so keeping it among the features leaks both targets into the models.
# Remove it from every feature matrix (errors='ignore' keeps the cell re-runnable).
leaky_cols = ['prezzo_al_mq']
X_train = X_train.drop(columns=leaky_cols, errors='ignore')
X_val = X_val.drop(columns=leaky_cols, errors='ignore')
X_test = X_test.drop(columns=leaky_cols, errors='ignore')

# Scale only the numerical columns using RobustScaler.
numerical_cols = ['superficie', 'superficie_squared', 'posti auto', 'stanze', 'mq_per_stanza', 'stanze_totali']
print("Numerical columns to be normalized:")
print(numerical_cols)
scaler = RobustScaler()
# Fit on the training set only, then apply the same transformation to the
# validation and test sets so all three are on the same scale.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
X_train.head()

## Model Training

In [None]:
# Linear regression on the log of the price.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# The model is trained on the log-transformed target, so its predictions
# are on the log scale as well.
y_train_log = np.log(y_train_regression)
reg_model = LinearRegression()

# 5-fold cross-validation on the training set. The scorer returns negated
# MSE values, so they are negated again before being printed.
cv_scores = cross_val_score(
    reg_model,
    X_train,
    y_train_log,
    cv=5,
    scoring='neg_mean_squared_error',
)
print("Cross-validation MSE scores:", -cv_scores)
print("Mean cross-validation MSE:", -cv_scores.mean())

# Final fit on the whole training set.
reg_model.fit(X_train, y_train_log)

In [None]:
# Predict on the test set, report the error overall and per zone, and plot
# predicted vs actual prices.
# pyplot (not the top-level matplotlib package) provides figure/scatter/show.
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

y_pred_log = reg_model.predict(X_test)
# Convert predictions back to the original scale (the model predicts log-prices).
y_pred_original = np.exp(y_pred_log)

# Overall MSE on the original price scale.
mse = mean_squared_error(y_test_regression, y_pred_original)
print(f"Mean Squared Error on test set: {mse:.2f}")

# Absolute error and MSE for each one-hot zone column (Zone_*).
zones = [col for col in X_test.columns if col.startswith('Zone_')]
for zone in zones:
    # Positional boolean mask, valid for both the Series and the NumPy array.
    zone_mask = (X_test[zone] == 1).to_numpy()
    # If there are no samples in this zone, skip it.
    if not zone_mask.any():
        continue
    zone_actual = y_test_regression[zone_mask]
    zone_pred = y_pred_original[zone_mask]
    zone_abs_errors = np.abs(zone_actual - zone_pred)
    zone_mse = mean_squared_error(zone_actual, zone_pred)
    print(f"Zone: {zone}, absolute error mean: {zone_abs_errors.mean():.2f}, MSE: {zone_mse:.2f}")

# Print up to five predictions vs actual values for each zone.
for zone in zones:
    zone_mask = (X_test[zone] == 1).to_numpy()
    # If there are no samples in this zone, skip it.
    if not zone_mask.any():
        continue
    print(f"Predictions for zone {zone}:")
    # Row positions of the zone's samples, computed once per zone.
    for idx in np.flatnonzero(zone_mask)[:5]:
        print(f"Predicted: {y_pred_original[idx]:.2f}, Actual: {y_test_regression.iloc[idx]:.2f}")

# Predicted vs Actual plot; the dashed red line marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test_regression, y_pred_original, alpha=0.7, color='blue')
plt.plot([y_test_regression.min(), y_test_regression.max()], [y_test_regression.min(), y_test_regression.max()], 'r--', lw=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Predicted vs Actual Prices')
plt.show()

## Part 4: Evaluation

In this section, we will evaluate the models using the following metrics:

- Accuracy, Precision, Recall, and F1 Score
- Confusion Matrix
- ROC and AUC (binary and multiclass)
- Training vs. Validation Performance
- Optional: Computational cost and training time

We will also visualize these metrics using appropriate graphics.

In [None]:
# Evaluate the regression model on the test set: MSE, R2, prediction time,
# residual distribution, predicted-vs-actual scatter and coefficient magnitudes.
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Time the prediction step.
start_time = time.time()
# The model was fitted on log(prezzo): exponentiate so the predictions are on
# the same scale as y_test_regression before computing any metric.
y_pred_regression = np.exp(reg_model.predict(X_test))
end_time = time.time()

# Metrics
mse = mean_squared_error(y_test_regression, y_pred_regression)
r2 = r2_score(y_test_regression, y_pred_regression)

# Print metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
# The timed interval covers predict() only, not model fitting.
print(f"Prediction Time: {end_time - start_time:.2f} seconds")

# Residuals plot
plt.figure(figsize=(8, 6))
residuals = y_test_regression - y_pred_regression
sns.histplot(residuals, kde=True, bins=30, color='blue')
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

# Predicted vs actual; the dashed black line marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test_regression, y_pred_regression, alpha=0.7, color='green')
plt.plot([y_test_regression.min(), y_test_regression.max()], [y_test_regression.min(), y_test_regression.max()], 'k--', lw=2)
plt.title('Predicted vs Actual Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()

# Plot the 20 largest coefficients by absolute value as a feature-importance proxy.
# NOTE(review): the coefficients refer to the log-price target, and only the
# scaled numerical columns share a common scale - interpret with care.
feature_importance = np.abs(reg_model.coef_)
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False).head(20)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

In [None]:
# Mean absolute prediction error (in euro) for each zone of the test set,
# shown both as a table and as a bar chart.
zones = [name for name in X_test.columns if name.startswith('Zone_')]
zone_errors_data = []
for zone in zones:
    zone_mask = X_test[zone] == 1
    n_samples = zone_mask.sum()
    # Zones without any test sample are left out.
    if n_samples == 0:
        continue
    abs_errors = (y_test_regression[zone_mask] - y_pred_original[zone_mask]).abs()
    zone_errors_data.append({
        'Zone': zone.replace('Zone_', ''),
        'Avg Error (€)': abs_errors.mean(),
        'Samples': n_samples,
    })

# Collect the per-zone rows into a table.
zone_errors_df = pd.DataFrame(zone_errors_data)

# Print the table.
print("Average Error in Euro by Zone:")
print("-" * 50)
print(zone_errors_df.to_string(index=False))

# Bar chart of the average error per zone.
plt.figure(figsize=(12, 6))
bars = plt.bar(
    zone_errors_df['Zone'],
    zone_errors_df['Avg Error (€)'],
    color='steelblue',
    alpha=0.8,
)
plt.xlabel('Zone', fontsize=12)
plt.ylabel('Average Error (€)', fontsize=12)
plt.title('Average Prediction Error by Zone', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

# Write each bar's value just above it, centred horizontally.
for bar in bars:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    plt.text(label_x, bar_height, f'€{bar_height:.0f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()


### Classification Model Evaluation

#### Naive Bayes Classifier Evaluation

In [None]:
# 5-fold cross-validated accuracy of a Gaussian Naive Bayes classifier
# on the training set.
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

nb_model = GaussianNB()
cv_scores_nb = cross_val_score(
    nb_model,
    X_train,
    y_train_classification,
    cv=5,
    scoring='accuracy',
)
# Report the per-fold scores and their mean.
print("Naive Bayes Cross-validation accuracy scores:", cv_scores_nb)
print("Mean cross-validation accuracy:", cv_scores_nb.mean())

In [None]:
# Fit the Naive Bayes classifier on the training set and score it on the test set.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

nb_model.fit(X_train, y_train_classification)
y_pred_classification = nb_model.predict(X_test)

# Every scalar metric takes (true labels, predicted labels), so compute them in one pass.
accuracy, precision, recall, f1 = (
    metric(y_test_classification, y_pred_classification)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test_classification, y_pred_classification)

print("Naive Bayes Classification Accuracy:", accuracy)
print("Naive Bayes Classification Precision:", precision)
print("Naive Bayes Classification Recall:", recall)
print("Naive Bayes Classification F1 Score:", f1)
print("Naive Bayes Confusion Matrix:\n", conf_matrix)

#### Logistic Regression Classifier Evaluation

In [None]:
# 5-fold cross-validated accuracy of a logistic regression classifier
# on the training set.
from sklearn.linear_model import LogisticRegression

logreg_model = LogisticRegression(max_iter=1000)
cv_scores_logreg = cross_val_score(
    logreg_model,
    X_train,
    y_train_classification,
    cv=5,
    scoring='accuracy',
)
# Report the per-fold scores and their mean.
print("Logistic Regression Cross-validation accuracy scores:", cv_scores_logreg)
print("Mean cross-validation accuracy:", cv_scores_logreg.mean())

In [None]:
# Fit the logistic regression classifier on the training set and score it on the test set.
logreg_model.fit(X_train, y_train_classification)
y_pred_logreg = logreg_model.predict(X_test)

# Every scalar metric takes (true labels, predicted labels), so compute them in one pass.
accuracy_logreg, precision_logreg, recall_logreg, f1_logreg = (
    metric(y_test_classification, y_pred_logreg)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix_logreg = confusion_matrix(y_test_classification, y_pred_logreg)

print("Logistic Regression Classification Accuracy:", accuracy_logreg)
print("Logistic Regression Classification Precision:", precision_logreg)
print("Logistic Regression Classification Recall:", recall_logreg)
print("Logistic Regression Classification F1 Score:", f1_logreg)
print("Logistic Regression Confusion Matrix:\n", conf_matrix_logreg)