In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras

sns.set_style('whitegrid')

# Load the dataset
df = pd.read_csv('houses_to_rent_brazil_kaggle.csv')

# Data Preprocessing
df = df[df['floor'] != '-']  # Remove rows with '-' in the 'floor' column
df = pd.get_dummies(df, columns=['city', 'animal', 'furniture'])

# Define your features (X) and target variable (y)
X = df.drop(columns=['total (R$)'])  # Features
y = df['total (R$)']  # Target variable

# Linear Regression
lr = LinearRegression()
lr.fit(X, y)

targets = df['rent amount (R$)']

x_train, x_test, y_train, y_test = train_test_split(df, targets, test_size=0.3, random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42) 
cross_val_scores = cross_val_score(lr, x_train, y_train, cv=kf, scoring='r2')
print("Cross-validation scores:", cross_val_scores)
print("Mean R^2:", cross_val_scores.mean())

# PCA (Principal Component Analysis)
pca = PCA(n_components=11)
df_pca = pca.fit_transform(X)
df_pca = pd.DataFrame(df_pca, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5', 'PCA6', 'PCA7', 'PCA8', 'PCA9', 'PCA10', 'PCA11'])

# Model Comparison
x_train, x_test, y_train, y_test = train_test_split(df_pca, y, test_size=0.3, random_state=42)

# Linear Regression
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression Mean Squared Error:", mse_lr)
print("Linear Regression R^2 Score:", r2_lr)
# SVM Regressor
svm_regressor = SVR()
svm_regressor.fit(x_train, y_train)
y_pred_svm = svm_regressor.predict(x_test)
mse_svm = mean_squared_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)
print("SVM Mean Squared Error:", mse_svm)
print("SVM R^2 Score:", r2_svm)
# Random Forest Regressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(x_train, y_train)
y_pred_rf = rf_regressor.predict(x_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print("Random Forest Mean Squared Error:", mse_rf)
print("Random Forest R^2 Score:", r2_rf)
# XGBoost Regressor
xgb_regressor = XGBRegressor()
xgb_regressor.fit(x_train, y_train)
y_pred_xgb = xgb_regressor.predict(x_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("XGBoost Mean Squared Error:", mse_xgb)
print("XGBoost R^2 Score:", r2_xgb)
# Ridge Regression
ridge_regressor = Ridge(alpha=1.0)
ridge_regressor.fit(x_train, y_train)
y_pred_ridge = ridge_regressor.predict(x_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge Mean Squared Error:", mse_ridge)
print("Ridge R^2 Score:", r2_ridge)

# Example of predicting rent for new data
original_cols = x_train.columns
new_data = pd.DataFrame([[4, 100, 2, 1, 2, 2, 0, 0, 1000, 500, 40]], columns=original_cols)
predicted_rent = lr.predict(new_data)
print("Predicted Rent (Linear Regression):", predicted_rent[0])

# Neural Network Model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

model = keras.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_data=(X_test_scaled, y_test))
mse, mae = model.evaluate(X_test_scaled, y_test)
print(f'Mean Squared Error (Neural Network): {mse}')
print(f'Mean Absolute Error (Neural Network): {mae}')

new_data_scaled = scaler.transform(new_data)
predicted_rent = model.predict(new_data_scaled)
print("Predicted Rent (Neural Network):", predicted_rent[0][0])


Cross-validation scores: [1. 1. 1. 1. 1.]
Mean R^2: 1.0
Linear Regression Mean Squared Error: 59.301799464344285
Linear Regression R^2 Score: 0.9999998980623702
SVM Mean Squared Error: 584303984.1214895
SVM R^2 Score: -0.0043972312900550925
Random Forest Mean Squared Error: 69615355.36308435
Random Forest R^2 Score: 0.880333743937955
Ridge Mean Squared Error: 59.30162466845111
Ridge R^2 Score: 0.9999998980626708
Predicted Rent (Linear Regression): 5543.766125778419
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 