# In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load the dataset
# Replace 'your_dataset.csv' with your file name
data = pd.read_csv("your_dataset.csv")

# Step 2: Data Exploration
# Quick look at the first rows, the column dtypes / non-null counts, and the
# summary statistics of the numeric columns.
print("Dataset Head:\n", data.head())
print("\nDataset Info:\n")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()
print("\nDataset Description:\n", data.describe())

# Step 3: Data Cleaning (Optional, depends on the dataset)
# Drop every row that has a missing value in any column.
data = data.dropna()

# Step 4: Selecting Features and Target
# Assume 'Taxes' is the target column
if 'Taxes' not in data.columns:
    # Fail early with a readable message instead of a bare KeyError later on.
    raise KeyError("Target column 'Taxes' not found in the dataset")
y = data['Taxes']

# Candidate features: everything except the target and the 'Address' column
# (errors='ignore' keeps this working when 'Address' is absent).
candidate_features = data.drop(columns=['Taxes', 'Address'], errors='ignore')

# Keep only numeric/boolean columns: the random forest cannot be fitted on
# string-valued columns, and 'Address' may not be the only one in the file.
X = candidate_features.select_dtypes(include=['number', 'bool'])

# Make the column filtering visible rather than silently dropping features.
skipped_columns = candidate_features.columns.difference(X.columns)
if len(skipped_columns) > 0:
    print("Skipping non-numeric columns:", list(skipped_columns))

# Step 5: Train-Test Split
# 80/20 split; the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Train the Random Forest Regressor
# fit() returns the estimator itself, so construction and training can chain.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 7: Predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate the Model
# Error metrics on the test split; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for metric_name, metric_value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
    ("R2 Score", r2),
):
    print(f"{metric_name}: {metric_value}")

# Step 9: Visualize Predictions vs Actual
# Scatter of actual target values (x) against the model's predictions (y),
# drawn on an explicit Axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
ax.set_xlabel("Actual Taxes")
ax.set_ylabel("Predicted Taxes")
ax.set_title("Actual vs Predicted Taxes")
plt.show()

# Step 10: Feature Importance
# Pair each feature column with the importance reported by the fitted model
# and order the table from most to least important.
feature_importances = model.feature_importances_
features_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by="Importance", ascending=False)

print("\nFeature Importances:\n", features_df)

# Plot Feature Importances
# Horizontal bars: one bar per feature, in the sorted order of the table.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=features_df, x='Importance', y='Feature', ax=ax)
ax.set_title("Feature Importances")
plt.show()
