In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import json
import lightgbm as lgb
import xgboost as xgb
import catboost


# --- Colab environment -------------------------------------------------------
# Mount Google Drive: the tuned-hyperparameter JSON files opened later in this
# cell are read from '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

# Ask the user to upload file(s) through the browser.
# NOTE(review): `uploaded` is never read in the visible notebook; presumably the
# uploaded file is house.csv and ends up at the path read below -- confirm.
from google.colab import files
uploaded = files.upload()

# --- Data --------------------------------------------------------------------
df = pd.read_csv('/content/house.csv')

# Target (dependent variable) first, then every remaining column as a feature.
Y = df['median_house_value']
X = df.drop('median_house_value', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=195,
)

# Min-max scaling is learned from the training split only and then applied to
# both splits, so no information from the test rows leaks into the scaler.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ensemble models with untuned settings (CatBoost gets explicit values).
# NOTE(review): nothing in the visible notebook reads `new_models`; it is kept
# only so that any other cell that refers to it keeps working.
new_models = {
    "Random Forest": RandomForestRegressor(random_state=195),
    "XGBoost": xgb.XGBRegressor(objective="reg:squarederror", random_state=195),
    "CatBoost": catboost.CatBoostRegressor(learning_rate=0.1, iterations=500, depth=6, verbose=0, random_state=195),
    "Light GBM": lgb.LGBMRegressor(random_state=195)
}


def _load_best_params(path):
    """Return the parsed content of the JSON file at `path`.

    The file is expected to hold a mapping of model label -> keyword
    arguments for that model's constructor.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _params_for(params, *labels):
    """Return the entry of `params` stored under the first label that exists.

    Several labels may be given to tolerate spelling variants of one model
    name. Raises KeyError (listing the labels that are available) when none
    of them is present.
    """
    for label in labels:
        if label in params:
            return params[label]
    raise KeyError(
        f"none of {labels!r} found in tuned parameters; available keys: {sorted(params)!r}"
    )


# Load Best Parameters from Google Drive (new models)
new_best_params_path = '/content/drive/My Drive/best_params_new.json'
new_best_params = _load_best_params(new_best_params_path)

# Re-initialize the new models with their tuned parameters.
# The LightGBM entry was originally looked up as "LightGBM" although the model
# is labelled "Light GBM" everywhere else, so both spellings are accepted.
optimized_new_models = {
    "Random Forest": RandomForestRegressor(**_params_for(new_best_params, "Random Forest"), random_state=195),
    "XGBoost": xgb.XGBRegressor(**_params_for(new_best_params, "XGBoost"), objective="reg:squarederror", random_state=195),
    "CatBoost": catboost.CatBoostRegressor(**_params_for(new_best_params, "CatBoost"), verbose=0, random_state=195),
    "Light GBM": lgb.LGBMRegressor(**_params_for(new_best_params, "LightGBM", "Light GBM"), random_state=195)
}

# Load Best Parameters for the old models from Google Drive
best_params_path = '/content/drive/My Drive/best_params.json'
best_params = _load_best_params(best_params_path)

# Re-initialize the old models with their tuned parameters
optimized_old_models = {
    "Linear Regression": LinearRegression(),  # No tuning needed
    "Decision Tree": DecisionTreeRegressor(**_params_for(best_params, "Decision Tree"), random_state=195),
    "Support Vector Machine": SVR(**_params_for(best_params, "Support Vector Machine")),
    "k-Nearest Neighbors": KNeighborsRegressor(**_params_for(best_params, "k-Nearest Neighbors"))
}

# Every model that gets trained and compared: the four tuned "old" models
# followed by the four tuned "new" ones. The default-parameter placeholders
# that used to be created here were replaced key-for-key by the tuned models
# before use, so they are no longer instantiated.
all_models = {**optimized_old_models, **optimized_new_models}

# Reference only -- never executed. The search space below is wrapped in a bare
# triple-quoted string, which Python evaluates and immediately discards, so
# neither the GridSearchCV import nor `param_grid` exists at runtime.
# Presumably this search has already been run and its results are the
# best_params*.json files loaded earlier in this cell -- confirm before relying
# on that. Only the grid definition is preserved here; the search call is not.
"""
# GridSearchCV to find best parameters for each model
from sklearn.model_selection import GridSearchCV

param_grid = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20]
    },
    'XGBoost': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7]
    },
    'CatBoost': {
        'learning_rate': [0.05, 0.1, 0.2],
        'iterations': [200, 500],
        'depth': [4, 6]
    },
    'Light GBM': {
        'learning_rate': [0.05, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10]
    }
}
"""
# Fit each model on the scaled training split, then score its predictions on
# both the training and the test split.
model_metrics = []
for model_name, model in all_models.items():
    model.fit(X_train_scaled, Y_train)
    Y_train_pred = model.predict(X_train_scaled)
    Y_test_pred = model.predict(X_test_scaled)

    # One row per model: name, then (train, test) for R2, MSE and MAE, in the
    # same order as the column names given to the DataFrame below.
    row = [model_name]
    for metric in (r2_score, mean_squared_error, mean_absolute_error):
        row.append(metric(Y_train, Y_train_pred))
        row.append(metric(Y_test, Y_test_pred))
    model_metrics.append(row)

# Tabulate the scores so the models can be compared side by side.
model_metrics_df = pd.DataFrame(
    model_metrics,
    columns=[
        "Model",
        "R2_Train", "R2_Test",
        "MSE_Train", "MSE_Test",
        "MAE_Train", "MAE_Test",
    ],
)
print(model_metrics_df)

# Per-metric views of the results table (model name plus train/test columns).
r2_df = model_metrics_df.loc[:, ["Model", "R2_Train", "R2_Test"]]
mse_df = model_metrics_df.loc[:, ["Model", "MSE_Train", "MSE_Test"]]
mae_df = model_metrics_df.loc[:, ["Model", "MAE_Train", "MAE_Test"]]

# The three models with the highest test-set R2.
top_3_models = r2_df.sort_values(by="R2_Test", ascending=False).iloc[:3]

# Grouped bar chart: one train bar and one test bar per selected model.
fig = go.Figure()
for trace_name, column in (("Train R2", "R2_Train"), ("Test R2", "R2_Test")):
    fig.add_trace(
        go.Bar(name=trace_name, x=top_3_models["Model"], y=top_3_models[column])
    )
fig.update_layout(
    barmode='group',
    title="R2 Scores for Top 3 Models",
    xaxis_title="Model",
    yaxis_title="R2 Score",
)
fig.show()

# Parity plots (actual vs. predicted target on the test split), one figure per
# model. Each figure is bound to `parity_fig` rather than `fig`, so the R2 bar
# chart built before this loop is still what `fig` refers to when it is
# exported below; reusing `fig` here made the export save the last parity plot
# under the name "r2_scores.html".
actual_min, actual_max = Y_test.min(), Y_test.max()  # same for every model
for model_name, model in all_models.items():
    predicted_values = model.predict(X_test_scaled)
    parity_fig = go.Figure()
    parity_fig.add_trace(go.Scatter(x=Y_test, y=predicted_values, mode='markers', name=model_name))
    # Diagonal y = x: points on this line are predicted exactly.
    parity_fig.add_trace(go.Scatter(x=[actual_min, actual_max], y=[actual_min, actual_max], mode='lines', name='Perfect Prediction', line=dict(color='red', dash='dash')))
    parity_fig.update_layout(
        title=f'Parity Plot: {model_name}',
        xaxis_title="Actual Values",
        yaxis_title="Predicted Values"
    )
    parity_fig.show()

# Save the R2 comparison chart as a standalone HTML file.
fig.write_html("r2_scores.html")




MessageError: Error: credential propagation was unsuccessful

In [3]:
# Install CatBoost into the environment of the running kernel. This cell has to
# be executed before the cell that runs `import catboost`.
# `%pip` targets the kernel's own interpreter, and the version is pinned to the
# release that was installed when this notebook was last run.
%pip install -q catboost==1.2.7

Collecting CatBoost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CatBoost
Successfully installed CatBoost-1.2.7


In [5]:
import plotly.graph_objects as go

# Write the figure currently bound to `fig` to a standalone HTML file.
# `fig` is not created in this cell; it has to be left in the kernel by the
# model-evaluation cell. On a fresh kernel (or when that cell stopped early)
# fail with an actionable message instead of a bare NameError.
# NOTE(review): the file name implies the R2 bar chart is intended -- verify
# that `fig` still refers to that chart when this cell is run.
if "fig" not in globals():
    raise NameError(
        "name 'fig' is not defined: run the model-evaluation cell to completion "
        "before exporting r2_scores.html"
    )
fig.write_html("r2_scores.html")



NameError: name 'fig' is not defined