In [None]:
!pip install tensorflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing
import joblib
import shap
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load dataset
# Fetch the California housing data and assemble one DataFrame: a column per
# feature (named after data.feature_names) plus the target under 'Price'.
data = fetch_california_housing()
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(Price=data.target)

# Exploratory Data Analysis
# Plot 1: annotated heatmap of the pairwise correlations between all columns
# of df (two-decimal labels).
correlations = df.corr()
plt.figure(figsize=(12, 6))
heatmap_ax = sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm')
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

# Plot 2: scatter-matrix restricted to a hand-picked subset of columns.
pairplot_columns = ['MedInc', 'HouseAge', 'AveRooms', 'AveOccup', 'Price']
sns.pairplot(df.loc[:, pairplot_columns])
plt.show()

# Data Preprocessing
# Separate the feature matrix from the regression target.
X = df.drop(columns=['Price'])
y = df['Price']

# Train-test split
# Split the RAW features before fitting any transformer. Fitting the scaler on
# the full dataset would let the test rows influence the scaling statistics
# (data leakage) and make the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn mean / variance from the training rows only, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Scaled version of the complete feature matrix, kept under its original name
# for any later code that expects it; it now uses the training-set statistics.
X_scaled = scaler.transform(X)

# Polynomial Features
# Pairwise interaction terms (no squared terms) on top of the scaled features.
# Transforming the already-split arrays keeps the rows aligned with
# X_train / X_test without running a second train_test_split.
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
X_poly = poly.transform(X_scaled)

# Model Training & Evaluation
# Candidate regressors, keyed by the display name used in the printed report
# and in the `results` table.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "ElasticNet Regression": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, random_state=42
    ),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "Support Vector Regressor": SVR(kernel='rbf', C=100, gamma=0.1),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    "MLP Regressor": MLPRegressor(
        hidden_layer_sizes=(64, 64),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42,
    ),
}

# Fit each model on the training split, score it on the test split, and record
# MAE / MSE / RMSE / R2 under the model's display name.
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
    results[name] = dict(MAE=mae, MSE=mse, RMSE=rmse, R2=r2)
    print(f"{name} - MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2 Score: {r2:.2f}")

# Deep Learning Model
# Seed TensorFlow's global random generator so the network is initialised the
# same way on every run, matching the random_state=42 used for the
# scikit-learn models. (Bit-for-bit determinism may still depend on the
# backend / hardware.)
tf.random.set_seed(42)

# Two hidden ReLU layers of 64 units and a single linear output unit.
# The input width is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which newer Keras releases warn about.
dnn_model = Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1)
])
dnn_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# NOTE: the test split is passed as validation_data for per-epoch reporting
# only; no callback here acts on the validation metrics.
dnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, verbose=1)

# predict() returns one column per output unit; flatten to a 1-D vector so it
# lines up with y_test for scoring.
dnn_pred = dnn_model.predict(X_test).flatten()
dnn_r2 = r2_score(y_test, dnn_pred)
print(f"Deep Neural Network R2 Score: {dnn_r2:.2f}")

# Stacking Model
# Base learners, configured with the same hyperparameters as the matching
# entries in `models`.
rf_base = RandomForestRegressor(n_estimators=100, random_state=42)
gb_base = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
svr_base = SVR(kernel='rbf', C=100, gamma=0.1)
estimators = [('rf', rf_base), ('gb', gb_base), ('svr', svr_base)]

# A plain linear regression is the final estimator on top of the base learners.
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
).fit(X_train, y_train)

# Score the stacked ensemble on the held-out split.
stk_pred = stacking_model.predict(X_test)
stk_r2 = r2_score(y_test, stk_pred)
print(f"Stacking Model R2 Score: {stk_r2:.2f}")

# Feature Importance for Random Forest
# Reuse the Random Forest stored in `models` and pair its importances with
# the original feature column names.
feature_names = X.columns
rf_model = models["Random Forest"]
feature_importances = rf_model.feature_importances_

# Horizontal bar chart: one bar per feature, bar length = importance.
plt.figure(figsize=(10, 6))
importance_ax = sns.barplot(x=feature_importances, y=feature_names, palette='viridis')
importance_ax.set_title("Feature Importance in Random Forest Model")
importance_ax.set_xlabel("Importance")
importance_ax.set_ylabel("Feature")
plt.show()

# SHAP Explainability for Multiple Models
# Try a TreeExplainer on every model in `models`; models it cannot handle are
# reported and skipped so the loop continues with the next one.
for name, model in models.items():
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_train)
        shap.summary_plot(shap_values, X_train, feature_names=feature_names)
    except Exception:
        # Catch ordinary errors only: a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make this loop impossible to
        # interrupt.
        print(f"SHAP not supported for {name}")

# Hyperparameter tuning for Random Forest
# Search space: 3 x 3 x 3 = 27 combinations, of which n_iter=10 are sampled.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# random_state pins WHICH 10 combinations get sampled, so the reported best
# parameters are repeatable across runs, consistent with the random_state=42
# used by the models elsewhere in this script.
random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_iter=10,
    random_state=42,
)
random_search.fit(X_train, y_train)
print("Best Parameters for Random Forest:", random_search.best_params_)

# Save best model
# Pick the entry of `results` with the highest R2 (the first one wins a tie)
# and persist the corresponding fitted estimator from `models`.
best_model_name, _ = max(results.items(), key=lambda entry: entry[1]['R2'])
best_model = models[best_model_name]
# Only the estimator is written to disk; the fitted scaler is not part of it.
joblib.dump(best_model, "best_model.pkl")
print(f"Best Model: {best_model_name} with R2 Score: {results[best_model_name]['R2']:.2f}")

# Export results to CSV
# `results` maps model name -> {metric: value}; transposing puts one model per
# row and one metric per column. The model names are written as the index.
metrics_by_column = pd.DataFrame(results)
results_df = metrics_by_column.transpose()
results_df.to_csv("model_performance.csv", index=True)

# Load and test saved model
# Round-trip check: reload the estimator that was just written to disk and
# score it on the same test split.
loaded_model = joblib.load("best_model.pkl")
y_pred_loaded = loaded_model.predict(X_test)
loaded_r2 = r2_score(y_true=y_test, y_pred=y_pred_loaded)
print(f"Loaded Model R2 Score: {loaded_r2:.2f}")