# Regression with C60.ai

This notebook demonstrates how to use C60.ai to solve a regression task on the California Housing dataset.

## 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from c60 import AutoML

# Set random seed for reproducibility
np.random.seed(42)

# Set plotting style
sns.set_style('whitegrid')
%matplotlib inline

## 2. Load and Explore the Data

In [None]:
# Fetch the California Housing dataset and pull out its pieces
data = fetch_california_housing()
feature_names = data.feature_names
X = data.data
y = data.target

# Combine features and target in one DataFrame for exploration/plotting
df = pd.DataFrame(X, columns=feature_names).assign(MedHouseVal=y)

# Summarise the dataset, then preview the first few rows
print("Dataset shape:", X.shape)
print("\nFeature names:", feature_names)
df.head()

## 3. Data Visualization

In [None]:
# Histogram (50 bins) with a KDE overlay for the target column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='MedHouseVal', kde=True, bins=50, ax=ax)
ax.set_title('Distribution of Median House Values')
ax.set_xlabel('Median House Value (in $100,000s)')
plt.show()

In [None]:
# Pairwise correlations between every column (features and target)
corr_matrix = df.corr()

# Annotated heatmap; the diverging palette is centred on zero correlation
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
plt.show()

## 4. Prepare the Data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Report the size of each partition
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

## 5. Initialize and Train the AutoML Model

In [None]:
# Settings for the regression AutoML search, gathered in one place
automl_settings = dict(
    task='regression',
    time_budget=120,                  # 2 minutes
    metric='neg_mean_squared_error',
    n_jobs=-1,                        # Use all available cores
    random_state=42,
)
automl = AutoML(**automl_settings)

# Fit on the training split only; the test split stays untouched for evaluation
print("Training model...")
automl.fit(X_train, y_train)
print("Training completed!")

## 6. Evaluate the Model

In [None]:
# Predict on the held-out test split
y_pred = automl.predict(X_test)

# Standard regression metrics; RMSE is derived from MSE so it is in the
# same units as the target
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# The header has no placeholders, so it is a plain string rather than an f-string
print("Model Evaluation:")
print(f"- MSE: {mse:.4f}")
print(f"- RMSE: {rmse:.4f}")
print(f"- MAE: {mae:.4f}")
print(f"- R²: {r2:.4f}")

## 7. Visualize the Results

In [None]:
# Scatter of predictions against ground truth on the test split
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed red y = x reference line spanning the observed target range;
# points on this line are perfect predictions
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs Predicted House Prices')
ax.grid(True)
plt.show()

In [None]:
# Plot residuals (actual minus predicted) against the predicted values
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
# NOTE: sns.residplot fits its own regression of y on x and plots the
# residuals of *that* fit. Feeding it pre-computed residuals would show
# residuals-of-residuals, hiding any linear trend. regplot scatters the
# model residuals as they are; lowess=True overlays a smoothed trend line
# (LOWESS requires statsmodels, as it did with residplot).
sns.regplot(
    x=y_pred,
    y=residuals,
    lowess=True,
    scatter_kws=dict(alpha=0.5),
    line_kws=dict(color='r', lw=1),
)
# Zero line: an unbiased model has residuals scattered evenly around it
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True)
plt.show()

## 8. Feature Importance

In [None]:
# Not every estimator exposes `feature_importances_`, so look it up
# defensively and say so explicitly instead of silently showing nothing.
importances = getattr(automl.best_estimator_, 'feature_importances_', None)

if importances is None:
    print("The best model does not expose feature importances; skipping this section.")
elif len(importances) != len(feature_names):
    # The importances cannot be matched one-to-one with the input columns,
    # so labelling them with the original feature names would be wrong
    # (and building the DataFrame would raise a ValueError).
    print(
        f"Got {len(importances)} importances for {len(feature_names)} input "
        "features; cannot map them back to the original feature names."
    )
else:
    # Pair each feature with its importance, most important first
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    # Display feature importances
    print("\nFeature Importances:")
    print(feature_importance)

    # Horizontal bar chart of the same ranking
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.show()

## 9. Save the Model

In [None]:
# Persist the fitted AutoML object to disk (path is relative to the working directory)
model_path = 'california_housing_regressor.joblib'
automl.save(model_path)
print("Model saved as 'california_housing_regressor.joblib'")

# Reloading the saved model in a later session:
# from c60 import AutoML
# automl_loaded = AutoML.load('california_housing_regressor.joblib')
# predictions = automl_loaded.predict(X_test)