In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Load the raw table and discard every row whose 'F' column equals 3.
df1 = pd.read_csv('dataset.csv')
mask = df1['F'].eq(3)
df2 = df1.loc[~mask].copy()

# Columns removed before modelling; whatever remains is the candidate
# feature set plus the bandgap target column.
dropped_columns = [
    'HOIP entry ID', 'Label',
    'Dielectric constant, total', 'Dielectric constant, electronic',
    'Dielectric constant, ionic', 'Volume of the unit cell (A^3)',
    'Refractive index', 'A SITE DFE', 'B SITE DFE', 'X SITE DFE', 'F',
]
ndf = df2.drop(columns=dropped_columns)

# Shorter, mathtext-formatted column labels (these end up on plot axes).
display_names = {
    'Atomization energy (eV/atom)': 'E$_{atomization}$',
    'Relative energy1 (eV/atom)': 'E$_{relative1}$',
    'Relative energy2 (eV/atom)': 'E$_{relative2}$',
    'Density (g/cm^3)': 'Density',
    'rA(Ang)': 'r$_A$',
    'rB(Ang)': 'r$_B$',
    'rX(Ang)': 'r$_X$',
}

# Features: everything except the target, with the display labels applied.
X = ndf.drop(columns=['Bandgap, GGA (eV)']).rename(columns=display_names)
# Target: the GGA bandgap column.
y = ndf['Bandgap, GGA (eV)']

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible. `X` is the feature matrix built in the data-preparation cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Discretized and Dynamic Hyperparameter Search
def objective(trial):
    """Optuna objective: mean 10-fold cross-validated MSE of a CatBoost regressor.

    Hyperparameters are drawn from `trial`, a CatBoostRegressor is built from
    them, and the model is scored with shuffled 10-fold cross-validation on the
    module-level `X_train` / `y_train`.

    Parameters
    ----------
    trial : optuna trial object used to request hyperparameter suggestions.

    Returns
    -------
    The mean MSE across folds (the sign of sklearn's 'neg_mean_squared_error'
    is flipped), so smaller is better.
    """
    # Suggestions are requested in a fixed order; each dict key is the
    # CatBoost keyword, each first argument is the name recorded by Optuna.
    search_point = {
        'learning_rate': trial.suggest_categorical(
            'learning_rate', [1e-5, 1e-4, 1e-3, 1e-2, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]),
        'depth': trial.suggest_int('depth', 4, 10, step=1),
        'iterations': trial.suggest_int('iterations', 500, 2000, step=100),
        'l2_leaf_reg': trial.suggest_categorical('l2_leaf_reg', [1e-3, 0.01, 0.1, 1, 5, 10]),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.7, 1.0, step=0.1),
        # Tuned under the name 'min_child_samples', passed to CatBoost as min_data_in_leaf.
        'min_data_in_leaf': trial.suggest_int('min_child_samples', 5, 50),
        'grow_policy': trial.suggest_categorical('grow_policy', ['Depthwise', 'Lossguide']),
    }

    # NOTE(review): thread_count=36 per model combined with n_jobs=18 parallel
    # folds below may oversubscribe the CPU - confirm against the target machine.
    regressor = CatBoostRegressor(random_state=42, silent=True, thread_count=36, **search_point)

    # Shuffled, seeded folds so every trial is scored on the same partitions.
    folds = KFold(n_splits=10, shuffle=True, random_state=42)
    fold_neg_mse = cross_val_score(
        regressor, X_train, y_train,
        cv=folds, scoring='neg_mean_squared_error', n_jobs=18,
    )

    return -fold_neg_mse.mean()

# Create an Optuna study with smarter sampling and pruning
def optimize_model():
    """Run the Optuna search over `objective` and return the finished study.

    A minimising study is created with a seeded TPE sampler and a Hyperband
    pruner, optimised with a cap of 500 trials and a 12000 timeout, and the
    best value / parameters are printed. Two Optuna diagnostic figures
    (optimisation history and parameter importances) are shown afterwards.

    Returns
    -------
    The optuna study object after optimisation.
    """
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    # NOTE(review): `objective` as written in this notebook never calls
    # trial.report()/trial.should_prune(), so this pruner presumably never
    # stops a trial early - confirm before relying on pruning.
    hyperband = optuna.pruners.HyperbandPruner(min_resource=100, max_resource=1000)
    study = optuna.create_study(direction='minimize', sampler=tpe_sampler, pruner=hyperband)

    # The search is bounded both by a trial count and by a timeout.
    study.optimize(objective, n_trials=500, timeout=12000)

    print(f'Best MSE: {study.best_value}')
    print(f'Best hyperparameters: {study.best_params}')

    # Diagnostic figures, shown in the same order as listed.
    figure_builders = (
        optuna.visualization.plot_optimization_history,
        optuna.visualization.plot_param_importances,
    )
    for build_figure in figure_builders:
        build_figure(study).show()

    return study

# Run the hyperparameter search (bounded by the trial count / timeout set in optimize_model)
best_study = optimize_model()

# Keep the best hyperparameter set found by the search.
# NOTE(review): nothing in this cell trains a model with `best_params`; the
# CatBoost training cell further down uses hard-coded values - confirm intent.
best_params = best_study.best_params

In [None]:
# Hold out 20% of the samples for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# CatBoost regressor with fixed hyperparameters, fitted on the training split.
catboost_model = CatBoostRegressor(
    learning_rate=0.1,
    depth=6,
    iterations=1000,
    random_state=42,
    silent=True,
    thread_count=36,
)
catboost_model.fit(X_train, y_train)

# Hold-out metrics: MSE and R^2 of the test-set predictions.
y_pred_catboost = catboost_model.predict(X_test)
mse_catboost = mean_squared_error(y_test, y_pred_catboost)
r2_catboost = r2_score(y_test, y_pred_catboost)

print("CatBoost Mean Squared Error:", mse_catboost)
print("CatBoost R-squared Score:", r2_catboost)

# Shuffled 10-fold cross-validation on the training split. The scorer returns
# negated MSE, so the per-fold values printed below are negative and only the
# average is sign-flipped before printing.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores_catboost = cross_val_score(
    catboost_model, X_train, y_train,
    cv=kf, scoring='neg_mean_squared_error', n_jobs=18,
)
average_cv_score_catboost = cv_scores_catboost.mean()

print("CatBoost Cross-Validation Scores:", cv_scores_catboost)
print("CatBoost Average Cross-Validation Score:", -average_cv_score_catboost)

# Feature importances of the model fitted above (used by the bar-chart cell).
feature_importances_cb = catboost_model.feature_importances_

In [None]:
# `shap` is not imported by the notebook's import cell, so bring it into scope
# here; without this the cell raises NameError on a fresh kernel.
import shap

# SHAP values of the fitted CatBoost model on the held-out test features,
# computed with SHAP's additivity check disabled.
explainer = shap.TreeExplainer(catboost_model)
shap_values1 = explainer.shap_values(X_test, check_additivity=False)

# The same values wrapped in an Explanation object, with one copy of the
# explainer's expected value per test row as the base value.
shap_values_exp = shap.Explanation(
    values=shap_values1,
    base_values=np.repeat(explainer.expected_value, len(X_test)),
    data=X_test.values,
    feature_names=X_test.columns
)

# Default SHAP summary plot, drawn without calling show().
# NOTE(review): this figure is left open while the styled plot below opens a
# new one, so it may appear as an extra unstyled plot - confirm it is wanted.
shap.summary_plot(shap_values1, X_test, show=False)
# Create the SHAP summary plot with enhancements
def enhanced_shap_summary_plot(shap_values, features, feature_names=None, plot_type="dot", title=None,
                               figsize=(12, 8), save_path='Bg_shap_summary_plot.pdf'):
    """Draw a SHAP summary plot with bold, enlarged labels, save it and show it.

    Parameters
    ----------
    shap_values : SHAP values forwarded to ``shap.summary_plot``.
    features : feature matrix forwarded to ``shap.summary_plot``.
    feature_names : optional list of feature labels forwarded to ``shap.summary_plot``.
    plot_type : summary plot type forwarded to ``shap.summary_plot`` (default "dot").
    title : optional figure title; no title is drawn when falsy.
    figsize : (width, height) of the figure in inches. It is passed both to
        ``plt.figure`` and to ``shap.summary_plot(plot_size=...)``.
    save_path : path of the PDF written with ``plt.savefig``; pass a falsy
        value to skip saving.

    Returns
    -------
    None. The figure is saved (optionally) and displayed via ``plt.show()``.
    """
    plt.figure(figsize=figsize)

    # shap.summary_plot resizes the current figure when plot_size is left at
    # its default, which would discard the size requested above - so the
    # requested size is handed over explicitly.
    shap.summary_plot(shap_values, features, feature_names=feature_names, plot_type=plot_type,
                      plot_size=figsize, show=False)

    # Title and x-axis label
    if title:
        plt.title(title, fontsize=18, fontweight='bold', pad=20)
    plt.xlabel('SHAP Value (Impact on Model Output)', fontsize=16, fontweight='bold')

    # Thin black reference line at SHAP value 0
    plt.axvline(x=0, color='black', linewidth=1)

    # Larger, bold tick labels on both axes
    plt.xticks(fontsize=16, fontweight='bold')
    plt.yticks(fontsize=16, fontweight='bold')

    # A colour bar, when present, is an extra axes appended after the main
    # one. With a single axes (e.g. plot_type="bar") there is nothing to
    # style, and touching axes[-1] would overwrite the main plot's y-label.
    figure_axes = plt.gcf().axes
    if len(figure_axes) > 1:
        cbar = figure_axes[-1]
        cbar.tick_params(labelsize=16)
        cbar.set_ylabel('Feature Value', fontsize=16, fontweight='bold')

        # Make the colour bar tick labels (e.g. "Low" / "High") bold
        for text in cbar.get_yticklabels():
            text.set_fontweight('bold')

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, format='pdf', dpi=2000, bbox_inches='tight', transparent=True)

    plt.show()

# Draw, save and show the styled SHAP summary for the held-out test features.
# Uses `shap_values1` and `X_test` computed earlier in the notebook.
enhanced_shap_summary_plot(shap_values1, X_test, feature_names=X_test.columns.tolist(), title='Bandgap')

In [None]:
# Horizontal bar chart of the CatBoost feature importances.
# Requires `feature_importances_cb` (from the training cell) and `X`.
feature_names = X.columns

# Rank features from least to most important.
sorted_indices = np.argsort(feature_importances_cb)
ranked = [(feature_names[i], feature_importances_cb[i]) for i in sorted_indices]
sorted_feature_names = [name for name, _ in ranked]
sorted_importances = [score for _, score in ranked]

# Keep the 20 highest-ranked features.
# NOTE(review): the variable names say "top_10" but the slice keeps 20 -
# confirm which one is intended.
top_10_feature_names = sorted_feature_names[-20:]
top_10_importances = sorted_importances[-20:]

# New figure; the manual margins leave room on the left for the labels.
plt.figure(figsize=(10, 8))
plt.subplots_adjust(left=0.3, right=0.9)

# One viridis colour per bar, spread evenly across the colour map.
colors = plt.cm.viridis(np.linspace(0, 1, len(top_10_feature_names)))
plt.barh(top_10_feature_names, top_10_importances, color=colors)

# Write each importance value next to the end of its bar.
value_label_style = dict(va='center', ha='left', color='black', fontsize=10, fontweight='bold')
for bar_position, importance in enumerate(top_10_importances):
    plt.text(importance * 0.96, bar_position, f"{importance:.4f}", **value_label_style)

# Title and tick styling (axis labels are intentionally left off).
plt.title("Bandgap", fontsize=18, fontweight='bold', pad=15)
plt.xticks(fontsize=12, fontweight='bold')
plt.yticks(rotation=0, fontsize=12, fontweight='bold')

plt.tight_layout()

# Save first, then display.
plt.savefig('Bandgap_feature_importances.pdf', format='pdf', dpi=2000, bbox_inches='tight')
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, cv, train_sizes=np.linspace(.1, 1.0, 10),
                        title="Bandgap", save_path='Bandgap_learning_curve.pdf'):
    """Plot mean training and cross-validation R² against training-set size.

    Scores come from ``sklearn.model_selection.learning_curve`` with
    ``scoring='r2'`` and ``n_jobs=-1``; the per-size scores are averaged over
    the CV folds before plotting.

    Parameters
    ----------
    estimator : model passed to ``learning_curve``.
    X, y : training features and targets passed to ``learning_curve``.
    cv : cross-validation specification passed to ``learning_curve``.
    train_sizes : training-set sizes passed to ``learning_curve``
        (default: ten values from 0.1 to 1.0).
    title : figure title (default "Bandgap").
    save_path : path of the PDF written with ``plt.savefig``; pass a falsy
        value to skip saving.

    Returns
    -------
    None. The figure is saved (optionally) and displayed via ``plt.show()``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='r2', n_jobs=-1)

    # Average over the fold axis: one value per training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.figure(figsize=(10, 7))
    plt.title(title, fontsize=20, fontweight='bold')
    plt.xlabel("Training Examples", fontsize=16, fontweight='bold')
    plt.ylabel("R² Score", fontsize=16, fontweight='bold')

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score", linewidth=2, markersize=8)
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Test score", linewidth=2, markersize=8)

    plt.xticks(fontsize=12, fontweight='bold')
    plt.yticks(fontsize=12, fontweight='bold')

    # Vertical marker at the smallest training-set size.
    plt.axvline(train_sizes[0], color='b', linestyle='--', linewidth=2, label="Initial Training Size")

    plt.legend(loc="best", fontsize=14, frameon=True, fancybox=True)

    if save_path:
        plt.savefig(save_path, format='pdf', dpi=2000, bbox_inches='tight')

    plt.show()

# Plot the learning curve of the CatBoost regressor on the training split (cv=10)
plot_learning_curve(catboost_model, X_train, y_train, cv=10)

In [None]:
def plot_parity(y_true, y_pred, title='Parity Plot', save_path='Bandgap_parity_plot.pdf'):
    """Scatter predicted against actual values, with the actual values as reference.

    Two point sets share the x-axis ``y_true``: the actual values plotted
    against themselves (blue circles, lying on the y = x diagonal) and the
    predictions (red crosses).

    Parameters
    ----------
    y_true : actual target values.
    y_pred : predicted values, aligned element-wise with ``y_true``.
    title : figure title.
    save_path : path of the PDF written with ``plt.savefig``; pass a falsy
        value to skip saving.

    Returns
    -------
    None. The figure is saved (optionally) and displayed via ``plt.show()``.
    """
    plt.figure(figsize=(10, 7))

    # Reference points: actual values against themselves.
    plt.scatter(y_true, y_true, color='blue', marker='o', label='Actual Values')

    # Predictions for the same samples.
    plt.scatter(y_true, y_pred, color='red', marker='x', label='Predicted Values')

    plt.title(title, fontsize=20, fontweight='bold')
    plt.xlabel('Actual Value', fontsize=16, fontweight='bold')
    plt.ylabel('Predicted Value', fontsize=16, fontweight='bold')

    plt.xticks(fontsize=12, fontweight='bold')
    plt.yticks(fontsize=12, fontweight='bold')
    plt.legend(loc="best", fontsize=14)

    if save_path:
        plt.savefig(save_path, format='pdf', dpi=2000, bbox_inches='tight')
    plt.show()

# Parity plot of the CatBoost predictions against the held-out test targets
plot_parity(y_test, y_pred_catboost, title='Bandgap')