In [None]:
%pip install pandas scikit-learn xgboost openpyxl torch matplotlib seaborn

In [None]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats

# Ensure openpyxl is installed
try:
    import openpyxl
except ImportError:
    raise ImportError("Missing optional dependency 'openpyxl'. Install it using: pip install openpyxl")

# Load dataset
# Plain relative file name: the previous Windows-only ".\" prefix is not a path
# separator on Linux/macOS, so the workbook could not be opened there. On Windows
# this resolves to the same file in the working directory.
file_path = "Nix.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Defining color and spectral feature columns
color_columns = ["L*", "a*", "b*", "c", "h", "X", "Z", "sRGB R", "sRGB G", "sRGB B", "C", "M", "Y", "K"]
# "R400 nm", "R410 nm", ..., "R700 nm" (31 columns, 10 nm steps)
spectral_columns = [f"R{wavelength} nm" for wavelength in range(400, 710, 10)]

# Selecting features
color_features = df[color_columns]  # Nix color data
spectral_features = df[spectral_columns]  # Nix spectral data
y = df["O.C (%)"]   # Target variable

# Splitting the dataset into 70% training and 30% validation.
# Both calls use the same random_state on frames with the same number of rows, so
# they select the same rows; that is why y_train / y_val from the first call are
# reused for the spectral features.
X_train_color, X_val_color, y_train, y_val = train_test_split(color_features, y, test_size=0.3, random_state=42)
X_train_spectral, X_val_spectral, _, _ = train_test_split(spectral_features, y, test_size=0.3, random_state=42)

# Standardizing the features (scalers are fitted on the training rows only)
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

scaler_spectral = StandardScaler()
X_train_spectral_scaled = scaler_spectral.fit_transform(X_train_spectral)
X_val_spectral_scaled = scaler_spectral.transform(X_val_spectral)

# Initialize models
best_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train and evaluate using Color Data
best_model.fit(X_train_color_scaled, y_train)
y_pred_color_val = best_model.predict(X_val_color_scaled)
r2_color_val = r2_score(y_val, y_pred_color_val)

# Train and evaluate using Spectral Data.
# The same estimator object is refitted, so from here on `best_model` holds the
# spectral fit; the color predictions above were captured before the refit.
best_model.fit(X_train_spectral_scaled, y_train)
y_pred_spectral_val = best_model.predict(X_val_spectral_scaled)
r2_spectral_val = r2_score(y_val, y_pred_spectral_val)

# Correlation Analysis: correlation of every feature column with the target
color_corr = color_features.corrwith(y)
spectral_corr = spectral_features.corrwith(y)

# Plot Color and Spectral Data Correlation (one horizontal bar chart each)
for corr, axis_label, plot_title, png_name in (
    (color_corr, "Color Features", "Correlation between Color Data and OC", "color_oc_correlation.png"),
    (spectral_corr, "Spectral Features", "Correlation between Spectral Data and OC", "spectral_oc_correlation.png"),
):
    plt.figure(figsize=(8, 5))
    # Bars are sorted by value; the colormap shades them from light to dark in that order
    corr.sort_values().plot(kind='barh', color=plt.cm.Blues(np.linspace(0.3, 1, len(corr))))
    plt.xlabel("Correlation Coefficient", fontweight='bold')
    plt.ylabel(axis_label, fontweight='bold')
    plt.title(plot_title, fontweight='bold')
    plt.savefig(png_name, dpi=300, bbox_inches='tight')
    plt.show()

# Plot OC Distribution with Smooth Curve
plt.figure(figsize=(8, 5))
counts, bin_edges, patches = plt.hist(y, bins=20, color='darkgreen', edgecolor='black', alpha=0.7)
# A density integrates to 1, so it must be multiplied by N * bin_width to sit on
# the same scale as the histogram counts. The bin width comes from the histogram
# itself; the axis limits include plot margins and would over-scale the curve.
bin_width = bin_edges[1] - bin_edges[0]
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.gaussian_kde(y)(x)
plt.plot(x, p * len(y) * bin_width, color='black', linewidth=2)
plt.xlabel("OC (%)", fontweight='bold')
plt.ylabel("Frequency", fontweight='bold')
plt.title("Distribution of OC", fontweight='bold')
plt.savefig("oc_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

# Measured-vs-predicted scatter plots, one panel per feature set
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Values shared by both panels
y_lo, y_hi = min(y_val), max(y_val)
measured_sorted = np.unique(y_val)

panel_specs = (
    (axes[0], y_pred_color_val, r2_color_val, "Color Data Prediction"),
    (axes[1], y_pred_spectral_val, r2_spectral_val, "Spectral Data Prediction"),
)

for panel, predicted, r2_value, panel_title in panel_specs:
    panel.scatter(y_val, predicted, color='red', marker='^', s=100, label='Validation Samples')
    # Dashed diagonal: where a perfect prediction would fall
    panel.plot([y_lo, y_hi], [y_lo, y_hi], linestyle='--', color='blue', label='1:1 Line')
    # Straight-line (degree-1) fit of predicted on measured values
    trend = np.poly1d(np.polyfit(y_val, predicted, 1))
    panel.plot(measured_sorted, trend(measured_sorted), color='black', label='Regression Line')
    panel.set_xlabel("Measured OC (%)", fontweight='bold')
    panel.set_ylabel("Predicted OC (%)", fontweight='bold')
    panel.set_title(panel_title, fontweight='bold')
    # R² annotation anchored at the top-left of the data
    panel.text(y_lo, max(predicted), f'R² = {r2_value:.2f}', fontsize=12, fontweight='bold')
    panel.legend()

plt.tight_layout()
plt.savefig("prediction_plots.png", dpi=300, bbox_inches='tight')
plt.show()
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats

# Ensure openpyxl is installed
try:
    import openpyxl
except ImportError:
    raise ImportError("Missing optional dependency 'openpyxl'. Install it using: pip install openpyxl")

# Load dataset
# Plain relative file name: the previous Windows-only ".\" prefix is not a path
# separator on Linux/macOS, so the workbook could not be opened there.
file_path = "Nix.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Defining color and spectral feature columns
color_columns = ["L*", "a*", "b*", "c", "h", "X", "Z", "sRGB R", "sRGB G", "sRGB B", "C", "M", "Y", "K"]
spectral_columns = [f"R{wavelength} nm" for wavelength in range(400, 710, 10)]

# Selecting features
color_features = df[color_columns]  # Nix color data
spectral_features = df[spectral_columns]  # Nix spectral data
y = df["O.C (%)"]   # Target variable

# Splitting the dataset into 70% training and 30% validation.
# Same random_state and row count in both calls, so both select the same rows.
X_train_color, X_val_color, y_train, y_val = train_test_split(color_features, y, test_size=0.3, random_state=42)
X_train_spectral, X_val_spectral, _, _ = train_test_split(spectral_features, y, test_size=0.3, random_state=42)

# Standardizing the features (scalers are fitted on the training rows only)
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

scaler_spectral = StandardScaler()
X_train_spectral_scaled = scaler_spectral.fit_transform(X_train_spectral)
X_val_spectral_scaled = scaler_spectral.transform(X_val_spectral)

# Initialize models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42),
    "Neural Network": MLPRegressor(hidden_layer_sizes=(100,50), max_iter=1000, random_state=42)
}


def _fit_and_score(model, X_tr, X_va, y_tr, y_va, prefix):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters:
        model: estimator exposing ``fit`` and ``predict``.
        X_tr, X_va: training and validation feature matrices.
        y_tr, y_va: matching targets.
        prefix: feature-set label used in the result keys ("Color" / "Spectral").

    Returns:
        dict with "<prefix> R² (<split>)", "<prefix> RMSE (<split>)" and
        "<prefix> Bias (<split>)" for split in ("Train", "Val"), in that order.
        Bias is the mean of (prediction - measurement).
    """
    model.fit(X_tr, y_tr)
    scores = {}
    for split_name, X_part, y_part in (("Train", X_tr, y_tr), ("Val", X_va, y_va)):
        y_hat = model.predict(X_part)
        scores[f"{prefix} R² ({split_name})"] = r2_score(y_part, y_hat)
        scores[f"{prefix} RMSE ({split_name})"] = np.sqrt(mean_squared_error(y_part, y_hat))
        scores[f"{prefix} Bias ({split_name})"] = np.mean(y_hat - y_part)
    return scores


# Store results
results = {}

for model_name, model in models.items():
    # The same estimator is fitted twice; the color scores are collected before
    # it is refitted on the spectral features.
    color_scores = _fit_and_score(model, X_train_color_scaled, X_val_color_scaled, y_train, y_val, "Color")
    spectral_scores = _fit_and_score(model, X_train_spectral_scaled, X_val_spectral_scaled, y_train, y_val, "Spectral")
    results[model_name] = {**color_scores, **spectral_scores}

# Convert results to DataFrame and display
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)

# Save results
results_df.to_csv("model_performance.csv", index=True)
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats

# Ensure openpyxl is installed
try:
    import openpyxl
except ImportError:
    raise ImportError("Missing optional dependency 'openpyxl'. Install it using: pip install openpyxl")

# Load dataset
# Plain relative file name: the previous Windows-only ".\" prefix is not a path
# separator on Linux/macOS, so the workbook could not be opened there.
file_path = "Nix.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Selecting features
# NOTE(review): this section selects features by position (6 columns for color,
# everything from column 9 onward for spectral) whereas the other sections select
# them by name. Confirm the sheet layout; the open-ended slice would also pick up
# any non-spectral trailing column.
color_features = df.iloc[:, 3:9]  # Nix color data
spectral_features = df.iloc[:, 9:]  # Nix spectral data
y = df["O.C (%)"]   # Target variable

# Splitting the dataset into 70% training and 30% validation.
# Same random_state and row count in both calls, so both select the same rows.
X_train_color, X_val_color, y_train, y_val = train_test_split(color_features, y, test_size=0.3, random_state=42)
X_train_spectral, X_val_spectral, _, _ = train_test_split(spectral_features, y, test_size=0.3, random_state=42)

# Standardizing the features (scalers are fitted on the training rows only)
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

scaler_spectral = StandardScaler()
X_train_spectral_scaled = scaler_spectral.fit_transform(X_train_spectral)
X_val_spectral_scaled = scaler_spectral.transform(X_val_spectral)

# Initialize models
best_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train and evaluate using Color Data
best_model.fit(X_train_color_scaled, y_train)
y_pred_color_train = best_model.predict(X_train_color_scaled)
y_pred_color_val = best_model.predict(X_val_color_scaled)

# Train and evaluate using Spectral Data (refits the same estimator)
best_model.fit(X_train_spectral_scaled, y_train)
y_pred_spectral_train = best_model.predict(X_train_spectral_scaled)
y_pred_spectral_val = best_model.predict(X_val_spectral_scaled)

# Save calibration and validation samples with predicted values.
# Explicit copies keep the added columns out of the X_train_color / X_val_color
# frames; only the color-model predictions are exported.
df_train = X_train_color.copy()
df_train["Measured O.C (%)"] = y_train
df_train["Predicted O.C (%)"] = y_pred_color_train
df_train["Type"] = "Calibration"

df_val = X_val_color.copy()
df_val["Measured O.C (%)"] = y_val
df_val["Predicted O.C (%)"] = y_pred_color_val
df_val["Type"] = "Validation"

df_combined = pd.concat([df_train, df_val])
df_combined.to_csv("calibration_validation_samples.csv", index=False)
print("Calibration and validation samples with predicted values saved to 'calibration_validation_samples.csv'.")
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats

# Ensure openpyxl is installed
try:
    import openpyxl
except ImportError:
    raise ImportError("Missing optional dependency 'openpyxl'. Install it using: pip install openpyxl")

# Load dataset
# Plain relative file name: the previous Windows-only ".\" prefix is not a path
# separator on Linux/macOS, so the workbook could not be opened there.
file_path = "Nix.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Defining color feature columns
color_columns = ["L*", "a*", "b*", "c", "h", "X", "Z", "sRGB R", "sRGB G", "sRGB B", "C", "M", "Y", "K"]

# Selecting features
color_features = df[color_columns]  # Nix color data
y = df["O.C (%)"]   # Target variable

# Splitting the dataset into 70% training and 30% validation
X_train_color, X_val_color, y_train, y_val = train_test_split(color_features, y, test_size=0.3, random_state=42)

# Standardizing the features (scaler is fitted on the training rows only)
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

# Convert scaled data back to DataFrame (keeps the original row index and column names)
X_train_color_scaled_df = pd.DataFrame(X_train_color_scaled, columns=color_columns, index=X_train_color.index)
X_val_color_scaled_df = pd.DataFrame(X_val_color_scaled, columns=color_columns, index=X_val_color.index)

# Initialize models
best_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train and evaluate using Color Data
best_model.fit(X_train_color_scaled, y_train)
y_pred_color_train = best_model.predict(X_train_color_scaled)
y_pred_color_val = best_model.predict(X_val_color_scaled)

# Save calibration and validation samples with predicted values.
# The exported feature values are the unscaled ones; only the model saw scaled data.
df_train = X_train_color.copy()
df_train["Measured O.C (%)"] = y_train
df_train["Predicted O.C (%)"] = y_pred_color_train
df_train["Type"] = "Calibration"

df_val = X_val_color.copy()
df_val["Measured O.C (%)"] = y_val
df_val["Predicted O.C (%)"] = y_pred_color_val
df_val["Type"] = "Validation"

df_combined = pd.concat([df_train, df_val])
import datetime
# Timestamped name so repeated runs do not overwrite earlier exports
output_filename = f"calibration_validation_samples_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df_combined.to_csv(output_filename, index=False)
# Report the file that was actually written (the message used to name a different file)
print(f"Calibration and validation samples with predicted values saved to '{output_filename}'.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
file_path = r"./Nix.xlsx"  # Adjust path if needed
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Define feature columns
color_columns = ["L*", "a*", "b*", "c", "h", "X", "Z", "sRGB R", "sRGB G", "sRGB B", "C", "M", "Y", "K"]
y = df["O.C (%)"]

# Split dataset
X_train_color, X_val_color, y_train, y_val = train_test_split(df[color_columns], y, test_size=0.3, random_state=42)

# Standardize training features
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)

# Store results
results_gmm = []
synthetic_sizes = [1000, 2000, 3000, 4000, 5000]
oc_thresholds = list(range(4, 15))

# Seeded generator so the synthetic OC values (and therefore the reported
# metrics) are the same on every run.
rng = np.random.default_rng(42)

# The mixture is fitted on the same data for every sample size, so fit it once.
gmm = GaussianMixture(n_components=5, random_state=42)
gmm.fit(X_train_color_scaled)

# Real training rows (features + target); identical for every configuration.
real_train = pd.concat([X_train_color, y_train], axis=1).reset_index(drop=True)

# Ensure validation features match the training column order
X_val_final = X_val_color[color_columns]

for num_samples in synthetic_sizes:
    # The GMM was fitted on standardized features, so its samples are in
    # standardized units. Map them back to the original units before they are
    # stacked with the raw training rows; otherwise the two sources live on
    # different scales.
    synthetic_features = scaler_color.inverse_transform(gmm.sample(num_samples)[0])
    # NOTE(review): synthetic OC is drawn independently of the synthetic features,
    # so these rows carry no feature-target relationship. Confirm this is intended.
    synthetic_oc = rng.uniform(y_train.min(), y_train.max(), num_samples)

    synthetic_all = pd.DataFrame(synthetic_features, columns=color_columns)
    synthetic_all["O.C (%)"] = synthetic_oc

    for upper_limit in oc_thresholds:
        # Keep synthetic rows whose OC lies in [3, upper_limit] (both ends inclusive)
        in_range = synthetic_all["O.C (%)"].between(3, upper_limit)
        synthetic_df = synthetic_all[in_range].reset_index(drop=True)

        # Train Model
        if synthetic_df.empty:
            continue

        # Merge original training data with synthetic data
        train_data = pd.concat([real_train, synthetic_df]).reset_index(drop=True)

        X_train_final = train_data[color_columns]
        y_train_final = train_data["O.C (%)"]

        # Standardize both train and validation sets with a scaler fitted on the
        # augmented training set
        scaler_final = StandardScaler()
        X_train_final_scaled = scaler_final.fit_transform(X_train_final)
        X_val_final_scaled = scaler_final.transform(X_val_final)

        # Train RandomForest model
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_final_scaled, y_train_final)
        y_pred = rf_model.predict(X_val_final_scaled)

        # Compute performance metrics on the untouched validation rows
        r2 = r2_score(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        results_gmm.append({
            "Sample Size": num_samples,
            "OC Range": f"3-{upper_limit}%",
            "R²": r2,
            "RMSE": rmse
        })

# Save results
df_gmm = pd.DataFrame(results_gmm)
output_path = "GMM_Results_ALL.xlsx"
df_gmm.to_excel(output_path, index=False)

print(f"✅ GMM results saved successfully: {output_path}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
file_path = "Nix.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Define feature columns
color_columns = ["L*", "a*", "b*", "c", "h", "X", "Z", "sRGB R", "sRGB G", "sRGB B", "C", "M", "Y", "K"]
y = df["O.C (%)"]

# Split dataset (70% training, 30% validation)
X_train_color, X_val_color, y_train, y_val = train_test_split(df[color_columns], y, test_size=0.3, random_state=42)

# Standardize features
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

# Initialize KNN model.
# Features are used as both input and target, so predict() returns, for each
# query row, the average feature vector of its 5 nearest training rows.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_color_scaled, X_train_color_scaled)

# Define synthetic dataset sizes and OC thresholds
synthetic_sizes = [1000, 2000, 3000, 4000, 5000]
oc_thresholds = list(range(4, 15))

# Store results
results_knn = []

# Seeded generator so the synthetic OC values are the same on every run.
rng = np.random.default_rng(42)

# Real training rows (features + target); identical for every configuration.
real_train = pd.concat([X_train_color, y_train], axis=1)

for num_samples in synthetic_sizes:
    # Generate synthetic features using KNN.
    # NOTE(review): the number of synthetic rows is capped at the training-set
    # size, so every requested size above that yields the same synthetic
    # features; the "Sample Size" column reports the request, not the actual count.
    n_query_rows = min(num_samples, len(X_train_color_scaled))
    synthetic_scaled = knn.predict(X_train_color_scaled[:n_query_rows])
    # The KNN works in standardized units; convert back to the original units
    # before stacking with the raw training rows so both share one scale.
    synthetic_features = scaler_color.inverse_transform(synthetic_scaled)
    # NOTE(review): synthetic OC is drawn independently of the synthetic features.
    synthetic_oc = rng.uniform(y_train.min(), y_train.max(), len(synthetic_features))

    synthetic_all = pd.DataFrame(synthetic_features, columns=color_columns)
    synthetic_all["O.C (%)"] = synthetic_oc

    for upper_limit in oc_thresholds:
        # Keep synthetic rows whose OC lies in [3, upper_limit] (both ends inclusive)
        synthetic_df = synthetic_all[synthetic_all["O.C (%)"].between(3, upper_limit)]

        # Train Model
        if synthetic_df.empty:
            continue

        # Combine training data with synthetic data
        train_data = pd.concat([real_train, synthetic_df])

        X_train_final = train_data[color_columns]
        y_train_final = train_data["O.C (%)"]

        # Standardize the combined dataset
        scaler_final = StandardScaler()
        X_train_final_scaled = scaler_final.fit_transform(X_train_final)
        X_val_final_scaled = scaler_final.transform(X_val_color)

        # Train Random Forest Model
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_final_scaled, y_train_final)
        y_pred = rf_model.predict(X_val_final_scaled)

        # Calculate performance metrics
        r2 = r2_score(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        # Store results
        results_knn.append({
            "Sample Size": num_samples,
            "OC Range": f"3-{upper_limit}%",
            "R²": r2,
            "RMSE": rmse
        })

# Save results
df_knn = pd.DataFrame(results_knn)
output_file = "KNN_Results_ALL.xlsx"
df_knn.to_excel(output_file, index=False)

print(f"✅ KNN results saved successfully: {output_file}")


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Read the measurements from the first worksheet of the workbook
file_path = "Nix.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Predictor columns and the target column
color_columns = [
    "L*", "a*", "b*", "c", "h", "X", "Z",
    "sRGB R", "sRGB G", "sRGB B",
    "C", "M", "Y", "K",
]
y = df["O.C (%)"]

# Hold out 30% of the rows for validation; the remaining 70% are used for training
X_train_color, X_val_color, y_train, y_val = train_test_split(
    df[color_columns],
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling from the training rows only, then apply it to both splits
scaler_color = StandardScaler().fit(X_train_color)
X_train_color_scaled = scaler_color.transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

# Generator network of the GAN
class Generator(nn.Module):
    """Fully connected network mapping a latent vector to a feature vector.

    Layout: Linear(input_dim, 128) -> ReLU -> Linear(128, 256) -> ReLU ->
    Linear(256, output_dim). The final layer has no activation.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        first_width, second_width = 128, 256
        # Layers are created in network order and wrapped in one Sequential
        # stored as `self.model`.
        stack = [
            nn.Linear(input_dim, first_width),
            nn.ReLU(),
            nn.Linear(first_width, second_width),
            nn.ReLU(),
            nn.Linear(second_width, output_dim),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, z):
        """Run the latent batch ``z`` through the layer stack and return the result."""
        generated = self.model(z)
        return generated

# Define function to generate synthetic GAN data
def generate_gan_data(generator, num_samples, input_dim):
    """Draw synthetic feature rows from ``generator`` plus random OC values.

    Parameters:
        generator: module that maps a (num_samples, input_dim) latent batch to
            feature vectors in the standardized space of ``scaler_color``.
        num_samples: number of synthetic rows to produce.
        input_dim: size of the latent vector fed to the generator.

    Returns:
        tuple ``(features, oc)``: ``features`` is the generator output mapped back
        to original units with the notebook-level ``scaler_color``; ``oc`` is an
        array of ``num_samples`` values drawn uniformly between the minimum and
        maximum of the notebook-level ``y_train``.

    Note:
        Relies on the globals ``scaler_color`` and ``y_train`` defined earlier in
        this cell. The OC values are drawn independently of the features.
    """
    generator.eval()  # Set to evaluation mode
    # Inference only: skip autograd bookkeeping instead of detaching afterwards
    with torch.no_grad():
        z = torch.randn(num_samples, input_dim)
        synthetic_features = generator(z).cpu().numpy()
    features_original_units = scaler_color.inverse_transform(synthetic_features)
    synthetic_oc = np.random.uniform(y_train.min(), y_train.max(), num_samples)
    return features_original_units, synthetic_oc

# Initialize GAN model
# Seed torch (generator weights and latent noise) and NumPy's global generator
# (synthetic OC values drawn inside generate_gan_data) so the results are
# reproducible from a fresh kernel.
torch.manual_seed(42)
np.random.seed(42)

z_dim = 10  # Latent space dimension
generator = Generator(z_dim, X_train_color_scaled.shape[1])
# NOTE(review): no discriminator or training loop appears between constructing
# the generator and sampling from it, so the synthetic features come from a
# network with its initial random weights. Train it (or load trained weights)
# before interpreting these results as GAN augmentation.

# Define synthetic dataset sizes and OC thresholds
synthetic_sizes = [1000, 2000, 3000, 4000, 5000]
oc_thresholds = list(range(4, 15))

# Store results
results_gan = []

# Real training rows (features + target); identical for every configuration.
real_train = pd.concat([X_train_color, y_train], axis=1)

for num_samples in synthetic_sizes:
    # Generate synthetic data using GAN (features already in original units)
    gan_features, gan_oc = generate_gan_data(generator, num_samples, z_dim)

    synthetic_all = pd.DataFrame(gan_features, columns=color_columns)
    synthetic_all["O.C (%)"] = gan_oc

    for upper_limit in oc_thresholds:
        # Keep synthetic rows whose OC lies in [3, upper_limit] (both ends inclusive)
        synthetic_df = synthetic_all[synthetic_all["O.C (%)"].between(3, upper_limit)]

        # Train Model
        if synthetic_df.empty:
            continue

        # Combine training data with synthetic data
        train_data = pd.concat([real_train, synthetic_df])

        X_train_final = train_data[color_columns]
        y_train_final = train_data["O.C (%)"]

        # Standardize the combined dataset
        scaler_final = StandardScaler()
        X_train_final_scaled = scaler_final.fit_transform(X_train_final)
        X_val_final_scaled = scaler_final.transform(X_val_color)

        # Train Random Forest Model
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_final_scaled, y_train_final)
        y_pred = rf_model.predict(X_val_final_scaled)

        # Calculate performance metrics
        r2 = r2_score(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        # Store results
        results_gan.append({
            "Sample Size": num_samples,
            "OC Range": f"3-{upper_limit}%",
            "R²": r2,
            "RMSE": rmse
        })

# Save results
df_gan = pd.DataFrame(results_gan)
output_file = "GAN_Results_ALL.xlsx"
df_gan.to_excel(output_file, index=False)

print(f"✅ GAN results saved successfully: {output_file}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Read the measurements from the first worksheet of the workbook
file_path = "Nix.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Predictor columns and the target column
color_columns = [
    "L*", "a*", "b*", "c", "h", "X", "Z",
    "sRGB R", "sRGB G", "sRGB B",
    "C", "M", "Y", "K",
]
y = df["O.C (%)"]

# 70% of the rows for training, 30% held out for validation
X_train_color, X_val_color, y_train, y_val = train_test_split(
    df[color_columns], y, test_size=0.3, random_state=42
)

# Scaling learned from the training rows only, applied to both splits
scaler_color = StandardScaler().fit(X_train_color)
X_train_color_scaled = scaler_color.transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

# Experiment grid: number of resampled rows x upper OC limit
synthetic_sizes = [1000 * k for k in range(1, 6)]
oc_thresholds = list(range(4, 15))

# One record per evaluated configuration
results_bootstrap = []

for num_samples in synthetic_sizes:
    # Draw feature/target pairs from the scaled training set with a fixed seed
    bootstrap_features, bootstrap_oc = resample(
        X_train_color_scaled,
        y_train.values,
        n_samples=num_samples,
        random_state=42,
    )

    # Undo the scaling so the resampled rows are in the same units as X_train_color
    bootstrap_features = scaler_color.inverse_transform(bootstrap_features)

    for upper_limit in oc_thresholds:
        # Assemble the resampled rows, then keep those with OC in [3, upper_limit]
        synthetic_df = pd.DataFrame(bootstrap_features, columns=color_columns)
        synthetic_df["O.C (%)"] = bootstrap_oc
        oc_in_range = synthetic_df["O.C (%)"].between(3, upper_limit)
        synthetic_df = synthetic_df[oc_in_range]

        # Nothing to add for this configuration
        if synthetic_df.empty:
            continue

        # Real training rows followed by the selected resampled rows
        train_data = pd.concat([X_train_color, y_train], axis=1)
        train_data = pd.concat([train_data, synthetic_df])

        X_train_final = train_data[color_columns]
        y_train_final = train_data["O.C (%)"]

        # Fresh scaler fitted on the augmented training set
        scaler_final = StandardScaler()
        X_train_final_scaled = scaler_final.fit_transform(X_train_final)
        X_val_final_scaled = scaler_final.transform(X_val_color)

        # Fit the forest and predict the held-out rows
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_final_scaled, y_train_final)
        y_pred = rf_model.predict(X_val_final_scaled)

        # Validation metrics
        r2 = r2_score(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        record = {
            "Sample Size": num_samples,
            "OC Range": f"3-{upper_limit}%",
            "R²": r2,
            "RMSE": rmse,
        }
        results_bootstrap.append(record)

# Write all records to a workbook
df_bootstrap = pd.DataFrame(results_bootstrap)
output_file = "Bootstrap_Results_ALL.xlsx"
df_bootstrap.to_excel(output_file, index=False)

print(f"✅ Bootstrap results saved successfully: {output_file}")


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
from sklearn.neural_network import MLPRegressor

# Load dataset
file_path = r"Nix.xlsx"  # Update this path if needed
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Defining color and spectral feature columns
color_columns = ["L*", "a*", "b*", "c", "h", "X", "Z", "sRGB R", "sRGB G", "sRGB B", "C", "M", "Y", "K"]
spectral_columns = [f"R{wavelength} nm" for wavelength in range(400, 710, 10)]

# Selecting features
color_features = df[color_columns]  # Nix color data
spectral_features = df[spectral_columns]  # Nix spectral data
y = df["O.C (%)"]   # Target variable

# Splitting the dataset into 70% training and 30% validation.
# Same random_state and row count in both calls, so both select the same rows.
X_train_color, X_val_color, y_train, y_val = train_test_split(color_features, y, test_size=0.3, random_state=42)
X_train_spectral, X_val_spectral, _, _ = train_test_split(spectral_features, y, test_size=0.3, random_state=42)

# Standardizing the features (scalers are fitted on the training rows only)
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)
X_val_color_scaled = scaler_color.transform(X_val_color)

scaler_spectral = StandardScaler()
X_train_spectral_scaled = scaler_spectral.fit_transform(X_train_spectral)
X_val_spectral_scaled = scaler_spectral.transform(X_val_spectral)

# Initialize models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42),
    "Neural Network": MLPRegressor(hidden_layer_sizes=(100,50), max_iter=1000, random_state=42)
}

# Feature sets to evaluate, in the order their metrics appear in the results table
feature_sets = {
    "Color": (X_train_color_scaled, X_val_color_scaled),
    "Spectral": (X_train_spectral_scaled, X_val_spectral_scaled),
}

# Interquartile range of the measured validation values. It is the numerator of
# RPIQ (IQR / RMSE) and does not depend on the model, so compute it once.
iqr_val = np.percentile(y_val, 75) - np.percentile(y_val, 25)

# Store results
results = {}

for model_name, model in models.items():
    model_scores = {}
    for set_name, (X_tr, X_va) in feature_sets.items():
        # The same estimator object is refitted for each feature set
        model.fit(X_tr, y_train)
        y_hat = model.predict(X_va)

        rmse_val = np.sqrt(mean_squared_error(y_val, y_hat))
        model_scores[f"{set_name} R² (Val)"] = r2_score(y_val, y_hat)
        model_scores[f"{set_name} RMSE (Val)"] = rmse_val
        # Bias: mean of (prediction - measurement)
        model_scores[f"{set_name} Bias (Val)"] = np.mean(y_hat - y_val)
        model_scores[f"{set_name} RPIQ (Val)"] = iqr_val / rmse_val

    # Store results
    results[model_name] = model_scores

# Convert results to DataFrame and display
results_df = pd.DataFrame.from_dict(results, orient='index')

# Display results in the console
print(results_df)

# Save results as CSV.
# Plain relative name: with the previous ".\" prefix, non-Windows systems created
# a file literally named ".\validation_performance_metrics.csv", which did not
# match the name printed below.
results_df.to_csv("validation_performance_metrics.csv", index=True)
print("Validation performance metrics saved as 'validation_performance_metrics.csv'.")


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.mixture import GaussianMixture

# Load dataset
file_path = "Nix.xlsx"  # Update the path if needed
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')

# Define feature columns
color_columns = ["L*", "a*", "b*", "c", "h", "X", "Z", "sRGB R", "sRGB G", "sRGB B", "C", "M", "Y", "K"]
y = df["O.C (%)"]

# Split dataset into training and validation sets
X_train_color, X_val_color, y_train, y_val = train_test_split(df[color_columns], y, test_size=0.3, random_state=42)

# Standardize training features
scaler_color = StandardScaler()
X_train_color_scaled = scaler_color.fit_transform(X_train_color)

# Generate synthetic data using GMM with 5000 samples and OC range 3-7%
num_samples = 5000
oc_range = (3, 7)

# Seeded generator so the synthetic OC values, and the figure, are reproducible.
rng = np.random.default_rng(42)

gmm = GaussianMixture(n_components=5, random_state=42)
gmm.fit(X_train_color_scaled)
# The GMM was fitted on standardized features, so its samples are in standardized
# units. Convert them back to the original units before stacking them with the
# raw training rows; otherwise the two sources live on different scales.
synthetic_features = scaler_color.inverse_transform(gmm.sample(num_samples)[0])
# NOTE(review): synthetic OC is drawn independently of the synthetic features.
synthetic_oc = rng.uniform(oc_range[0], oc_range[1], num_samples)

# Create synthetic DataFrame and filter based on OC range
# (the values are already drawn inside the range, so the filter is a safeguard)
synthetic_df = pd.DataFrame(synthetic_features, columns=color_columns)
synthetic_df["O.C (%)"] = synthetic_oc
synthetic_df = synthetic_df[(synthetic_df["O.C (%)"] >= oc_range[0]) & (synthetic_df["O.C (%)"] <= oc_range[1])]

# Merge original training data with synthetic data
train_data = pd.concat([X_train_color, y_train], axis=1).reset_index(drop=True)
synthetic_df = synthetic_df.reset_index(drop=True)
train_data = pd.concat([train_data, synthetic_df]).reset_index(drop=True)

X_train_final = train_data[color_columns]
y_train_final = train_data["O.C (%)"]

# Ensure validation feature columns match training columns
X_val_final = X_val_color[color_columns]

# Standardize train and validation sets with a scaler fitted on the augmented training set
scaler_final = StandardScaler()
X_train_final_scaled = scaler_final.fit_transform(X_train_final)
X_val_final_scaled = scaler_final.transform(X_val_final)

# Train RandomForest model using best GMM model (5000 samples, OC range 3-7%)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_final_scaled, y_train_final)
y_pred = rf_model.predict(X_val_final_scaled)

# Compute performance metrics on the untouched validation rows
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)


# Create prediction plot (without gridlines)
plt.figure(figsize=(8, 6))
plt.scatter(y_val, y_pred, color='red', marker='^', s=100, label='Validation Samples')
# Dashed diagonal: where a perfect prediction would fall
plt.plot([min(y_val), max(y_val)], [min(y_val), max(y_val)], linestyle='--', color='blue', label='1:1 Line')
# Straight-line (degree-1) fit of predicted on measured values
plt.plot(np.unique(y_val), np.poly1d(np.polyfit(y_val, y_pred, 1))(np.unique(y_val)), color='black', label='Regression Line')

plt.xlabel("Measured SOC (%)", fontweight='bold')
plt.ylabel("Predicted SOC (%)", fontweight='bold')
plt.text(min(y_val), max(y_pred), f'R² = {r2:.2f}', fontsize=12, fontweight='bold')
plt.legend()

# Save the plot
plt.savefig("GMM_prediction_plot_updated_no_grid.png", dpi=300, bbox_inches='tight')
plt.show()
