In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

Real


In [19]:


# Load data
file_paths = {
    "Magnesium": "Spectro_Magnesium.xlsx",
    "Nitrogen": "Spectro_Nitrogen.xlsx",
    "Phosphorus": "Spectro_Phosphorus.xlsx",
    "Potassium": "Spectro_Potassium.xlsx"
}
dataframes = {key: pd.read_excel(path) for key, path in file_paths.items()}



In [None]:
# Iterate through nutrients
for nutrient, df in dataframes.items():
    print(f"\nProcessing nutrient: {nutrient}")

    # Prepare data
    target = "Rule"
    features = [col for col in df.columns if col not in ["SAMPLE_CODE", target]]

    X = df[features]
    y = df[target]

    # Convert column names
    X.columns = X.columns.astype(str)

    # Encode target
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Handle imbalance
    smote = SMOTE(random_state=42)
    X_smote, y_smote = smote.fit_resample(X, y_encoded)

    # Normalize features
    scaler = StandardScaler()
    X_smote = scaler.fit_transform(X_smote)

    # Generate indices for alignment
    smote_indices = pd.RangeIndex(len(X_smote))

    # Split data
    X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
        X_smote, y_smote, smote_indices, test_size=0.2, random_state=42, stratify=y_smote
    )

    # Align with original dataset
    test_sample_ids = df["SAMPLE_CODE"].iloc[test_indices % len(df)].values
    ground_truth = df[target].iloc[test_indices % len(df)].values

    # Hyperparameter tuning
    param_dist = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.03, 0.05],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }

    model = XGBClassifier(random_state=42, scale_pos_weight=1)
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=30,
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=5),
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Train and predict
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # Decode predictions
    y_test_decoded = label_encoder.inverse_transform(y_test)
    y_pred_decoded = label_encoder.inverse_transform(y_pred)

    # Save results
    comparison_df = pd.DataFrame({
        "Sample_ID": test_sample_ids,
        f"True_{nutrient}": ground_truth,
        f"Predicted_{nutrient}": y_pred_decoded
    })
    output_file_name = f"comparison_{nutrient.lower()}.csv"
    comparison_df.to_csv(output_file_name, index=False)
    print(f"Comparison file saved for {nutrient}: '{output_file_name}'")

    # Evaluate
    print("\nClassification Report:")
    print(classification_report(y_test_decoded, y_pred_decoded))

    print("\nConfusion Matrix:")
    conf_matrix = confusion_matrix(y_test_decoded, y_pred_decoded)
    print(conf_matrix)

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues", 
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.title(f"Confusion Matrix for {nutrient}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

print("\nProcessing completed for all nutrients. CSV files saved.")


Magnesium 75%


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
file_path = "Spectro_Magnesium.xlsx"
df = pd.read_excel(file_path)

# Define target and features
target = "Rule"
features = [col for col in df.columns if col not in ["SAMPLE_CODE", target, "MAGNESIUM"]]

X = df[features]
y = df[target]

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Robust scaling for better handling of small datasets and outliers
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Feature selection using ANOVA F-test
k_best_features = 30  # Choose top 30 features
selector = SelectKBest(score_func=f_classif, k=k_best_features)
X_selected = selector.fit_transform(X_scaled, y_encoded)

# Dimensionality reduction using PCA
pca = PCA(n_components=10)  # Reduce to 10 components for simplicity
X_pca = pca.fit_transform(X_selected)

# Address class imbalance with ADASYN
adasyn = ADASYN(random_state=42)
X_balanced, y_balanced = adasyn.fit_resample(X_pca, y_encoded)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

# Define classifiers
xgb_model = XGBClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gbc_model = GradientBoostingClassifier(random_state=42)

# Hyperparameter optimization for XGBoost using GridSearchCV
param_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0]
}
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
xgb_best_model = grid_search.best_estimator_

# Ensemble model with stacking
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_best_model),
        ('rf', rf_model),
        ('gbc', gbc_model)
    ],
    voting='soft'
)
ensemble_model.fit(X_train, y_train)

# Predictions
y_pred = ensemble_model.predict(X_test)

# Decode predictions
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Evaluation
print("\nClassification Report:")
print(classification_report(y_test_decoded, y_pred_decoded))

# Confusion matrix
conf_matrix = confusion_matrix(y_test_decoded, y_pred_decoded)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues",
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Save results
results_df = pd.DataFrame({
    "True_Label": y_test_decoded,
    "Predicted_Label": y_pred_decoded
})
results_df.to_csv("magnesium_predictions_optimized.csv", index=False)
print("Results saved to 'magnesium_predictions_optimized.csv'")


Magnesium 88%

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load dataset
file_path = "Spectro_Magnesium.xlsx"
df = pd.read_excel(file_path)

# Define target and features
target = "Rule"
features = [col for col in df.columns if col not in ["SAMPLE_CODE", target, "MAGNESIUM"]]

X = df[features]
y = df[target]

# Handle missing values
X.fillna(X.median(), inplace=True)  # Impute numeric missing values with median
y.fillna(y.mode()[0], inplace=True)  # Impute target missing values with mode

# Outlier treatment
z_scores = np.abs(stats.zscore(X))
X = X[(z_scores < 3).all(axis=1)]  # Remove rows where z-score > 3 for any feature
y = y.loc[X.index]  # Keep target consistent with features

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Robust scaling for better handling of small datasets and outliers
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Generate polynomial features to capture interactions
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X_scaled)
print("Shape after polynomial features:", X_poly.shape)

# Feature selection using SelectKBest and Recursive Feature Elimination
k_best_features = 50  # Choose top 50 features
selector = SelectKBest(score_func=f_classif, k=k_best_features)
X_selected = selector.fit_transform(X_poly, y_encoded)
print("Shape after SelectKBest:", X_selected.shape)

# Dimensionality reduction using PCA to retain meaningful components
pca = PCA(n_components=15)  # Retain more components to capture more variance
X_pca = pca.fit_transform(X_selected)
print("Shape after PCA:", X_pca.shape)

# Address class imbalance with SMOTE followed by ADASYN
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_pca, y_encoded)

adasyn = ADASYN(random_state=42)
X_balanced, y_balanced = adasyn.fit_resample(X_smote, y_smote)
print("Shape after hybrid oversampling:", X_balanced.shape)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

# Define classifiers
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gbc_model = GradientBoostingClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)
catboost_model = CatBoostClassifier(verbose=0, random_state=42)
svc_model = SVC(probability=True, random_state=42)

# Hyperparameter optimization using RandomizedSearchCV for multiple models
param_grid_xgb = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=30,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=5),
    random_state=42,
    n_jobs=-1
)
xgb_search.fit(X_train, y_train)
xgb_best_model = xgb_search.best_estimator_

# Ensemble model with stacking and weighted voting
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_best_model),
        ('rf', rf_model),
        ('gbc', gbc_model),
        ('lgbm', lgbm_model),
        ('catboost', catboost_model),
        ('svc', svc_model)
    ],
    voting='soft',
    weights=[4, 2, 2, 2, 3, 1]  # Assign weights based on model strength
)
ensemble_model.fit(X_train, y_train)

# Cross validation to evaluate model robustness
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
cv_scores = []
for train_index, test_index in skf.split(X_balanced, y_balanced):
    X_cv_train, X_cv_test = X_balanced[train_index], X_balanced[test_index]
    y_cv_train, y_cv_test = y_balanced[train_index], y_balanced[test_index]
    ensemble_model.fit(X_cv_train, y_cv_train)
    cv_scores.append(ensemble_model.score(X_cv_test, y_cv_test))

print("Cross Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

# Final evaluation
y_pred = ensemble_model.predict(X_test)

# Decode predictions
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_decoded, y_pred_decoded))

# Confusion matrix
conf_matrix = confusion_matrix(y_test_decoded, y_pred_decoded)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues",
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Save results
results_df = pd.DataFrame({
    "True_Label": y_test_decoded,
    "Predicted_Label": y_pred_decoded
})
results_df.to_csv("magnesium_predictions_optimized.csv", index=False)
print("Results saved to 'magnesium_predictions_optimized.csv'")


Final 88%

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load dataset
file_path = "Spectro_Magnesium.xlsx"
df = pd.read_excel(file_path)

# Define target and features
target = "Rule"
features = [col for col in df.columns if col not in ["SAMPLE_CODE", target, "MAGNESIUM"]]

X = df[features]
y = df[target]
sample_ids = df["SAMPLE_CODE"]  # Include sample IDs for tracking

# Handle missing values
X.fillna(X.median(), inplace=True)  # Impute numeric missing values with median
y.fillna(y.mode()[0], inplace=True)  # Impute target missing values with mode

# Outlier treatment
z_scores = np.abs(stats.zscore(X))
X = X[(z_scores < 3).all(axis=1)]  # Remove rows where z-score > 3 for any feature
y = y.loc[X.index]  # Keep target consistent with features
sample_ids = sample_ids.loc[X.index]  # Keep sample IDs consistent

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Robust scaling for better handling of small datasets and outliers
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Generate polynomial features to capture interactions
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X_scaled)
print("Shape after polynomial features:", X_poly.shape)

# Feature selection using SelectKBest
k_best_features = 50  # Choose top 50 features
selector = SelectKBest(score_func=f_classif, k=k_best_features)
X_selected = selector.fit_transform(X_poly, y_encoded)
print("Shape after SelectKBest:", X_selected.shape)

# Dimensionality reduction using PCA to retain meaningful components
pca = PCA(n_components=15)  # Retain more components to capture more variance
X_pca = pca.fit_transform(X_selected)
print("Shape after PCA:", X_pca.shape)

# Address class imbalance with SMOTE followed by ADASYN
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_pca, y_encoded)
sample_ids_smote = sample_ids.iloc[smote.fit_resample(np.arange(len(sample_ids)).reshape(-1, 1), y_encoded)[0].flatten()]

adasyn = ADASYN(random_state=42)
X_balanced, y_balanced = adasyn.fit_resample(X_smote, y_smote)
sample_ids_balanced = sample_ids_smote.iloc[adasyn.fit_resample(np.arange(len(sample_ids_smote)).reshape(-1, 1), y_smote)[0].flatten()]

print("Shape after hybrid oversampling:", X_balanced.shape)

# Train-test split
X_train, X_test, y_train, y_test, train_ids, test_ids = train_test_split(
    X_balanced, y_balanced, sample_ids_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

# Define classifiers
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gbc_model = GradientBoostingClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)
catboost_model = CatBoostClassifier(verbose=0, random_state=42)
svc_model = SVC(probability=True, random_state=42)

# Hyperparameter optimization using RandomizedSearchCV for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=30,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=5),
    random_state=42,
    n_jobs=-1
)
xgb_search.fit(X_train, y_train)
xgb_best_model = xgb_search.best_estimator_

# Ensemble model with stacking and weighted voting
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_best_model),
        ('rf', rf_model),
        ('gbc', gbc_model),
        ('lgbm', lgbm_model),
        ('catboost', catboost_model),
        ('svc', svc_model)
    ],
    voting='soft',
    weights=[4, 2, 2, 2, 3, 1]  # Assign weights based on model strength
)
ensemble_model.fit(X_train, y_train)

# Cross-validation to evaluate model robustness
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
cv_scores = []
for train_index, test_index in skf.split(X_balanced, y_balanced):
    X_cv_train, X_cv_test = X_balanced[train_index], X_balanced[test_index]
    y_cv_train, y_cv_test = y_balanced[train_index], y_balanced[test_index]
    ensemble_model.fit(X_cv_train, y_cv_train)
    cv_scores.append(ensemble_model.score(X_cv_test, y_cv_test))

print("Cross Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

# Final evaluation
y_pred = ensemble_model.predict(X_test)

# Decode predictions
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_decoded, y_pred_decoded))

# Confusion matrix
conf_matrix = confusion_matrix(y_test_decoded, y_pred_decoded)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues",
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Save results with sample IDs included
results_df = pd.DataFrame({
    "Sample_ID": test_ids.values,
    "True_Label": y_test_decoded,
    "Predicted_Label": y_pred_decoded
})
results_df.to_csv("magnesium_predictions_with_ids.csv", index=False)
print("Results saved to 'magnesium_predictions_with_ids.csv'")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)  # Impute numeric missing values with median


Shape after polynomial features: (113, 282377)


  f = msb / msw


Shape after SelectKBest: (113, 50)
Shape after PCA: (113, 15)
Shape after hybrid oversampling: (165, 15)


: 