# Application of classical machine learning and MLP methods to predict the flavor of wine by its chemical composition
In this section, classical machine learning was applied to predict the flavor of wine from its chemical composition. Several algorithms were used, including decision trees, boosting methods, and MLP, as these approaches are most effective on tabular data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score

## 1. Data loading and processing
In order to use the obtained chemical composition matrices for classification problems using classical ML methods, it is necessary to "flatten" the matrix into a vector

In [3]:
# Loading matrices and target lists
X_array = np.load('X_array.npy')
Y_array = np.load('Y_array.npy')
X_array.shape

(449, 44, 100)

In [6]:
# Reshaping X_array
X_flattened = X_array.reshape(449, 44 * 100)
# Number of target parameters
num_targets = Y_array.shape[1]
target_names = ['Herbs and spices', 'Tobacco/Smoke', 'Wood', 'Berries', 'Citrus',
                'Fruits', 'Nuts', 'Coffee', 'Chocolate/Cacao', 'Flowers']

# Creating lists to store results
results = {
    'Target': [],
    'Model': [],
    'Train Accuracy': [],
    'Train F1-Score': [],
    'CV Accuracy': [],
    'CV Accuracy Std': [],  
    'CV F1-Score': [],
    'CV F1-Score Std': [],  
    'Test Accuracy': [],
    'Test F1-Score': []
}

# Function for cross-validation and model evaluation
def cross_validate_and_evaluate(model_name, model, X_train, y_train, X_test, y_test):
    # Train model
    model.fit(X_train, y_train)
    
    # Train set metrics
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    
    # Cross-validation metrics
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scorer = make_scorer(accuracy_score)
    f1_scorer = make_scorer(f1_score, average='weighted')
    
    cv_accuracies = cross_val_score(model, X_train, y_train, cv=kf, scoring=accuracy_scorer)
    cv_f1_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=f1_scorer)
    
    cv_acc_mean = cv_accuracies.mean()
    cv_acc_std  = cv_accuracies.std(ddof=1)  # sample std
    cv_f1_mean  = cv_f1_scores.mean()
    cv_f1_std   = cv_f1_scores.std(ddof=1)

    # Test (один сплит → std не существует на уровне одной задачи)
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    return (train_accuracy, train_f1,
            cv_acc_mean, cv_acc_std, cv_f1_mean, cv_f1_std,
            test_accuracy, test_f1)

# Main loop for training and evaluating models
for i in range(num_targets):
    # Select the i-th target parameter
    y = Y_array[:, i]

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_flattened, y, test_size=0.2, random_state=42)

    # Models for training
    models = {
        'RandomForest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
        'CatBoost': CatBoostClassifier(random_state=42, silent=True),
        'MLP': MLPClassifier(random_state=42)
    }
    
    # Train and evaluate each model
for model_name, model in models.items():
        (train_acc, train_f1,
         cv_acc_mean, cv_acc_std, cv_f1_mean, cv_f1_std,
         test_acc, test_f1) = cross_validate_and_evaluate(model_name, model, X_train, y_train, X_test, y_test)

        results['Target'].append(target_names[i])
        results['Model'].append(model_name)
        results['Train Accuracy'].append(train_acc)
        results['Train F1-Score'].append(train_f1)
        results['CV Accuracy'].append(cv_acc_mean)
        results['CV Accuracy Std'].append(cv_acc_std)      # сохраняем std по фолдам
        results['CV F1-Score'].append(cv_f1_mean)
        results['CV F1-Score Std'].append(cv_f1_std)       # сохраняем std по фолдам
        results['Test Accuracy'].append(test_acc)
        results['Test F1-Score'].append(test_f1)

        print(f"Target {target_names[i]} - {model_name} | "
              f"Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f} | "
              f"CV Acc: {cv_acc_mean:.4f} ± {cv_acc_std:.4f}, CV F1: {cv_f1_mean:.4f} ± {cv_f1_std:.4f} | "
              f"Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")

# Create DataFrame with results
results_df = pd.DataFrame(results)


Target Flowers - RandomForest | Train Acc: 1.0000, Train F1: 1.0000 | CV Acc: 0.7495 ± 0.0556, CV F1: 0.7370 ± 0.0650 | Test Acc: 0.6333, Test F1: 0.6347


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Target Flowers - XGBoost | Train Acc: 1.0000, Train F1: 1.0000 | CV Acc: 0.7078 ± 0.0935, CV F1: 0.7048 ± 0.0918 | Test Acc: 0.6556, Test F1: 0.6542
Target Flowers - CatBoost | Train Acc: 0.9387, Train F1: 0.9373 | CV Acc: 0.7579 ± 0.0602, CV F1: 0.7463 ± 0.0689 | Test Acc: 0.6556, Test F1: 0.6542




Target Flowers - MLP | Train Acc: 0.7604, Train F1: 0.7352 | CV Acc: 0.7106 ± 0.0721, CV F1: 0.6813 ± 0.0896 | Test Acc: 0.6778, Test F1: 0.6510




In [None]:
summary = (
    results_df
      .groupby('Model', as_index=False)
      .agg(
          cv_acc_mean_over_targets = ('CV Accuracy', 'mean'),
          cv_acc_std_over_targets  = ('CV Accuracy', 'std'),
          test_acc_mean_over_targets = ('Test Accuracy', 'mean'),
          test_acc_std_over_targets  = ('Test Accuracy', 'std')
      )
)

summary_new = summary.assign(
    **{
        'CV Accuracy (mean±std)':   summary['cv_acc_mean_over_targets'].map('{:.4f}'.format) +
                                     ' ± ' + summary['cv_acc_std_over_targets'].map('{:.4f}'.format),
        'Test Accuracy (mean±std)': summary['test_acc_mean_over_targets'].map('{:.4f}'.format) +
                                     ' ± ' + summary['test_acc_std_over_targets'].map('{:.4f}'.format),
    }
)[['Model', 'CV Accuracy (mean±std)', 'Test Accuracy (mean±std)']]

# (опционально) сводка по F1
summary_f1 = (
    results_df
      .groupby('Model', as_index=False)
      .agg(
          cv_f1_mean_over_targets   = ('CV F1-Score', 'mean'),
          cv_f1_std_over_targets    = ('CV F1-Score', 'std'),
          test_f1_mean_over_targets = ('Test F1-Score', 'mean'),
          test_f1_std_over_targets  = ('Test F1-Score', 'std')
      )
).assign(
    **{
        'CV F1 (mean±std)':   lambda d: d['cv_f1_mean_over_targets'].map('{:.4f}'.format) +
                                        ' ± ' + d['cv_f1_std_over_targets'].map('{:.4f}'.format),
        'Test F1 (mean±std)': lambda d: d['test_f1_mean_over_targets'].map('{:.4f}'.format) +
                                        ' ± ' + d['test_f1_std_over_targets'].map('{:.4f}'.format),
    }
)[['Model', 'CV F1 (mean±std)', 'Test F1 (mean±std)']]

In [8]:
display(results_df.head())
display(summary_new)
display(summary_f1)
    

Unnamed: 0,Target,Model,Train Accuracy,Train F1-Score,CV Accuracy,CV Accuracy Std,CV F1-Score,CV F1-Score Std,Test Accuracy,Test F1-Score
0,Flowers,RandomForest,1.0,1.0,0.749531,0.055553,0.736966,0.065007,0.633333,0.634676
1,Flowers,XGBoost,1.0,1.0,0.707825,0.093453,0.704843,0.091767,0.655556,0.654179
2,Flowers,CatBoost,0.938719,0.937346,0.757942,0.060193,0.746288,0.068899,0.655556,0.654179
3,Flowers,MLP,0.760446,0.735236,0.710642,0.072118,0.681308,0.089596,0.677778,0.651022


Unnamed: 0,Model,CV Accuracy (mean±std),Test Accuracy (mean±std)
0,CatBoost,0.7579 ± nan,0.6556 ± nan
1,MLP,0.7106 ± nan,0.6778 ± nan
2,RandomForest,0.7495 ± nan,0.6333 ± nan
3,XGBoost,0.7078 ± nan,0.6556 ± nan


Unnamed: 0,Model,CV F1 (mean±std),Test F1 (mean±std)
0,CatBoost,0.7463 ± nan,0.6542 ± nan
1,MLP,0.6813 ± nan,0.6510 ± nan
2,RandomForest,0.7370 ± nan,0.6347 ± nan
3,XGBoost,0.7048 ± nan,0.6542 ± nan
