In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, make_scorer
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

import shap


In het artikel geven ze aan dat ze met de red wine gewerkt hebben: "we
have chosen redwine data for our study because of its popularity over the
white wine."

In [None]:

# Location of the red-wine CSV, relative to the notebook's working directory
relative_path = 'wine+quality/winequality-red.csv'

# The file uses ';' as field separator, so it is passed explicitly
red_wine_df = pd.read_csv(relative_path, sep=';')

In [None]:

# Location of the white-wine CSV, relative to the notebook's working directory
# (this rebinds `relative_path`, which previously pointed at the red-wine file)
relative_path = 'wine+quality/winequality-white.csv'

# The file uses ';' as field separator, so it is passed explicitly
white_wine_df = pd.read_csv(relative_path, sep=';')




In [None]:
# Preview the top rows of the red-wine data (head() with its default row count)
red_wine_preview = red_wine_df.head()
print(red_wine_preview)

In [None]:
# Number of rows (samples) in the red-wine DataFrame
red_wine_df.shape[0]

In het artikel geven ze aan dat de dataset 4898 samples bevat: "The dataset contains the records of 4898 random samples of wine manufactured."
Dit komt niet overeen met het aantal samples in de redwine dataset, maar wel met het aantal samples in de whitewine dataset.

In [None]:
# Number of rows (samples) in the white-wine DataFrame
white_wine_df.shape[0]

In [None]:
# Per-column summary for the red wines: mean, std, min and max taken from
# describe() (transposed so each variable is a row), with the median appended
# as the last column.
statistics = (
    red_wine_df.describe()
    .T[['mean', 'std', 'min', 'max']]
    .assign(median=red_wine_df.median())
)

# Show the summary table as plain text
print(statistics)


In [None]:
# Per-column summary for the white wines: mean, std, min and max taken from
# describe() (transposed so each variable is a row), with the median appended
# as the last column. Note that this rebinds `statistics`.
statistics = (
    white_wine_df.describe()
    .T[['mean', 'std', 'min', 'max']]
    .assign(median=white_wine_df.median())
)

# Show the summary table as plain text
print(statistics)


Ook de statistics van table1 in het artikel "Descriptive statistics of the variables of the redwine data." komen overeen met de statistics van de white wine.

In [None]:
# Correlation of every white-wine column with the target `quality`:
# build the full correlation matrix first, then keep only the `quality` column.
white_corr_matrix = white_wine_df.corr()
correlation_coefficients = white_corr_matrix['quality']

# Show the coefficients as plain text
print(correlation_coefficients)


In [None]:
# Correlation of every red-wine column with the target `quality`:
# build the full correlation matrix first, then keep only the `quality` column.
# Note that this rebinds `correlation_coefficients`.
red_corr_matrix = red_wine_df.corr()
correlation_coefficients = red_corr_matrix['quality']

# Show the coefficients as plain text
print(correlation_coefficients)

Ook de table2 van het artikel komt overeen met de data van whitewine.

De cijfers van table3 zijn volgens mij de cijfers die we moeten reproduceren.

"feature scaling is a very important step one need to take care of, before
training any ML model."
"To scale the features of the dataset, standardization has
been used."


In [None]:
# Create the (still unfitted) StandardScaler used for the standardization step;
# it is fitted and applied in the next cell.
scaler = StandardScaler()


In [None]:
# Fit the scaler on the white-wine table and standardize it in one step.
# NOTE(review): the scaler is fitted on the complete table, i.e. including the
# `quality` column and the rows that a later cell assigns to the test split, so
# the test rows influence the scaling statistics. Confirm that this order
# (scale first, split afterwards) is what the article did; otherwise fit the
# scaler on the training rows only.
white_wine_scaled = scaler.fit_transform(white_wine_df)


In [None]:
# Wrap the scaled values in a DataFrame again, reusing the original column labels
white_wine_scaled_df = pd.DataFrame(
    data=white_wine_scaled,
    columns=white_wine_df.columns,
)


In [None]:
# Preview the top rows of the standardized white-wine data
white_wine_scaled_preview = white_wine_scaled_df.head()
print(white_wine_scaled_preview)

"The data was split into training data set and testing data set in the ratio 3:1."


In [None]:
# Predictors: every standardized column except the target
X = white_wine_scaled_df.drop(columns='quality')
# Target: taken from the unscaled frame, so `quality` keeps its original scale
y = white_wine_df.loc[:, 'quality']


In [None]:
# 3:1 train/test split (25% held out); the fixed seed keeps the split reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts


"In this work, we
have used radial basis kernel (RBF) because it outperformed other kernels based
SVR in redwine dataset."

"The optimal values
the parameters computed using 10-fold cross-validation are cost = 0.95 and
gamma = 0.13."

De cross-validatie kunnen we ook nog nabootsen.

In [None]:
# SVR hyper-parameters: RBF kernel with the cost and gamma values quoted from the article
svr_params = {'kernel': 'rbf', 'C': 0.95, 'gamma': 0.13}

# Build the model and fit it on the training split
svr = SVR(**svr_params)
svr.fit(X_train, y_train)


In [None]:
# Predict on both splits with the fitted model: training set first, then test set
y_train_pred, y_test_pred = (
    svr.predict(split_features) for split_features in (X_train, X_test)
)


In [None]:
# Sanity check: number of predictions and their container type for each split
print(f"Length of y_train_pred: {len(y_train_pred)}, Type: {type(y_train_pred)}")
print(f"Length of y_test_pred: {len(y_test_pred)}, Type: {type(y_test_pred)}\n")

# Same check for the feature splits; the lengths should match the predictions above
print(f"Length of X_train: {len(X_train)}, Type: {type(X_train)}")
print(f"Length of X_test: {len(X_test)}, Type: {type(X_test)}\n")




![image.png](attachment:image.png)

Te controleren: is dit echt de Pearson correlation coefficient?

In [None]:
# Function to calculate R (Pearson correlation coefficient), MSE, and MAPE
def calculate_metrics(y_true, y_pred):
    """Score predictions against the observed target values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    tuple
        ``(r, mse, mape)``: the Pearson correlation coefficient, the mean
        squared error and the mean absolute percentage error.
        NOTE(review): scikit-learn is understood to report MAPE as a fraction
        rather than a percentage -- confirm before comparing with the article.
    """
    # pearsonr yields (coefficient, p-value); only the coefficient is reported
    pearson_r, _p_value = pearsonr(y_true, y_pred)

    return (
        pearson_r,
        mean_squared_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
    )


In [None]:
# Score both splits with the shared helper, then unpack into named values
train_metrics = calculate_metrics(y_train, y_train_pred)
test_metrics = calculate_metrics(y_test, y_test_pred)

train_r, train_mse, train_mape = train_metrics
test_r, test_mse, test_mape = test_metrics

# Report R, MSE and MAPE for the training split, then for the test split
print(f"Training Data Set Metrics:\nR: {train_r}\nMSE: {train_mse}\nMAPE: {train_mape}")
print(f"Testing Data Set Metrics:\nR: {test_r}\nMSE: {test_mse}\nMAPE: {test_mape}")


![image.png](attachment:image.png)

Analyze the impact of the parameters using SHAP values.

In [None]:
# Seeded subsamples: 100 training rows as the explainer's background data,
# 20 test rows as the instances to explain.
background_data = X_train.sample(n=100, random_state=1)
test_data = X_test.sample(n=20, random_state=1)

# Kernel explainer built around the fitted model's predict function
explainer = shap.KernelExplainer(svr.predict, background_data)

# SHAP values for the sampled test rows
shap_values = explainer.shap_values(test_data)

# Summary plot of the SHAP values across all features
shap.summary_plot(shap_values, test_data, feature_names=X.columns)

In [None]:
# 4 x 3 grid of axes, flattened so panels can be addressed by feature position
fig, ax = plt.subplots(nrows=4, ncols=3, figsize=(20, 15))
ax = ax.flatten()
n_features = X.shape[1]

# One dependence plot per feature, each drawn into its own panel
for feature_idx in range(n_features):
    shap.dependence_plot(
        feature_idx,
        shap_values,
        test_data,
        feature_names=X.columns,
        ax=ax[feature_idx],
        show=False,
    )

# Remove the panels that are left over once every feature has been plotted
for unused_ax in ax[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Load SHAP's JavaScript so the interactive force plot can render in the notebook
shap.initjs()

# Stacked force plot for the explained test rows. The rows themselves are passed
# as `features` so the plot can show the input value behind each contribution
# (these are the standardized values, since `test_data` comes from the scaled
# frame). Without `features` the plot is drawn with no feature values.
shap.plots.force(
    explainer.expected_value,
    shap_values,
    features=test_data,
    feature_names=X.columns,
)