In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
    mean_squared_error, r2_score,
    silhouette_score
)
import numpy as np
import pandas as pd
from collections import Counter

# Load dataset
data = load_wine()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# ----------- 1. Classification (Supervised Learning) -----------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("=== Classification ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ----------- 2. Regression (Supervised Learning) -----------
# Predict 'malic_acid' using other features
X_reg = X.drop(columns='malic_acid')
y_reg = X['malic_acid']
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

reg = LinearRegression()
reg.fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)

print("\n=== Regression ===")
print("MSE:", mean_squared_error(y_test_r, y_pred_r))
print("R² Score:", r2_score(y_test_r, y_pred_r))

# ----------- 3. Clustering (Unsupervised Learning) -----------
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Silhouette Score
sil_score = silhouette_score(X, cluster_labels)

# Purity calculation
def purity_score(y_true, y_pred):
    contingency_matrix = pd.crosstab(y_true, y_pred)
    return np.sum(np.amax(contingency_matrix.values, axis=0)) / np.sum(contingency_matrix.values)

purity = purity_score(y, cluster_labels)

print("\n=== Clustering ===")
print("Silhouette Score:", sil_score)
print("Purity Score:", purity)


=== Classification ===
Accuracy: 0.9814814814814815
Precision (macro): 0.9848484848484849
Recall (macro): 0.9824561403508771
F1 Score (macro): 0.9832390530064948
Confusion Matrix:
 [[18  1  0]
 [ 0 21  0]
 [ 0  0 14]]

=== Regression ===
MSE: 1.2016123956859275
R² Score: 0.0680653350744389

=== Clustering ===
Silhouette Score: 0.5595823478987213
Purity Score: 0.6853932584269663


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
