# Machine Learning Assignment
Στο συγκεκριμένο notebook βρίσκονται οι ασκήσεις του μαθήματος "Μηχανική Μάθηση" της χρονιάς 2024-2025.

Το dataset που χρησιμοποίησα είναι το Wine Quality Dataset του πανεπιστημίου UC Irvine (https://archive.ics.uci.edu/dataset/186/wine+quality). Πρόκειται για ένα dataset με 11 features και 1 label, την ποιότητα του κρασιού, που είναι ένα σκορ από 0 έως 10. Για τους σκοπούς της άσκησης, θα υποθέσουμε ότι το κάθε σκορ μπορεί να ανήκει σε μία από 5 κατηγορίες:

*   Κάκιστο (Very Bad. Score: 0 ≤ s < 2)
*   Κακό (Bad. Score: 2 ≤ s < 4)
*   Εντάξει (Alright. Score: 4 ≤ s < 6)
*   Καλό (Good. Score: 6 ≤ s < 8)
*   Άριστο (Excellent. Score: 8 ≤ s ≤ 10)

Τα ερωτήματα είναι τεράστια για ένα notebook, οπότε αποφάσισα να τα χωρίσω ανά 2. Το συγκεκριμένο notebook θα έχει τα παρακάτω ερωτήματα:

7.   Support Vector Machine (SVM)
8.   K Means

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split

# Mount Google Drive under /content/drive so files stored there can be accessed.
from google.colab import drive
drive.mount('/content/drive')

# Make the data folder the working directory so later reads can use bare file names.
import os
os.chdir('/content/drive/MyDrive/MLAssignment/Data')

Mounted at /content/drive


# Data Preprocessing

In [None]:
# Load the red-wine CSV from the current working directory; fields are ';'-separated.
wines = pd.read_csv('winequality-red.csv', sep=';')

# Every column except "quality" is an input feature; "quality" is the target.
features = wines.drop("quality", axis=1)
label = wines["quality"]

# Plain NumPy views of the data for the hand-written models below.
X = np.array(features)
y = wines[["quality"]].to_numpy()  # double brackets keep a 2-D array of shape (n, 1)

# I am aware this method is not good for edge cases, but it is what it is

# Changed the classification from string to int in order to build one-hot vectors more easily later

def classify(y):
    """Map a quality score to an integer category index in 0..4.

    Buckets are half-open on the right: 0 for ``y < 2``, 1 for ``2 <= y < 4``,
    2 for ``4 <= y < 6``, 3 for ``6 <= y < 8`` and 4 for anything that is not
    below 8.

    Args:
        y: A numeric score (a scalar or a single-element array).

    Returns:
        The category index as a plain ``int``.
    """
    upper_bounds = (2.0, 4.0, 6.0, 8.0)
    for category, bound in enumerate(upper_bounds):
        if y < bound:
            return category
    # Not below any bound: the score belongs to the top category.
    return len(upper_bounds)

num_classes = 5

# Each row of y holds one quality score; classify() turns it into a category
# index, giving a 1-D integer array with one entry per sample.
y_classified = np.array([classify(score) for score in y])

# One-hot encode with a fixed width of num_classes. Deriving the width from
# y_classified.max() would silently produce fewer columns whenever the top
# category happens to be absent from the data.
y_one_hot = np.eye(num_classes)[y_classified]

In [None]:
def normalize_features(X):
    """Standardize every feature column to zero mean and unit standard deviation.

    Args:
        X: 2-D array of shape (n_samples, n_features).

    Returns:
        An array of the same shape where each column has had its mean
        subtracted and been divided by its standard deviation. Columns that
        are constant (standard deviation 0) come out as all zeros.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant column has std == 0; dividing by it would yield NaN/inf.
    # Dividing by 1 instead leaves the (already zero) centered values intact.
    safe_std = np.where(std == 0, 1.0, std)
    X_normalized = (X - mean) / safe_std
    return X_normalized

# Standardized copy of the feature matrix; this is what the train/test splits below use.
X_normalized = normalize_features(X)

# 7) Support Vector Machine (SVM)

In [None]:
class SVM:
    """Linear soft-margin SVM trained with per-sample sub-gradient descent.

    The classifier is binary (one-vs-rest): samples whose label equals
    ``class_pred`` form the positive class (+1), every other sample the
    negative class (-1). The decision function is ``sign(X @ w - b)``.

    Args:
        learning_rate: Step size of each update.
        _l: L2 regularization strength (the lambda multiplying ``w``).
        n_iters: Number of full passes over the training data.
        class_pred: Label value treated as the positive class.
    """

    def __init__(self, learning_rate=0.001, _l=0.01, n_iters=1000, class_pred=3):
        self.lr = learning_rate
        self._l = _l
        self.n_iters = n_iters
        self.class_pred = class_pred
        # Learned parameters; they stay None until fit() is called.
        self.w = None
        self.b = None

    def fit(self, X, y_classified):
        """Learn the weight vector and bias from labelled samples.

        Args:
            X: 2-D array of shape (n_samples, n_features).
            y_classified: Integer class label per sample, either 1-D of
                length n_samples or a column vector of shape (n_samples, 1).
                One-hot labels are not supported.

        Raises:
            ValueError: If the number of labels differs from the number of
                samples.
        """
        X = np.asarray(X)
        n_samples, n_features = X.shape

        # Flatten so that (n, 1) column vectors behave like 1-D label arrays;
        # otherwise each label would be a length-1 array and the weight update
        # would fail with a shape mismatch.
        labels = np.asarray(y_classified).ravel()
        if labels.shape[0] != n_samples:
            raise ValueError(
                f"Expected {n_samples} labels, got {labels.shape[0]}"
            )

        # One-vs-rest targets: +1 for the chosen class, -1 for everything else.
        _y = np.where(labels != self.class_pred, -1, 1)

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for x_i, y_i in zip(X, _y):
                margin_satisfied = y_i * (np.dot(x_i, self.w) - self.b) >= 1
                if margin_satisfied:
                    # Sample is outside the margin: only the regularizer contributes.
                    self.w -= self.lr * (2 * self._l * self.w)
                else:
                    # Sample violates the margin: add the hinge-loss sub-gradient.
                    self.w -= self.lr * (2 * self._l * self.w - x_i * y_i)
                    self.b -= self.lr * y_i

    def predict(self, X):
        """Return the sign of the decision function for every row of X.

        Args:
            X: 2-D array of shape (n_samples, n_features).

        Returns:
            Array with +1.0 for the positive class, -1.0 for the negative
            class, and 0.0 for samples lying exactly on the decision boundary.
        """
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)

In [None]:
# Split the standardized features and integer class labels into train and test
# sets: 20% of the samples are held out, and random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_classified, test_size=0.2, random_state=42)

In [None]:
# Set class_test to the class index the SVM should detect. The model is binary:
# samples of this class are the positives, all other classes are the negatives.

class_test = 3
test = SVM(class_pred=class_test)

In [None]:
# Train the SVM on the training split; this sets test.w and test.b.
test.fit(X_train, y_train)

In [None]:
# Ground truth for the binary task, encoded like the SVM output:
# +1 where the test sample belongs to class_test, -1 otherwise.
y_true = np.where(y_test == class_test, 1, -1)
y_pred = test.predict(X_test)

# Accuracy is the fraction of test samples whose predicted sign matches the
# ground truth. (Dividing the number of predicted positives by the number of
# actual positives is not accuracy and can exceed 1.)
accuracy = np.mean(y_pred == y_true)
print (f"Accuracy = {accuracy}")

Accuracy = 0.8908045977011494


# 8) K-Means

In [None]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between the points x1 and x2."""
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

class KMeans:
    """K-Means clustering with centroids initialised from random samples.

    Args:
        K: Number of clusters.
        max_iters: Upper bound on the number of assign/update rounds.
        plot_steps: If True, call ``plot()`` after each assignment step and
            after each centroid update.
        random_state: Optional seed for the centroid initialisation. When
            None, NumPy's global random state is used.
    """

    def __init__(self, K=5, max_iters=100, plot_steps=False, random_state=None):
        self.K = K
        self.max_iters = max_iters
        self.plot_steps = plot_steps
        self.random_state = random_state

        # list of sample indices for each cluster
        self.clusters = [[] for _ in range(self.K)]

        # the centers (mean vector) for each cluster
        self.centroids = []

    def predict(self, X):
        """Cluster the rows of X and return the cluster index of each row.

        Args:
            X: 2-D array of shape (n_samples, n_features).

        Returns:
            Float array of length n_samples holding each sample's cluster
            index.
        """
        self.X = np.asarray(X)
        self.n_samples, self.n_features = self.X.shape

        # initialize: pick K distinct samples as the starting centroids
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        random_sample_idxs = rng.choice(self.n_samples, self.K, replace=False)
        self.centroids = self.X[random_sample_idxs].astype(float)

        # optimize clusters
        for _ in range(self.max_iters):
            # assign samples to closest centroids (create clusters)
            self.clusters = self._create_clusters(self.centroids)

            if self.plot_steps:
                self.plot()

            # calculate new centroids from the clusters
            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)

            if self._is_converged(centroids_old, self.centroids):
                break

            if self.plot_steps:
                self.plot()

        # classify samples as the index of their clusters
        return self._get_cluster_labels(self.clusters)

    def plot(self):
        """Scatter-plot the current clusters and centroids.

        Only the first two feature columns are drawn (the first column is
        reused for both axes when the data has a single feature).
        """
        # Imported locally so the class does not need matplotlib unless plotting.
        import matplotlib.pyplot as plt

        y_col = 1 if self.n_features > 1 else 0
        fig, ax = plt.subplots(figsize=(12, 8))
        for cluster in self.clusters:
            points = self.X[np.asarray(cluster, dtype=int)]
            ax.scatter(points[:, 0], points[:, y_col])
        for centroid in self.centroids:
            ax.scatter(centroid[0], centroid[y_col], marker="x", color="black", linewidth=2)
        plt.show()

    def _get_cluster_labels(self, clusters):
        """Turn per-cluster index lists into one label per sample."""
        # -1 marks a sample that appears in no cluster (e.g. max_iters == 0)
        labels = np.full(self.n_samples, -1.0)
        for cluster_idx, cluster in enumerate(clusters):
            labels[np.asarray(cluster, dtype=int)] = cluster_idx
        return labels

    def _distances_to_centroids(self, samples, centroids):
        """Return the (n_samples, n_centroids) matrix of Euclidean distances."""
        samples = np.asarray(samples, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        diff = samples[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        return np.sqrt(np.sum(diff ** 2, axis=2))

    def _create_clusters(self, centroids):
        """Assign every sample to its closest centroid.

        Returns:
            A list of K lists; entry k holds the (ascending) indices of the
            samples assigned to centroid k. Ties go to the lowest index.
        """
        nearest = np.argmin(self._distances_to_centroids(self.X, centroids), axis=1)
        return [np.flatnonzero(nearest == k).tolist() for k in range(self.K)]

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid closest to a single sample."""
        row = np.asarray(sample)[np.newaxis, :]
        distances = self._distances_to_centroids(row, centroids)[0]
        closest_idx = np.argmin(distances)
        return closest_idx

    def _get_centroids(self, clusters):
        """Compute each cluster's mean vector as its new centroid.

        A cluster with no samples keeps its previous centroid (taken from
        ``self.centroids``) instead of becoming NaN from a mean over nothing.
        """
        previous = np.asarray(self.centroids, dtype=float)
        has_previous = previous.shape == (self.K, self.n_features)

        centroids = np.zeros((self.K, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            if len(cluster) > 0:
                centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
            elif has_previous:
                centroids[cluster_idx] = previous[cluster_idx]
        return centroids

    def _is_converged(self, centroids_old, centroids):
        """Return True when no centroid moved during the last update."""
        return np.array_equal(np.asarray(centroids_old), np.asarray(centroids))

In [None]:
# Same 80/20 split (same seed) as in the SVM section, repeated here.
# NOTE(review): the K-Means cell below clusters X directly, so these four
# variables do not appear to be used in this section - confirm.
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_classified, test_size=0.2, random_state=42)

In [None]:
# K-Means with the default settings (K=5 clusters, at most 100 iterations).
test = KMeans()
# NOTE(review): this clusters the raw feature matrix X rather than X_normalized;
# Euclidean distances on unscaled features are dominated by the columns with the
# largest numeric range - confirm this is intended.
test.predict(X)

array([4., 2., 2., ..., 4., 4., 4.])