In [113]:
import numpy as np
import pandas as pd

In [114]:
def split_train_test(df, species, indices=(), random_state=None):
    """Hold out one randomly chosen row per species as the test set.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing a 'Species' column. Held-out rows are removed from
        the training frames by index label, so the index should be unique.
    species : iterable
        Species labels to split on; one training frame is produced per label,
        in the same order.
    indices : sequence, optional
        Currently unused; kept so existing callers keep working.
    random_state : int, optional
        Seed for a reproducible split. When None (the default) sampling draws
        from NumPy's global random state, exactly as before.

    Returns
    -------
    train_dfs : list of pd.DataFrame
        One frame per species, containing every row of that species except
        the held-out one.
    test_df : pd.DataFrame
        The held-out rows, one per species (empty if `species` is empty).

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a species has no rows in `df`.
    """
    # A single generator shared by all species, so one seed does not select
    # the same position in every class.
    rng = None if random_state is None else np.random.RandomState(random_state)

    train_dfs = []
    test_samples = []
    for specie in species:
        specie_df = df[df['Species'] == specie]
        test_sample = specie_df.sample(n=1, random_state=rng)
        test_samples.append(test_sample)
        train_dfs.append(specie_df.drop(test_sample.index))

    # Concatenate once instead of growing a frame inside the loop;
    # pd.concat rejects an empty list, so keep the empty-frame result.
    test_df = pd.concat(test_samples) if test_samples else pd.DataFrame()
    return train_dfs, test_df

In [115]:
def mahalanobis_distance(x, df: pd.DataFrame):
    """Mahalanobis distance from the point `x` to the distribution of `df`.

    Parameters
    ----------
    x : array-like, 1-D
        Feature vector with one entry per column of `df`.
    df : pd.DataFrame
        Numeric samples (rows) whose column means and covariance matrix
        (as computed by ``DataFrame.cov``) describe the distribution.

    Returns
    -------
    float
        sqrt((x - mean)^T cov^-1 (x - mean)).

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    diff = np.asarray(x, dtype=float) - df.mean().values
    cov_mat = df.cov().values
    # Solve cov @ z = diff instead of forming the explicit inverse: it is
    # cheaper and numerically better conditioned.
    z = np.linalg.solve(cov_mat, diff)
    squared = diff @ z
    # Rounding can push a zero distance marginally below zero; clamp so the
    # square root does not return NaN.
    return np.sqrt(max(squared, 0.0))

In [116]:
def mahalanobis_classify(test_df, train_dfs, species):
    """Assign each test row to the class with the smallest Mahalanobis distance.

    Parameters
    ----------
    test_df : pd.DataFrame
        Feature rows to classify.
    train_dfs : list of pd.DataFrame
        One feature frame per class, in the same order as `species`.
    species : sequence
        Class labels; ``species[i]`` is the label for ``train_dfs[i]``.

    Returns
    -------
    y_pred : list
        Predicted label for each test row.
    y_dist : list
        Distance from each test row to its predicted class.
    """
    y_pred, y_dist = [], []
    for row in test_df.values:
        # Distance from this row to every class distribution.
        per_class = [mahalanobis_distance(row, class_df) for class_df in train_dfs]
        nearest = np.argmin(per_class)
        y_pred.append(species[nearest])
        y_dist.append(per_class[nearest])
    return y_pred, y_dist

In [117]:
def accuracy(y_test, y_pred):
    """Fraction of predictions that match the true labels.

    Labels are compared position by position, so any mix of lists, arrays
    and pandas Series works; Series index labels are ignored.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as `y_test`.

    Returns
    -------
    float
        Share of matching positions in [0, 1]; NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs differ in shape.
    """
    # Convert first: `list == list` yields a single bool rather than an
    # element-wise comparison, and Series comparison aligns on index labels.
    actual = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_test and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )
    if actual.size == 0:
        return float('nan')
    return float(np.mean(actual == predicted))

# Load

In [118]:
# Read the CSV and discard its 'Id' column.
iris_df = pd.read_csv('iris.csv').drop(columns='Id')
iris_df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


# Split Test-Train

In [119]:
# Distinct class labels, then one held-out row per class.
species = iris_df.loc[:, 'Species'].unique()
train_dfs, test_df = split_train_test(df=iris_df, species=species)
test_df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
37,4.9,3.1,1.5,0.1,Iris-setosa
92,5.8,2.6,4.0,1.2,Iris-versicolor
146,6.3,2.5,5.0,1.9,Iris-virginica


In [120]:
# Separate the feature columns (X) from the 'Species' label (y),
# per class for the training frames and once for the test frame.
X_train_dfs = [class_df.drop(columns='Species') for class_df in train_dfs]
y_train_dfs = [class_df['Species'] for class_df in train_dfs]
X_test = test_df.drop(columns='Species')
y_test = test_df['Species']

In [121]:
# Classify the held-out rows and tabulate actual vs. predicted labels.
y_pred, y_dist = mahalanobis_classify(
    test_df=X_test,
    train_dfs=X_train_dfs,
    species=species,
)
result_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Distance': y_dist,
})
result_df

Unnamed: 0,Actual,Predicted,Distance
37,Iris-setosa,Iris-setosa,1.732058
92,Iris-versicolor,Iris-versicolor,0.716156
146,Iris-virginica,Iris-virginica,2.111874


# Performance

In [122]:
# Share of held-out rows whose predicted species matches the actual one.
print('Accuracy:', accuracy(y_test=y_test, y_pred=y_pred))

Accuracy: 1.0
