In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
# Load the rice dataset from the ARFF file. data[0] holds the records that
# become the DataFrame rows; the 'Class' values arrive as byte strings
# (e.g. b'Cammeo'), as the preview printed below shows.
data = arff.loadarff('Rice_Cammeo_Osmancik.arff')
df = pd.DataFrame(data[0])
# Last expression of the cell: display the first five rows.
df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231.0,525.578979,229.749878,85.093788,0.928882,15617.0,0.572896,b'Cammeo'
1,14656.0,494.311005,206.020065,91.730972,0.895405,15072.0,0.615436,b'Cammeo'
2,14634.0,501.122009,214.106781,87.768288,0.912118,14954.0,0.693259,b'Cammeo'
3,13176.0,458.342987,193.337387,87.448395,0.891861,13368.0,0.640669,b'Cammeo'
4,14688.0,507.166992,211.743378,89.312454,0.906691,15262.0,0.646024,b'Cammeo'


In [None]:
# Decode the byte-string labels into a separate Series instead of overwriting
# df['Class']. Mutating df in place made this cell fail when run a second time
# (ints have no .decode()) and broke any other cell that decodes the column.
# The isinstance guard keeps the cell working even if the column was already
# decoded elsewhere.
class_names = df['Class'].map(
    lambda label: label.decode('utf-8') if isinstance(label, bytes) else label
)

# Encode the class labels manually: the integer code of a class is its
# position in class_labels (order of first appearance in the data).
class_labels = class_names.unique()
class_map = {label: idx for idx, label in enumerate(class_labels)}

# Features (every column except 'Class') and integer-encoded labels
X = df.drop('Class', axis=1).values
y = class_names.map(class_map).values

# Split the data into training and testing sets
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly partition samples and labels into train and test subsets.

    Parameters
    ----------
    X : array-like, shape (n_samples, ...)
        Feature rows; indexed along the first axis.
    y : array-like, shape (n_samples, ...)
        Labels aligned with the rows of ``X``.
    test_size : float, default 0.2
        Fraction of the samples (between 0 and 1) placed in the test subset.
        The test subset holds ``int(n_samples * test_size)`` rows.
    random_state : int or None, default None
        Seed for a dedicated generator, giving a reproducible split. When
        None, NumPy's global RNG is used (so ``np.random.seed`` still controls
        the result, exactly as before this parameter existed).

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f'X and y must have the same number of samples, got {n_samples} and {y.shape[0]}'
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be between 0 and 1, got {test_size}')

    indices = np.arange(n_samples)
    if random_state is None:
        # Legacy path: honour whatever state the global RNG is in.
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)

    # Separate name so the fractional parameter is not shadowed by the count.
    n_test = int(n_samples * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Seed NumPy's global RNG before splitting: train_test_split shuffles with it,
# so without a seed every kernel restart yields a different split and
# different metrics.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Decision Tree Classifier from scratch
class DecisionTree:
    """Binary classification tree grown greedily by minimising Gini impurity.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of splits on any root-to-leaf path. None grows the tree
        until no split lowers the impurity.

    Attributes
    ----------
    tree : Node or None
        Root of the fitted tree; None before ``fit`` is called.
    classes_ : numpy.ndarray or None
        Sorted distinct labels seen by ``fit``. Internally labels are handled
        as indices into this array so that per-class counts always have one
        slot per class, even in nodes where some classes are absent.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None
        self.classes_ = None
        self.n_classes_ = 0

    def fit(self, X, y):
        """Build the tree from feature matrix ``X`` (n_samples, n_features) and labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or disagrees with ``y`` in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f'X must be 2-dimensional, got {X.ndim} dimension(s)')
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f'X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}'
            )
        if X.shape[0] == 0:
            raise ValueError('cannot fit a tree on zero samples')

        # Encode labels as 0..n_classes-1 so they can index the count lists.
        self.classes_, y_encoded = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        self.tree = self._grow_tree(X, y_encoded.reshape(-1))

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as an ndarray.

        An ndarray (rather than a list) is returned so that element-wise
        comparisons such as ``y_pred == label`` behave as expected.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError('DecisionTree.predict called before fit')
        return np.array([self._predict(inputs) for inputs in np.asarray(X)])

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X`` with encoded labels ``y``."""
        # One slot per class of the whole training set; counting only the
        # classes present in this node would make argmax return a position,
        # not a class, and mislabel nodes where a class is missing.
        counts = np.bincount(y, minlength=self.n_classes_)
        node = Node(
            gini=self._gini(y),
            num_samples=len(y),
            num_samples_per_class=counts.tolist(),
            predicted_class=self.classes_[np.argmax(counts)],
        )

        if self.max_depth is not None and depth >= self.max_depth:
            return node

        idx, thr = self._best_split(X, y)
        if idx is None:
            return node

        goes_left = X[:, idx] < thr
        node.feature_index = idx
        node.threshold = thr
        node.left = self._grow_tree(X[goes_left], y[goes_left], depth + 1)
        node.right = self._grow_tree(X[~goes_left], y[~goes_left], depth + 1)
        return node

    def _gini(self, y):
        """Gini impurity ``1 - sum(p_c ** 2)`` of the label array ``y``."""
        m = len(y)
        _, counts = np.unique(y, return_counts=True)
        return 1.0 - float(np.sum((counts / m) ** 2))

    def _best_split(self, X, y):
        """Find the (feature index, threshold) pair with the lowest weighted Gini.

        Returns ``(None, None)`` when no split improves on the node's own
        impurity (including pure nodes and nodes with a single sample).
        """
        m, n = X.shape
        if m <= 1:
            return None, None

        n_classes = self.n_classes_
        num_parent = np.bincount(y, minlength=n_classes).tolist()
        best_gini = 1.0 - sum((num / m) ** 2 for num in num_parent)
        best_idx, best_thr = None, None
        if best_gini == 0.0:
            # Pure node: nothing can be strictly better.
            return None, None

        for idx in range(n):
            # Sort the samples by this feature. Tie order is irrelevant
            # because no split is ever placed between equal values.
            order = np.argsort(X[:, idx], kind='stable')
            thresholds = X[order, idx].tolist()
            classes = y[order].tolist()

            num_left = [0] * n_classes
            num_right = num_parent.copy()
            for i in range(1, m):
                # Move sample i-1 from the right partition to the left one.
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1

                if thresholds[i] == thresholds[i - 1]:
                    continue

                gini_left = 1.0 - sum((cnt / i) ** 2 for cnt in num_left)
                gini_right = 1.0 - sum((cnt / (m - i)) ** 2 for cnt in num_right)
                gini = (i * gini_left + (m - i) * gini_right) / m

                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
                    if not thresholds[i - 1] < best_thr:
                        # Midpoint of two adjacent floats rounded down to the
                        # lower value; use the upper one so that `x < thr`
                        # still sends exactly the first i samples left.
                        best_thr = thresholds[i]
        return best_idx, best_thr

    def _predict(self, inputs):
        """Walk from the root to a leaf for one sample and return that leaf's class."""
        node = self.tree
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

class Node:
    """A single node of a DecisionTree.

    Leaves keep ``left`` and ``right`` as None; internal nodes send samples
    with ``x[feature_index] < threshold`` to ``left`` and the rest to ``right``.
    """

    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        # Split description; only meaningful once left/right are assigned.
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

# Initialize and train the classifier
clf = DecisionTree(max_depth=3)
clf.fit(X_train, y_train)

# Make predictions. Coerce the result to an ndarray so that the element-wise
# `y_pred == class_label` comparisons used by the metrics below work even if
# predict hands back a plain Python list.
y_pred = np.asarray(clf.predict(X_test))

# Evaluate the classifier
def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like (lists included); they are converted
    to ndarrays first so the comparison is always element-wise. Two plain
    lists would otherwise compare as whole objects and yield 0 or 1/len.

    Returns 0.0 for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}'
        )
    if y_true.size == 0:
        return 0.0
    return float(np.mean(y_true == y_pred))

def classification_report(y_true, y_pred, class_labels):
    """Build a text report of per-class precision, recall and F1-score.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. They are converted to ndarrays so that
        ``== class_label`` is element-wise; with a plain list that comparison
        is a single False and every metric would come out as 0.
    class_labels : iterable
        Labels to report on, in output order.

    Returns
    -------
    str
        Two lines per class. A ratio with a zero denominator is reported as 0.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}'
        )

    lines = []
    for class_label in class_labels:
        is_true = y_true == class_label
        is_pred = y_pred == class_label
        tp = np.sum(is_true & is_pred)
        fp = np.sum(~is_true & is_pred)
        fn = np.sum(is_true & ~is_pred)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        lines.append(f'Class {class_label}:\n')
        lines.append(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n')
    return ''.join(lines)

# Score the held-out predictions: per-class metrics and overall accuracy
report = classification_report(y_test, y_pred, np.unique(y))
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# One hand-written sample to classify (the same values as the first row
# displayed by df.head())
new_data = np.array([
    [15231.0, 525.578979, 229.749878, 85.093788, 0.928882, 15617.0, 0.572896]
])

# Run the fitted tree on the sample
new_predictions = clf.predict(new_data)

# Translate each integer code back into its class name via class_labels
new_predictions_decoded = list(map(lambda code: class_labels[code], new_predictions))

print('Predictions for new data:', new_predictions_decoded)


Accuracy: 0.9278215223097113
Classification Report:
Class 0:
Precision: 0.91, Recall: 0.92, F1-Score: 0.91
Class 1:
Precision: 0.94, Recall: 0.94, F1-Score: 0.94

Predictions for new data: ['Cammeo']


In [2]:
# Decode the byte-string labels into a separate Series instead of overwriting
# df['Class']. Mutating df in place made this cell fail when run a second time
# (ints have no .decode()) and whenever another cell had already decoded or
# encoded the column. The isinstance guard passes already-decoded values
# through untouched.
class_names = df['Class'].map(
    lambda label: label.decode('utf-8') if isinstance(label, bytes) else label
)

# Encode the class labels manually: the integer code of a class is its
# position in class_labels (order of first appearance in the data).
class_labels = class_names.unique()
class_map = {label: idx for idx, label in enumerate(class_labels)}

# Features (every column except 'Class') and integer-encoded labels
X = df.drop('Class', axis=1).values
y = class_names.map(class_map).values

# Split the data into training and testing sets
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly partition samples and labels into train and test subsets.

    Parameters
    ----------
    X : array-like, shape (n_samples, ...)
        Feature rows; indexed along the first axis.
    y : array-like, shape (n_samples, ...)
        Labels aligned with the rows of ``X``.
    test_size : float, default 0.2
        Fraction of the samples (between 0 and 1) placed in the test subset.
        The test subset holds ``int(n_samples * test_size)`` rows.
    random_state : int or None, default None
        Seed for a dedicated generator, giving a reproducible split. When
        None, NumPy's global RNG is used (so ``np.random.seed`` still controls
        the result, exactly as before this parameter existed).

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f'X and y must have the same number of samples, got {n_samples} and {y.shape[0]}'
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be between 0 and 1, got {test_size}')

    indices = np.arange(n_samples)
    if random_state is None:
        # Legacy path: honour whatever state the global RNG is in.
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)

    # Separate name so the fractional parameter is not shadowed by the count.
    n_test = int(n_samples * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Seed NumPy's global RNG before splitting: train_test_split shuffles with it,
# so without a seed every kernel restart yields a different split and
# different metrics.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# k-NN Classifier from scratch
class KNearestNeighbors:
    """k-nearest-neighbours classifier: Euclidean distance, majority vote.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours consulted for each prediction. If the training
        set holds fewer than ``k`` samples, all of them vote.

    Notes
    -----
    Distances are computed on the raw feature values, so features with large
    numeric ranges dominate; scale the features beforehand if that matters.
    """

    def __init__(self, k=3):
        if k < 1:
            raise ValueError(f'k must be at least 1, got {k}')
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples ``X`` (n_samples, n_features) and labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f'X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}'
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an ndarray with the predicted label of every row of ``X``."""
        predictions = [self._predict(x) for x in np.asarray(X, dtype=float)]
        return np.array(predictions)

    def _predict(self, x):
        """Predict the label of a single sample ``x``."""
        # One vectorised call gives the distance to every training sample.
        distances = self._euclidean_distance(self.X_train, x)
        # Stable sort: equidistant neighbours are taken in training order,
        # so results are deterministic.
        k_indices = np.argsort(distances, kind='stable')[:self.k]
        # Majority vote. np.unique sorts the labels, so a tied vote goes to
        # the smallest label (the same winner bincount().argmax() picks), and
        # labels need not be non-negative integers.
        labels, votes = np.unique(self.y_train[k_indices], return_counts=True)
        return labels[np.argmax(votes)]

    def _euclidean_distance(self, x1, x2):
        """Euclidean distance along the last axis, with broadcasting.

        Two 1-D vectors give a scalar; a (n, d) matrix against a (d,) vector
        gives the n row-wise distances.
        """
        diff = np.asarray(x1) - np.asarray(x2)
        return np.sqrt(np.sum(diff ** 2, axis=-1))

# Initialize and train the classifier (fit only stores the training data;
# the distance computations happen at prediction time)
knn = KNearestNeighbors(k=3)
knn.fit(X_train, y_train)

# Make predictions for every held-out sample
y_pred = knn.predict(X_test)

# Evaluate the classifier
def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like (lists included); they are converted
    to ndarrays first so the comparison is always element-wise. Two plain
    lists would otherwise compare as whole objects and yield 0 or 1/len.

    Returns 0.0 for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}'
        )
    if y_true.size == 0:
        return 0.0
    return float(np.mean(y_true == y_pred))

def classification_report(y_true, y_pred, class_labels):
    """Build a text report of per-class precision, recall and F1-score.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. They are converted to ndarrays so that
        ``== class_label`` is element-wise; with a plain list that comparison
        is a single False and every metric would come out as 0.
    class_labels : iterable
        Labels to report on, in output order.

    Returns
    -------
    str
        Two lines per class. A ratio with a zero denominator is reported as 0.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}'
        )

    lines = []
    for class_label in class_labels:
        is_true = y_true == class_label
        is_pred = y_pred == class_label
        tp = np.sum(is_true & is_pred)
        fp = np.sum(~is_true & is_pred)
        fn = np.sum(is_true & ~is_pred)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        lines.append(f'Class {class_label}:\n')
        lines.append(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n')
    return ''.join(lines)

# Score the held-out predictions: per-class metrics and overall accuracy
report = classification_report(y_test, y_pred, np.unique(y))
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# One hand-written sample to classify (the same values as the first row
# displayed by df.head())
new_data = np.array([
    [15231.0, 525.578979, 229.749878, 85.093788, 0.928882, 15617.0, 0.572896]
])

# Run the fitted k-NN model on the sample
new_predictions = knn.predict(new_data)

# Translate each integer code back into its class name via class_labels
new_predictions_decoded = list(map(lambda code: class_labels[code], new_predictions))

print('Predictions for new data:', new_predictions_decoded)


Accuracy: 0.889763779527559
Classification Report:
Class 0:
Precision: 0.88, Recall: 0.83, F1-Score: 0.86
Class 1:
Precision: 0.89, Recall: 0.93, F1-Score: 0.91

Predictions for new data: ['Cammeo']
