### Data preparation

In [25]:
import pandas as pd
from pathlib import Path
import os
from sklearn.model_selection import train_test_split
import math
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [26]:
def preprocessing(old_csv, new_csv):
    directory = os.path.dirname(new_csv)
    
    def classification(age, wgt):
        if age < 30:
            return 'A' if 50 <= wgt <= 80 else 'B'
        else:
            return 'A' if 55 <= wgt <= 85 else 'B'

    df = pd.read_csv(old_csv)
    df['class label'] = df.apply(lambda row: classification(row['age'], row['wgt']), axis=1)
    
    df.to_csv(new_csv, index=False)
    
    label_a = df[df['class label'] == 'A']
    label_b = df[df['class label'] == 'B']
    
    #50 training data objects
    a_training = label_a.sample(n=25, random_state=42)
    b_training = label_b.sample(n=25, random_state=42)
    training_data = pd.concat([a_training, b_training])
    
    # 50 testing data objects
    testing_data = df.drop(training_data.index)
    
    training_labels = training_data['class label'].tolist()
    training_points = training_data[['age', 'wgt']].values
    test_points = testing_data[['age', 'wgt']].values
    true_labels = testing_data['class label'].tolist()
    
    return new_csv, training_points, test_points, training_labels, true_labels, training_data


In [27]:
def get_input_file_path():
    file_name = 'pre_homework4_445.csv'
    file_path = Path.home() / 'Desktop' / file_name
    return str(file_path)

def get_output_file_path():
    file_name = 'cs455_homework_4_MarinBatana-Woodruff_dataset.csv'
    file_path = Path.home() / 'Desktop' / file_name
    return str(file_path)

In [28]:
old_csv = get_input_file_path()
new_csv = get_output_file_path()
dataset, training_points, test_points, training_labels, true_labels, training_data = preprocessing(old_csv, new_csv)
print(f"Saved to: {dataset}")

Saved to: /Users/angelesmarin/Desktop/cs455_homework_4_MarinBatana-Woodruff_dataset.csv


### KNN Classification

In [29]:
def distance_euclidean(p, q):
	return math.sqrt((p[0]-q[0]) * (p[0]-q[0]) + (p[1]-q[1])*(p[1]-q[1]))

#### K=1 implementation 

In [30]:
def step_1(training_points, test_points):
    distances = [distance_euclidean(test_points, p) for p in training_points]
    return distances

In [31]:
def step_2(distances):
    distances_rank = sorted(range(len(distances)), key=lambda k: distances[k])
    return distances_rank

In [32]:
def step_3(distances_rank, training_labels, k):
    vote_for_a = 0
    vote_for_b = 0
    for i in range(k):
        if (training_labels[distances_rank[i]] == 'A'):
            vote_for_a = vote_for_a + 1
        else:
            vote_for_b = vote_for_b + 1
    if (vote_for_a > vote_for_b):
    	return 'A'
    else:
    	return 'B'

#### K=1 prediction  

In [33]:
def prediction_1NN(training_data, test_data, training_labels, k):
    prediction = []
    for p in test_data:
        distances = step_1(training_data, p)
        distances_rank = step_2(distances)
        prediction.append(step_3(distances_rank, training_labels, 1))
    return prediction

In [34]:
prediction = prediction_1NN(training_points, test_points, training_labels, 1)
print("Prediction:", prediction)

Prediction: ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B']


#### K=1 accuracy 

In [35]:
def accuracy(true_label, prediction):
    correct_a = correct_b = total_a = total_b = 0
    
    for i in range(len(true_label)):
        if true_label[i] == 'A':
            total_a += 1
            if prediction[i] == 'A':
                correct_a += 1
        elif true_label[i] == 'B':
            total_b += 1
            if prediction[i] == 'B':
                correct_b += 1
    
    accuracy_a = correct_a / total_a if total_a else 0
    accuracy_b = correct_b / total_b if total_b else 0
    average_accuracy = (correct_a + correct_b) / len(prediction)
    
    print(f"accuracy of class A: {accuracy_a * 100:.2f}%")
    print(f"accuracy of class B: {accuracy_b * 100:.2f}%")
    print(f"overall classification accuracy: {average_accuracy * 100:.2f}%")
    
    return accuracy_a, accuracy_b, average_accuracy

In [36]:
accuracy(training_labels, prediction)

accuracy of class A: 88.00%
accuracy of class B: 92.00%
overall classification accuracy: 90.00%


(0.88, 0.92, 0.9)

#### K=5 prediction  

In [37]:
def prediction_5NN(training_data, test_data, training_labels, k):
    prediction_5 = []
    for p in test_data:
        distances = step_1(training_data, p)
        distances_rank = step_2(distances)
        prediction_5.append(step_3(distances_rank, training_labels, 5))
    return prediction_5

In [38]:
prediction_5 = prediction_5NN(training_points, test_points, training_labels, 5)
print("Prediction:", prediction_5)

Prediction: ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'A', 'B', 'A', 'B', 'B', 'B', 'B', 'A', 'A']


#### K=5 accuracy 

In [39]:
def accuracy_5(true_label, prediction_5):
    correct_a = correct_b = total_a = total_b = 0
    
    for i in range(len(true_label)):
        if true_label[i] == 'A':
            total_a += 1
            if prediction_5[i] == 'A':
                correct_a += 1
        elif true_label[i] == 'B':
            total_b += 1
            if prediction_5[i] == 'B':
                correct_b += 1
    
    accuracy_a = (correct_a / total_a) if total_a else 0
    accuracy_b = (correct_b / total_b) if total_b else 0
    overall_accuracy = (correct_a + correct_b) / len(prediction_5)
    
    print(f"Accuracy for class A: {accuracy_a * 100:.2f}%")
    print(f"Accuracy for class B: {accuracy_b * 100:.2f}%")
    print(f"Overall accuracy: {overall_accuracy * 100:.2f}%")
    
    return accuracy_a, accuracy_b, overall_accuracy

In [40]:
accuracy(true_labels, prediction)

accuracy of class A: 88.00%
accuracy of class B: 92.00%
overall classification accuracy: 90.00%


(0.88, 0.92, 0.9)

### Decision Tree

In [41]:
x_train = training_data[['age', 'wgt']]
y_train = training_data['class label']
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
tree_predictions = clf.predict(test_points)
print("Decision Tree Prediction:", tree_predictions)

Decision Tree Prediction: ['A' 'A' 'A' 'A' 'A' 'A' 'A' 'B' 'B' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'B' 'B' 'B' 'B' 'A' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'A' 'B' 'A' 'B' 'B' 'B' 'B' 'B' 'B']




In [42]:
def tree_accuracy(true_label, tree_predictions):
    correct_a = correct_b = total_a = total_b = 0

    for i in range(len(true_label)):
        if true_label[i] == 'A':
            total_a += 1
            if tree_predictions[i] == 'A':
                correct_a += 1
        elif true_label[i] == 'B':
            total_b += 1
            if tree_predictions[i] == 'B':
                correct_b += 1

    accuracy_a = (correct_a / total_a) * 100 if total_a else 0
    accuracy_b = (correct_b / total_b) * 100 if total_b else 0
    overall_accuracy = (correct_a + correct_b) / len(tree_predictions) * 100

    print(f"Accuracy for class A: {accuracy_a:.2f}%")
    print(f"Accuracy for class B: {accuracy_b:.2f}%")
    print(f"Overall accuracy: {overall_accuracy:.2f}%")
    
    return accuracy_a, accuracy_b, overall_accuracy

In [43]:
accuracy_a, accuracy_b, overall_accuracy = tree_accuracy(true_labels, tree_predictions)

Accuracy for class A: 92.00%
Accuracy for class B: 88.00%
Overall accuracy: 90.00%
