In [1]:
from data_process import get_data, process_data
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

In [2]:
# Load both course tables; get_data() yields a (headers, data) pair for
# mathematics followed by one for Portuguese.
(mat_headers, mat_data), (por_headers, por_data) = get_data()

# Re-encode each table with process_data (per the original note, this turns
# every value into an integer).
por_data = process_data(por_data, por_headers)
mat_data = process_data(mat_data, mat_headers)

# The last three columns are used as the G1, G2 and G3 targets; every column
# before them is a feature.
por_x = por_data[:, :-3]
por_g1, por_g2, por_g3 = por_data[:, -3], por_data[:, -2], por_data[:, -1]

mat_x = mat_data[:, :-3]
mat_g1, mat_g2, mat_g3 = mat_data[:, -3], mat_data[:, -2], mat_data[:, -1]

In [8]:
def do_knn(X, y, train_size=0.7, random_state=2):
    """Fit KNN classifiers over a range of ``n_neighbors`` and keep the best one.

    The data is split once (shuffled) into train and test parts. A
    ``KNeighborsClassifier`` is fitted for every ``n_neighbors`` from 1 to
    ``len(X_train) - 1`` and scored with ``accuracy_score`` on the test part.
    The smallest ``n_neighbors`` reaching the highest accuracy wins.

    NOTE(review): ``n_neighbors`` is chosen using the same test split whose
    predictions and confusion matrix are returned, so accuracies derived from
    the result are optimistically biased. Selecting ``n_neighbors`` on a
    separate validation split (or by cross-validation on the training part)
    would remove that bias; it is left unchanged here to keep results stable.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 2
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    pred : array
        Test-set predictions of the most accurate model.
    conf : array
        ``confusion_matrix(y_test, pred)`` for those predictions.

    Raises
    ------
    ValueError
        If the training split is too small to try any ``n_neighbors`` value.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Largest neighbour count that is tried.
    max_neighbors = len(X_train) - 1
    if max_neighbors < 1:
        raise ValueError(
            "do_knn needs at least 2 training samples to search n_neighbors, "
            f"got {len(X_train)}"
        )

    best_accuracy = None
    best_pred = None
    for n_neighbors in range(1, max_neighbors + 1):
        model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
        y_predict = model.predict(X_test)
        acc = accuracy_score(y_test, y_predict)

        # Strict '>' keeps the first (smallest) n_neighbors on ties.
        if best_pred is None or acc > best_accuracy:
            best_accuracy = acc
            # Keep only the winning predictions: no need to hold every fitted
            # model in memory or to predict a second time afterwards.
            best_pred = y_predict

    conf = confusion_matrix(y_test, best_pred)
    return best_pred, conf

## Portuguese KNN Classification

In [4]:
# Run the KNN search once per Portuguese grade target (G1, G2, G3).
por_g1_predict, por_g1_conf = do_knn(por_x, por_g1)
por_g2_predict, por_g2_conf = do_knn(por_x, por_g2)
por_g3_predict, por_g3_conf = do_knn(por_x, por_g3)

# Predictions and confusion matrices for all 3 grades, ordered G1, G2, G3.
por_predictions = [por_g1_predict, por_g2_predict, por_g3_predict]
por_conf_mats = [por_g1_conf, por_g2_conf, por_g3_conf]


## Mathematics KNN Classification

In [5]:
# Run the KNN search once per Mathematics grade target (G1, G2, G3).
mat_g1_predict, mat_g1_conf = do_knn(mat_x, mat_g1)
mat_g2_predict, mat_g2_conf = do_knn(mat_x, mat_g2)
mat_g3_predict, mat_g3_conf = do_knn(mat_x, mat_g3)

# Predictions and confusion matrices for all 3 grades, ordered G1, G2, G3.
mat_predictions = [mat_g1_predict, mat_g2_predict, mat_g3_predict]
mat_conf_mats = [mat_g1_conf, mat_g2_conf, mat_g3_conf]


## Portuguese KNN Analysis

In [6]:
# Accuracy in percent: correctly classified samples (the confusion-matrix
# diagonal, i.e. its trace) over all test samples.
g1_acc = 100 * np.trace(por_conf_mats[0]) / por_conf_mats[0].sum()
g2_acc = 100 * np.trace(por_conf_mats[1]) / por_conf_mats[1].sum()
g3_acc = 100 * np.trace(por_conf_mats[2]) / por_conf_mats[2].sum()

print(
    "Portuguese Accuracies:",
    f"\tFor G1: {g1_acc}%",
    f"\tFor G2: {g2_acc}%",
    f"\tFor G3: {g3_acc}%",
    sep="\n",
)

Portuguese Accuracies:
	For G1: 18.46153846153846%
	For G2: 19.487179487179485%
	For G3: 23.58974358974359%


## Mathematics KNN Analysis

In [9]:
# Accuracy in percent: correctly classified samples (the confusion-matrix
# diagonal, i.e. its trace) over all test samples.
g1_acc = 100 * np.trace(mat_conf_mats[0]) / mat_conf_mats[0].sum()
g2_acc = 100 * np.trace(mat_conf_mats[1]) / mat_conf_mats[1].sum()
g3_acc = 100 * np.trace(mat_conf_mats[2]) / mat_conf_mats[2].sum()

print(
    "Mathematics Accuracies:",
    f"\tFor G1: {g1_acc}%",
    f"\tFor G2: {g2_acc}%",
    f"\tFor G3: {g3_acc}%",
    sep="\n",
)

Mathematics Accuracies:
	For G1: 15.966386554621849%
	For G2: 13.445378151260504%
	For G3: 30.252100840336134%
