In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Q1

In [2]:
# Load dataset
w3classif = pd.read_csv('w3classif.csv')

In [3]:
n_Shuffles = 10

def create_train_test_data(test_size=0.3):
  # For storing data
  trains, tests = [], []

  for i in range(n_Shuffles):
    train_data, test_data = train_test_split(w3classif, test_size=test_size, shuffle=True)

    # Store data
    trains.append((train_data.iloc[:, 0:-1], train_data.iloc[:, -1]))
    tests.append((test_data.iloc[:, 0:-1], test_data.iloc[:, -1]))

  return trains, tests

# Q2

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [5]:
def repeat_knn(trials=10, test_size=0.3):

  # For storing
  train_losses, test_losses = [], []

  # Create train and test datasets (calling the function you created previously)
  trains, tests = create_train_test_data(test_size)

  for i in range(trials):
    X_train, y_train = trains[i]
    X_test, y_test = tests[i]

    
    knn_classif = KNeighborsClassifier(n_neighbors=3)

    knn_classif.fit(X_train, y_train)

    train_pred = knn_classif.predict(X_train)
    test_pred = knn_classif.predict(X_test)

    train_acc = accuracy_score(y_pred=train_pred, y_true=y_train)
    test_acc = accuracy_score(y_pred=test_pred, y_true=y_test)

    train_loss = 1 - train_acc
    test_loss = 1 - test_acc

    train_losses.append(train_loss)
    test_losses.append(test_loss)


  return train_losses, test_losses

In [6]:
# Print the average training and test losses for 10 trials using the function implemented above
train_losses, test_losses = repeat_knn(trials=10, test_size=0.3)
print(f'Avg Training Loss (Misclassification Rate): {np.array(train_losses).mean() * 100:.2f}%')
print(f'Avg Test Loss (Misclassification Rate): {np.array(test_losses).mean() * 100:.2f}%')

Avg Training Loss (Misclassification Rate): 2.71%
Avg Test Loss (Misclassification Rate): 4.33%


# Q3

In [7]:
test_sizes = [0.1, 0.3, 0.5, 0.7]

for ts in test_sizes:
  train_losses, test_losses = repeat_knn(trials=10, test_size=ts)
  print(f'\tfor a test size that is {ts * 100}% of the dataset')
  print(f'\tAvg Training Loss (Misclassification Rate): {np.array(train_losses).mean() * 100:.2f}%')
  print(f'\tAvg Test Loss (Misclassification Rate): {np.array(test_losses).mean() * 100:.2f}%')
  print(f'<================================================================>')

	for a test size that is 10.0% of the dataset
	Avg Training Loss (Misclassification Rate): 2.72%
	Avg Test Loss (Misclassification Rate): 4.50%
	for a test size that is 30.0% of the dataset
	Avg Training Loss (Misclassification Rate): 2.82%
	Avg Test Loss (Misclassification Rate): 4.67%
	for a test size that is 50.0% of the dataset
	Avg Training Loss (Misclassification Rate): 2.75%
	Avg Test Loss (Misclassification Rate): 4.60%
	for a test size that is 70.0% of the dataset
	Avg Training Loss (Misclassification Rate): 2.83%
	Avg Test Loss (Misclassification Rate): 4.61%


# Q4

In [8]:
test_sizes = [0.1, 0.3, 0.5, 0.7]

for ts in test_sizes:
  # TODO: Calculate the sample standard deviation of your training and test set error values over the 10 trials 
  train_losses, test_losses = repeat_knn(trials=10, test_size=ts)
  print(f'\tfor a test size that is {ts * 100}% of the dataset')
  print(f'\tStandard Deviation of train set (Misclassification Rate): {np.array(train_losses).std():.2f}')
  print(f'\tStandard Deviation of test set (Misclassification Rate): {np.array(test_losses).mean():.2f}')
  print(f'<================================================================>')

	for a test size that is 10.0% of the dataset
	Standard Deviation of train set (Misclassification Rate): 0.00
	Standard Deviation of test set (Misclassification Rate): 0.05
	for a test size that is 30.0% of the dataset
	Standard Deviation of train set (Misclassification Rate): 0.01
	Standard Deviation of test set (Misclassification Rate): 0.04
	for a test size that is 50.0% of the dataset
	Standard Deviation of train set (Misclassification Rate): 0.01
	Standard Deviation of test set (Misclassification Rate): 0.04
	for a test size that is 70.0% of the dataset
	Standard Deviation of train set (Misclassification Rate): 0.01
	Standard Deviation of test set (Misclassification Rate): 0.05


# Q5

In [9]:
from sklearn.model_selection import cross_val_score

shuffle_w3classif = w3classif.sample(frac=1, random_state=42).reset_index()
X = shuffle_w3classif.iloc[:, :-1]
y = shuffle_w3classif.iloc[:, -1]

knn_classif = KNeighborsClassifier(n_neighbors=3)

num_folds = 10
cross_validation = cross_val_score(estimator=knn_classif, X=X, y=y, cv=num_folds)

print(f"{cross_validation.mean()}")
print(f"{cross_validation.std()}")

0.9949999999999999
0.010000000000000009
