In [1]:
# from google.colab import drive
import numpy as np

# makes printing more human-friendly
np.set_printoptions(precision=3,suppress=True)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# a) Load data
from google.colab import drive

colab = True  # Set to True if using colab
if colab:
    # May require changing paths to file
    drive.mount('/content/drive')
    with open('/content/drive/MyDrive/Colab Notebooks/data/Ex_PC_data.csv', 'r') as f:
      data = np.genfromtxt(f,delimiter=',')
else:
    # May require changing paths to file
    with open('Ex_PC_data.csv', 'r') as f:
      data = np.genfromtxt(f,delimiter=',')

X = data[:,:-1]
y = data[:,-1]

unique_values, counts = np.unique(y, return_counts=True)
counts

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


array([59, 71, 48])

In [None]:
# b) number of samples, features dimension, the number of classes

num_samples = len(X)
num_feats = len(data[0]) - 1
num_classes = len(np.unique(y))
num_samples_per_class = {value: count for value, count in zip(unique_values, counts)}

print(f'num of samples: {num_samples}')
print(f'num of feature dimensions: {num_feats}')
print(f'num of classes: {num_classes}')
for cls in num_samples_per_class:
  print(f'class {cls} has {num_samples_per_class[cls]} samples')


num of samples: 178
num of feature dimensions: 13
num of classes: 3
class 1.0 has 59 samples
class 2.0 has 71 samples
class 3.0 has 48 samples


In [None]:
# c) check nan, data imputation
from sklearn.impute import KNNImputer

if np.sum(np.isnan(X)):
  print('Total of NaN before imputation:', np.sum(np.isnan(X)))
  X = KNNImputer(n_neighbors=2, weights="uniform").fit_transform(X)
  print('Total of NaN after imputation:', np.sum(np.isnan(X)))
else:
  print('no NaN')

Total of NaN before imputation: 6
Total of NaN after imputation: 0


#### Q) How are the missing values completed when using KNNImputer?
A) it seems like they're computed by using KNN. it uses the mean of the n_neighbors nearest neighbors from the training set (which was 2 in this case), and replaces the NaN value with that calculated mean.

In [None]:
# d) partition 80/20
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

print('training data size: ', X_train.shape[0])
print('testing data size: ', X_test.shape[0])

training data size:  142
testing data size:  36


In [None]:
# e) standardize to -5 to 5
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-5, 5))

X_train_min = np.min(X_train)
X_train_max = np.max(X_train)
X_train_standardized = scaler.fit_transform(X_train)
print('min training data in each dimension, after standardization:', np.min(X_train_standardized, axis=0))
print('max training data in each dimension, after standardization:', np.max(X_train_standardized, axis=0))

# Warning: When standardizing the test set, we should use statistics like min or max computed from the training set.
X_test_standardized = scaler.fit_transform(X_test)
print('min testing data in each dimension, after standardization:', np.min(X_test_standardized, axis=0))
print('max testing data in each dimension, after standardization:', np.max(X_test_standardized, axis=0))

min training data in each dimension, after standardization: [-5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.]
max training data in each dimension, after standardization: [5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]
min testing data in each dimension, after standardization: [-5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.]
max testing data in each dimension, after standardization: [5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]


In [None]:
# f) standardize to 0 mean, unit variance
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_mean = np.mean(X_train)
X_train_std = np.std(X_train)
X_train_standardized = scaler.fit_transform(X_train)
print('mean training data in each dimension, after standardization:', np.mean(X_train_standardized, axis=0))
print('std training data in each dimension, after standardization:', np.std(X_train_standardized, axis=0))

# Warning: When standardizing the test set, we should use statistics like min or max computed from the training set.
X_test_standardized = scaler.fit_transform(X_test)
print('mean testing data in each dimension, after standardization:', np.mean(X_test_standardized, axis=0))
print('std testing data in each dimension, after standardization:', np.std(X_test_standardized, axis=0))

mean training data in each dimension, after standardization: [ 0. -0.  0.  0. -0. -0. -0.  0. -0.  0. -0. -0.  0.]
std training data in each dimension, after standardization: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
mean testing data in each dimension, after standardization: [-0. -0. -0. -0.  0.  0. -0.  0. -0.  0. -0.  0.  0.]
std testing data in each dimension, after standardization: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [None]:
# g) k fold
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)

# Count the class distributions in each partition
for i, (train_index, val_index) in enumerate(kf.split(X_train)):
  print(f"Fold {i}:")
  print(f"  Train: index={train_index}")
  print(f"  Test:  index={val_index}")

Fold 0:
  Train: index=[ 29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46
  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64
  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82
  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100
 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
 137 138 139 140 141]
  Test:  index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28]
Fold 1:
  Train: index=[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  58  59  60  61  62  63  64
  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82
  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100
 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
 119 120 121 122 123 124

In [None]:
if colab:
    drive.flush_and_unmount()