In [9]:
import nibabel as nib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils import shuffle
from scipy.stats import uniform
from scipy.stats import randint
import cv2

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from extract_data import extract_data

np.random.seed(52384)

In [10]:
def pca_reduce(x_train, x_test, num_components):
  # Standardize the data
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(x_train)
  X_test_scaled = scaler.transform(x_test)

  # Perform PCA
  pca = PCA(n_components=num_components)
  X_train_pca = pca.fit_transform(X_train_scaled)
  X_test_pca = pca.transform(X_test_scaled)
  return X_train_pca, X_test_pca

## Object 0: scissor vs shoe

In [11]:
# extract train and test data
x_train, y_train, x_test, y_test = extract_data('../ds000105_R2.0.2', object=0)
x_train2 = x_train.reshape(40*64*x_train.shape[2], x_train.shape[3])
x_test2 = x_test.reshape(40*64*x_train.shape[2], x_test.shape[3])
x_train2 = x_train2.T
x_test2 = x_test2.T
mean_train = np.mean(x_train2, axis=0)
std_train = np.std(x_train2, axis=0)
x_train_norm = (x_train2 - mean_train) / std_train
x_test_norm = (x_test2 - mean_train) / std_train

In [12]:
# denoise: Gaussian smoothing
x_train_gs = cv2.GaussianBlur(x_train_norm, (5, 5), 0)
x_test_gs = cv2.GaussianBlur(x_test_norm, (5, 5), 0)
# PCA reduce order
# x_train_pca, x_test_pca=pca_reduce(x_train_gs,x_test_gs,150)
x_train_pca, x_test_pca=pca_reduce(x_train_norm,x_test_norm,150)

In [13]:
svm = SVC(random_state=42)
hyper_param_svm = {
    'C': np.logspace(-3, 5, 9),
    'kernel': ['poly','rbf'],
    'degree': [1, 2, 3, 4],
    'gamma': ['scale', 'auto'] + list(np.logspace(-8, 3, 12)),
}
model_cv_svm = RandomizedSearchCV(svm,
                        param_distributions = hyper_param_svm,
                        n_iter = 100,
                        scoring = 'accuracy',
                        cv = 10, 
                        n_jobs = -1)

In [14]:
# rf = RandomForestClassifier(random_state=42)
# hyper_param_rf = {
#     'n_estimators': randint(50, 500),
#     'max_depth': [None] + list(range(5, 30, 5)),
#     'min_samples_split': randint(2, 20),
#     'min_samples_leaf': randint(1, 20),
#     'max_features': ['sqrt', 'log2'] + [x_train_gs.shape[1]],
#     'criterion': ['gini', 'entropy'],
#     'bootstrap': [True, False]
# }
# model_cv_rf = RandomizedSearchCV(rf, 
#                         param_distributions = hyper_param_rf,
#                         n_iter = 100,
#                         scoring = 'accuracy',
#                         cv = 10, 
#                         n_jobs = -1)

In [15]:
knn = KNeighborsClassifier()
hyper_param_knn = {'n_neighbors': range(1,150,10),
                   'weights': ['uniform', 'distance']}
model_cv_knn = GridSearchCV(estimator=knn,
                        param_grid = hyper_param_knn,
                        scoring = 'accuracy', 
                        cv = 10, 
                        n_jobs = -1)

In [16]:
lr = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
hyper_param_lr = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],}
model_cv_lr = GridSearchCV(estimator=lr,
                        param_grid = hyper_param_lr,
                        scoring = 'accuracy', 
                        cv = 10, 
                        n_jobs = -1)

### 1. Gaussian smoothing

#### 1.1 SVM

In [17]:
search1 = model_cv_svm.fit(x_train_gs, y_train)
search1.best_params_

In [None]:
print('The best average accuracy over CV is:')
print(search1.cv_results_['mean_test_score'][search1.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.6308823529411764


In [None]:
model = SVC(kernel=search1.best_params_.get('kernel'), gamma=search1.best_params_.get('gamma'), 
            C=search1.best_params_.get('C'), degree=search1.best_params_.get('degree'), random_state=42)
model.fit(x_train_gs, y_train)

In [None]:
y_pred = model.predict(x_test_gs)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5660377358490566
              precision    recall  f1-score   support

         0.0       0.54      0.56      0.55        25
         1.0       0.59      0.57      0.58        28

    accuracy                           0.57        53
   macro avg       0.57      0.57      0.57        53
weighted avg       0.57      0.57      0.57        53



#### 1.2 KNN

In [None]:
search3 = model_cv_knn.fit(x_train_gs, y_train)
search3.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}

In [None]:
print('The best average accuracy over CV is:')
print(search3.cv_results_['mean_test_score'][search3.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.5709558823529413


In [None]:
model = KNeighborsClassifier(n_neighbors=search3.best_params_.get('n_neighbors'),
                           weights=search3.best_params_.get('weights'))
model.fit(x_train_gs, y_train)

In [None]:
y_pred = model.predict(x_test_gs)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6037735849056604
              precision    recall  f1-score   support

         0.0       0.58      0.60      0.59        25
         1.0       0.63      0.61      0.62        28

    accuracy                           0.60        53
   macro avg       0.60      0.60      0.60        53
weighted avg       0.60      0.60      0.60        53



#### 1.3 Logistic Regression

In [None]:
search10 = model_cv_lr.fit(x_train_gs, y_train)
search10.best_params_

{'C': 1000}

In [None]:
print('The best average accuracy over CV is:')
print(search10.cv_results_['mean_test_score'][search10.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.6128676470588236


In [None]:
model = LogisticRegression(C=search10.best_params_.get('C'), penalty='l1', solver='liblinear', random_state=42)
model.fit(x_train_gs, y_train)

In [None]:
y_pred = model.predict(x_test_gs)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6037735849056604
              precision    recall  f1-score   support

         0.0       0.58      0.60      0.59        25
         1.0       0.63      0.61      0.62        28

    accuracy                           0.60        53
   macro avg       0.60      0.60      0.60        53
weighted avg       0.60      0.60      0.60        53



### 2 PCA

#### 2.1 SVM

In [None]:
search4 = model_cv_svm.fit(x_train_pca, y_train)
search4.best_params_

{'kernel': 'poly', 'gamma': 1e-05, 'degree': 1, 'C': 100000.0}

In [None]:
print('The best average accuracy over CV is:')
print(search4.cv_results_['mean_test_score'][search4.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.9566176470588236


In [None]:
model = SVC(kernel=search4.best_params_.get('kernel'), gamma=search4.best_params_.get('gamma'), 
            C=search4.best_params_.get('C'), degree=search4.best_params_.get('degree'), random_state=42)
model.fit(x_train_pca, y_train)

In [None]:
y_pred = model.predict(x_test_pca)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9056603773584906
              precision    recall  f1-score   support

         0.0       0.86      0.96      0.91        25
         1.0       0.96      0.86      0.91        28

    accuracy                           0.91        53
   macro avg       0.91      0.91      0.91        53
weighted avg       0.91      0.91      0.91        53



#### 2.3 KNN

In [None]:
search6 = model_cv_knn.fit(x_train_pca, y_train)
search6.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}

In [None]:
print('The best average accuracy over CV is:')
print(search6.cv_results_['mean_test_score'][search6.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.9448529411764707


In [None]:
model = KNeighborsClassifier(n_neighbors=search6.best_params_.get('n_neighbors'),
                           weights=search6.best_params_.get('weights'))
model.fit(x_train_pca, y_train)

In [None]:
y_pred = model.predict(x_test_pca)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9245283018867925
              precision    recall  f1-score   support

         0.0       1.00      0.84      0.91        25
         1.0       0.88      1.00      0.93        28

    accuracy                           0.92        53
   macro avg       0.94      0.92      0.92        53
weighted avg       0.93      0.92      0.92        53



#### 2.4 Logistic Regression

In [None]:
search11 = model_cv_lr.fit(x_train_pca, y_train)
search11.best_params_

{'C': 10}

In [None]:
print('The best average accuracy over CV is:')
print(search11.cv_results_['mean_test_score'][search11.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.9444852941176471


In [None]:
model = LogisticRegression(C=search11.best_params_.get('C'), penalty='l1', solver='liblinear', random_state=42)
model.fit(x_train_pca, y_train)

In [None]:
y_pred = model.predict(x_test_pca)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8679245283018868
              precision    recall  f1-score   support

         0.0       0.85      0.88      0.86        25
         1.0       0.89      0.86      0.87        28

    accuracy                           0.87        53
   macro avg       0.87      0.87      0.87        53
weighted avg       0.87      0.87      0.87        53



### 3 No smoothing and PCA

#### 3.1 SVM

In [None]:
search7 = model_cv_svm.fit(x_train_norm, y_train)
search7.best_params_

{'kernel': 'poly', 'gamma': 0.01, 'degree': 3, 'C': 10.0}

In [None]:
print('The best average accuracy over CV is:')
print(search7.cv_results_['mean_test_score'][search7.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.9319852941176471


In [None]:
model = SVC(kernel=search7.best_params_.get('kernel'), gamma=search7.best_params_.get('gamma'), 
            C=search7.best_params_.get('C'), degree=search7.best_params_.get('degree'), random_state=42)
model.fit(x_train_norm, y_train)

In [None]:
y_pred = model.predict(x_test_norm)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9056603773584906
              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90        25
         1.0       0.90      0.93      0.91        28

    accuracy                           0.91        53
   macro avg       0.91      0.90      0.91        53
weighted avg       0.91      0.91      0.91        53



#### 3.2 KNN

In [None]:
search9 = model_cv_knn.fit(x_train_norm, y_train)
search9.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}

In [None]:
print('The best average accuracy over CV is:')
print(search9.cv_results_['mean_test_score'][search9.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.9507352941176471


In [None]:
model = KNeighborsClassifier(n_neighbors=search9.best_params_.get('n_neighbors'),
                           weights=search9.best_params_.get('weights'))
model.fit(x_train_norm, y_train)

In [None]:
y_pred = model.predict(x_test_norm)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9245283018867925
              precision    recall  f1-score   support

         0.0       1.00      0.84      0.91        25
         1.0       0.88      1.00      0.93        28

    accuracy                           0.92        53
   macro avg       0.94      0.92      0.92        53
weighted avg       0.93      0.92      0.92        53



#### 3.3 Logistic Regression

In [None]:
search12 = model_cv_lr.fit(x_train_norm, y_train)
search12.best_params_

{'C': 100}

In [None]:
print('The best average accuracy over CV is:')
print(search12.cv_results_['mean_test_score'][search12.cv_results_['rank_test_score']==1][0])

The best average accuracy over CV is:
0.9507352941176471


In [None]:
model = LogisticRegression(C=search12.best_params_.get('C'), penalty='l1', solver='liblinear', random_state=42)
model.fit(x_train_norm, y_train)

In [None]:
y_pred = model.predict(x_test_norm)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9245283018867925
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92        25
         1.0       0.93      0.93      0.93        28

    accuracy                           0.92        53
   macro avg       0.92      0.92      0.92        53
weighted avg       0.92      0.92      0.92        53

