# group k-folds
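This notebook evaluates several classifiers using group k-fold cross-validation, a k-fold iterator variant with non-overlapping groups: all samples that share a group label are kept together in the same fold.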

**Step 1:** Load the dataset's phenotypic data into a `pandas` dataframe and extract the `SITE_ID` of every subject. These per-subject site labels are the groups used for splitting.

In [None]:
#Import modules for this step
from nilearn import datasets
import pandas as pd
import os

#Fetch data using nilearn.datasets.fetch
abide = datasets.fetch_abide_pcp(data_dir=os.path.join(os.sep,"path/to/data"),
                                 pipeline="cpac",
                                 quality_checked=True)

#Load phenotypic data into pandas dataframe
abide_pheno = pd.DataFrame(abide.phenotypic)

#Build one site label per subject (same order as the phenotypic table).
#The group-aware splitter needs a label for every sample, not the set of
#unique sites, so duplicates are kept on purpose.
#SITE_ID values may be byte strings or already-decoded text depending on how
#the phenotypic table was loaded; only byte strings are decoded so both work.
groups = [
    s.decode() if isinstance(s, bytes) else str(s)
    for s in abide_pheno.SITE_ID
]

**Step 2:** Define the dataset split using built-in `scikit-learn` methods. In this case, we are using `sklearn.model_selection.GroupKFold`. 

In [None]:
#Import modules 
import numpy as np 
from sklearn.model_selection import GroupKFold
import prepare_data
import os

#Data and output locations (the output directory is the data directory itself)
output_dir = data_dir = os.path.join(os.sep, "path/to/data")

#Obtain the arrays used for training from the project helper
#(presumably X holds the features and y the class labels -- confirm in prepare_data)
X, y = prepare_data.prepare_data(data_dir, output_dir)

#Group-aware 10-fold splitter; the final expression shows the number of splits
logo = GroupKFold(n_splits=10)
logo.get_n_splits(X=X, y=y, groups=groups)

**Step 3:** Choose which machine learning classifier to use. We will try four different classifiers in this notebook.

_**Step 3.1:**_ Support Vector Machines (SVM) - `LinearSVC`

In [3]:
from sklearn.svm import LinearSVC
import statistics
print("----------------------------------------------------")
print("GroupKFold with Linear Support Vector Classification")
print("----------------------------------------------------")

#Fix the seed: LinearSVC's solver uses a pseudo random number generator, so
#without random_state the fold accuracies can differ from run to run.
l_svc = LinearSVC(max_iter=10000, random_state=0)

#Fit on each split's training groups and score on its held-out groups.
#Folds are numbered from 1 for the progress messages.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Training model ",count)
    l_svc.fit(X_train,y_train)
    acc_score = l_svc.score(X_test, y_test)
    accuracy.append(acc_score)

print("Finished training.\n")

#Mean accuracy of self.predict(X) with regard to y for each model
for index, a in enumerate(accuracy, start=1):
    print("Accuracy score for model", index, " ", a)

#Report the average, best and worst accuracy over all models
print("\nAverage accuracy score for all models: ", statistics.mean(accuracy))
print("Maximum accuracy score of all models: ", max(accuracy))
print("Minimum accuracy score of all models: ", min(accuracy))

----------------------------------------------------
GroupKFold with Linear Support Vector Classification
----------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Finished training.

Accuracy score for model 1   0.6046511627906976
Accuracy score for model 2   0.686046511627907
Accuracy score for model 3   0.6666666666666666
Accuracy score for model 4   0.5822784810126582
Accuracy score for model 5   0.72
Accuracy score for model 6   0.5277777777777778
Accuracy score for model 7   0.676056338028169
Accuracy score for model 8   0.6956521739130435
Accuracy score for model 9   0.6626506024096386
Accuracy score for model 10   0.5232558139534884

Average accuracy score for all models:  0.6345035528180046
Maximum accuracy score of all models:  0.72
Minimum accuracy score of all models:  0.5232558139534884


_**Step 3.2:**_ _k_-Nearest Neighbors - `KNeighborsClassifier`

In [4]:
from sklearn.neighbors import KNeighborsClassifier
import statistics
print("--------------------------------------------------")
print("GroupKFold with K-Nearest Neighbors Classification")
print("--------------------------------------------------")

knn = KNeighborsClassifier()

#Cross-validate: one fit per split, scored on that split's held-out samples.
#Folds are numbered from 1 for the progress messages.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    acc_score = knn.fit(X_train, y_train).score(X_test, y_test)
    accuracy.append(acc_score)

print("Finished training.\n")

#Per-model accuracy (mean accuracy of the predictions on the held-out samples)
for index, a in enumerate(accuracy, start=1):
    print("Accuracy score for model", index, " ", a)

#Summary over all models: mean, best and worst accuracy
print("\nAverage accuracy score for all models: ", statistics.mean(accuracy))
print("Maximum accuracy score of all models: ", max(accuracy))
print("Minimum accuracy score of all models: ", min(accuracy))

--------------------------------------------------
GroupKFold with K-Nearest Neighbors Classification
--------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Finished training.

Accuracy score for model 1   0.5872093023255814
Accuracy score for model 2   0.4883720930232558
Accuracy score for model 3   0.48717948717948717
Accuracy score for model 4   0.5443037974683544
Accuracy score for model 5   0.6266666666666667
Accuracy score for model 6   0.5694444444444444
Accuracy score for model 7   0.5633802816901409
Accuracy score for model 8   0.6666666666666666
Accuracy score for model 9   0.4939759036144578
Accuracy score for model 10   0.4883720930232558

Average accuracy score for all models:  0.5515570736102311
Maximum accuracy score of all models:  0.6666666666666666
Minimum accuracy score of all models:  0.48717948717948717

_**Step 3.3:**_ Decision Tree - `DecisionTreeClassifier`

In [5]:
from sklearn.tree import DecisionTreeClassifier
import statistics
print("--------------------------------------------")
print("GroupKFold with Decision Tree Classification")
print("--------------------------------------------")

#Fix the seed: tree construction involves randomness (see the random_state
#parameter), so without it the fold accuracies can differ from run to run.
dt = DecisionTreeClassifier(random_state=0)

#Fit on each split's training groups and score on its held-out groups.
#Folds are numbered from 1 for the progress messages.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Training model ",count)
    dt.fit(X_train,y_train)
    acc_score = dt.score(X_test, y_test)
    accuracy.append(acc_score)

print("Finished training.\n")

#Mean accuracy of self.predict(X) with regard to y for each model
for index, a in enumerate(accuracy, start=1):
    print("Accuracy score for model", index, " ", a)

#Report the average, best and worst accuracy over all models
print("\nAverage accuracy score for all models: ", statistics.mean(accuracy))
print("Maximum accuracy score of all models: ", max(accuracy))
print("Minimum accuracy score of all models: ", min(accuracy))

--------------------------------------------
GroupKFold with Decision Tree Classification
--------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Finished training.

Accuracy score for model 1   0.5697674418604651
Accuracy score for model 2   0.5697674418604651
Accuracy score for model 3   0.44871794871794873
Accuracy score for model 4   0.5443037974683544
Accuracy score for model 5   0.5333333333333333
Accuracy score for model 6   0.5
Accuracy score for model 7   0.5352112676056338
Accuracy score for model 8   0.5507246376811594
Accuracy score for model 9   0.5662650602409639
Accuracy score for model 10   0.6162790697674418

Average accuracy score for all models:  0.5434369998535765
Maximum accuracy score of all models:  0.6162790697674418
Minimum accuracy score of all models:  0.44871794871794873


_**Step 3.4:**_ Random Forests - `RandomForestClassifier`

In [6]:
from sklearn.ensemble import RandomForestClassifier
import statistics
print("--------------------------------------------")
print("GroupKFold with Random Forest Classification")
print("--------------------------------------------")

#Fix the seed: a random forest bootstraps samples and subsamples features, so
#without random_state the fold accuracies differ from run to run.
rf = RandomForestClassifier(random_state=0)

#Fit on each split's training groups and score on its held-out groups.
#Folds are numbered from 1 for the progress messages.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Training model ",count)
    rf.fit(X_train,y_train)
    acc_score = rf.score(X_test, y_test)
    accuracy.append(acc_score)

print("Finished training.\n")

#Mean accuracy of self.predict(X) with regard to y for each model
for index, a in enumerate(accuracy, start=1):
    print("Accuracy score for model", index, " ", a)

#Report the average, best and worst accuracy over all models
print("\nAverage accuracy score for all models: ", statistics.mean(accuracy))
print("Maximum accuracy score of all models: ", max(accuracy))
print("Minimum accuracy score of all models: ", min(accuracy))

--------------------------------------------
GroupKFold with Random Forest Classification
--------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Finished training.

Accuracy score for model 1   0.5930232558139535
Accuracy score for model 2   0.6046511627906976
Accuracy score for model 3   0.3974358974358974
Accuracy score for model 4   0.5189873417721519
Accuracy score for model 5   0.4666666666666667
Accuracy score for model 6   0.5416666666666666
Accuracy score for model 7   0.5774647887323944
Accuracy score for model 8   0.5217391304347826
Accuracy score for model 9   0.5301204819277109
Accuracy score for model 10   0.5116279069767442

Average accuracy score for all models:  0.5263383299217665
Maximum accuracy score of all models:  0.6046511627906976
Minimum accuracy score of all models:  0.3974358974358974
