<a href="https://colab.research.google.com/github/abdalrahman9/back_physio_imu_ml/blob/main/Analysis/Main_ML_Code_Physio_Back.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Data

In [None]:
from matplotlib import interactive, pyplot as plt
import numpy as np
import math #needed for definition of pi
import pandas as pd
import time, datetime, csv, signal
import os
import re
from seglearn.transform import InterpLongToWide
from seglearn.transform import FeatureRep, Segment, Interp
from seglearn.pipe import Pype
from bokeh.models import ColumnDataSource, Grid, LinearAxis, MultiLine, Plot, Range1d, LayoutDOM



def signal_dataframe(folder):
    """Load, resample and merge all sensor recordings stored in ``folder``.

    For each of the eight sensors (module-level MAC addresses ``A`` .. ``H``)
    the accelerometer, gyroscope, magnetometer, quaternion and pressure CSV
    files are read, interpolated with a 1/25 sample period (25 Hz) and merged
    on ``'epoch (ms)'``. The eight per-sensor frames are then merged into one
    wide frame.

    Parameters
    ----------
    folder : str
        '/'-separated recording folder ending with '/'. Path components 1-4
        are joined with '_' to form the file-name prefix. When one component
        is ``'reps'``, component 6 is appended to the file name as a suffix.

    Returns
    -------
    numpy.ndarray
        One row per merged sample with NaN rows removed. Column 0 is the
        epoch relative to the first retained row, divided by 1000; the
        remaining columns are the signal channels of sensors A to H.

    Raises
    ------
    ValueError
        If no complete (NaN-free) row is left after merging.
    """
    items = folder.split('/')
    session_name = "_".join([items[1], items[2], items[3], items[4]])

    # The file-name suffix depends only on the folder, so compute it once
    # instead of once per sensor.
    ending = ("_" + items[6] + ".csv") if 'reps' in items else ".csv"

    # File-name tags of the five signal types, in merge order
    signal_tags = ('_acc', '_gyro', '_mag', '_quat', '_pres')

    full_df = None
    for mac, sensor_name in zip([A, B, C, D, E, F, G, H], 'ABCDEFGH'):
        file_prefix = folder + session_name + '_' + mac.replace(':', '')

        resampled = []
        for tag in signal_tags:
            raw = pd.read_csv(file_prefix + tag + ending)

            # Drop the wall-clock time column
            raw = raw.drop(columns=['time (-07:00)'])

            # Switch order of elapsed time and epoch: the leading column is
            # the time axis handed to Interp, and the output below has one
            # column fewer (``cols[1:]``).
            cols = list(raw.columns)
            cols[0], cols[1] = cols[1], cols[0]
            raw = raw[cols]

            # Interpolate the data to have all signals at 25 Hz
            clf = Pype([('interp', Interp(1. / 25., categorical_target=True))])
            values, _ = clf.fit_transform([raw.to_numpy()], [0])
            resampled.append(pd.DataFrame(data=values[0], columns=cols[1:]))

        # Synchronise and merge the five signals of this sensor
        df = resampled[0]
        for other in resampled[1:]:
            df = pd.merge_asof(df, other, on='epoch (ms)')

        # Add the sensor name to each of its channel columns
        renames = {axis: axis + " - " + sensor_name
                   for labels in sensor_type_col_labels
                   for axis in labels}
        df = df.rename(columns=renames)

        if full_df is None:
            full_df = df
        else:
            full_df = pd.merge_asof(full_df, df, on='epoch (ms)')

    # Remove any NA rows and reset the index before returning the array
    full_df = full_df.dropna().reset_index(drop=True)
    if full_df.empty:
        raise ValueError("No complete samples left after merging " + folder)

    # Tare epochs to the first entry (epoch[0] = 0) and divide by 1000
    # (milliseconds to seconds, going by the column label)
    epoch = full_df['epoch (ms)']
    full_df['epoch (ms)'] = (epoch - epoch.iloc[0]) / 1000

    # Convert dataframe to ndarray and return it
    return full_df.to_numpy()


################################### MAIN CODE ######################################
#1 Lying Down
#2 Sustained Extension
#4 Sagittal Extension
#5 Extension in Standing
#6 Flexion in Lying
#7 Flexion in Sitting
#8 Flexion in Standing
#9 Side Glide in Lying
#10 Side Glide in Standing
#11 Rotation Flexion Knees Together
#12 Rotation Flexion One leg stretch
#13 Other

### Module-level configuration shared by the import code ###

# MAC address of every sensor, named by its single-letter label
A = "C7:E1:38:1F:C0:DE"
B = "F4:04:52:A2:CB:59"
C = "E3:62:1F:8B:81:B7"
D = "E8:9C:A5:A3:8A:60"
E = "F9:0E:1C:DA:D4:1D"
F = "CD:A5:4D:78:A1:B4"
G = "EF:AA:47:DC:45:44"
H = "CD:78:F1:6B:D8:67"

# Column names of the channels of each signal type
acc_col_label = [axis + '-Axis (g)' for axis in 'XYZ']
gyro_col_label = [axis + '-Axis (deg/s)' for axis in 'XYZ']
mag_col_label = [axis + '-Axis (T)' for axis in 'XYZ']
pres_col_label = ['Pressure (Pa)']
# Quaternion components: W plus the i, j, k axes
quat_col_label = ['W-Axis'] + [axis + '-Axis (' + unit + ')'
                               for axis, unit in zip('XYZ', 'ijk')]
sensor_type_col_labels = [acc_col_label, gyro_col_label, mag_col_label,
                          quat_col_label, pres_col_label]

# Participants whose data is to be analysed (p0 .. p18)
subj_names = ['p' + str(idx) for idx in range(19)]

# Dict that will hold the imported data; every key starts as its own empty list
data = {key: [] for key in ('X', 'Subject', 'Exercise', 'Side', 'Rep',
                            'Sensor', 'Signal', 'Axis', 'Y')}

# Root folder of the recordings
directory = "Data/"

#Import the relevant data files into the data dict

# Folders inside a session that are skipped (e.g. tare & test folders).
# Edit this set to choose which data is loaded (all data, posture only or
# exercises only); e.g. add 'pg', 'pfg', 'pb' to drop those folders as well.
excluded_folders = {'tar', 'test', 'offset',
                    'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'random'}

# Exercises that are recorded once per side (L/R)
two_sided_exercises = ('e3', 'e4', 'e7')


def _subfolders(path):
    """Return the names of the directories directly inside ``path`` (ends with '/')."""
    return [name for name in os.listdir(path) if os.path.isdir(path + name)]


def _side_of(round_name):
    """Map a round folder name to its 'Left' / 'Right' / 'NA' side label."""
    if 'l' in round_name:
        return 'Left'
    if round_name.count('r') == 2:
        return 'Right'
    return 'NA'


def _store_recording(folder, subj, exercise, round_name, rep):
    """Load the recording in ``folder`` and append it with its tags to ``data``."""
    data['X'].append(signal_dataframe(folder))
    data['Subject'].append(re.findall(r"\d+", subj)[0])
    data['Exercise'].append(exercise)
    data['Rep'].append(rep)
    data['Side'].append(_side_of(round_name))
    print("Saved " + folder)


for subj in subj_names:
    subj_folder = directory + subj + "/"

    # Use only the newest session. Non-directory entries are filtered out
    # BEFORE taking max(), so a stray file cannot hide the session folders.
    # NOTE(review): max() compares the names as strings, so "newest" relies
    # on folder names that sort chronologically -- confirm the naming scheme.
    sessions = _subfolders(subj_folder)
    if not sessions:
        print("Skipped " + subj_folder + " (no session folder)")
        continue
    session_folder = subj_folder + max(sessions) + "/"

    # Loop through each exercise folder except for the excluded ones
    for exercise in _subfolders(session_folder):
        if exercise in excluded_folders:
            continue
        exercise_folder = session_folder + exercise + "/"

        # Extract all round folders and select the newest round
        round_folders = _subfolders(exercise_folder)
        if not round_folders:
            print("Skipped " + exercise_folder + " (no round folder)")
            continue
        if exercise in two_sided_exercises:
            # Two-sided exercise: newest round of each side that was recorded
            left = [r for r in round_folders if 'l' in r]
            right = [r for r in round_folders if r not in left]
            round_folders = [max(side) for side in (left, right) if side]
        else:
            # One-sided exercise
            round_folders = [max(round_folders)]

        for rounds in round_folders:
            round_folder = exercise_folder + rounds + "/"

            if os.path.isdir(round_folder + "reps"):
                # One recording per repetition folder; folder names are
                # 0-based integers, stored rep numbers are 1-based
                reps_folder = round_folder + "reps/"
                for reps in _subfolders(reps_folder):
                    _store_recording(reps_folder + reps + "/", subj,
                                     exercise, rounds, int(reps) + 1)
            else:
                # Exercises with no reps (ex. pg & e1): a single recording
                _store_recording(round_folder, subj, exercise, rounds, 1)

#Stack the labels column-wise (Subject, Exercise, Side, Rep) under one key for easier manipulation later on
data['Y'] = np.column_stack([data[key] for key in ('Subject', 'Exercise', 'Side', 'Rep')])

#Report how many entries each of the important lists holds
print("Finished Saving Files")

print("# X " + str(len(data['X'])))
for key in ('Side', 'Subject', 'Exercise', 'Rep'):
    print("# " + key + " " + str(len(data[key])))

In [None]:
#Persist the data dict in the working directory so the import step can be skipped later on
np.save(file='posture_data.npy', arr=data)

In [None]:
#Reload the data dict from a previously saved file (the file must already exist).
#The dict is stored pickled inside the .npy file, so only load files you created yourself.
data = np.load('posture_data.npy', allow_pickle=True).item()

## Feature Selection

In [None]:
import seglearn as sgl

#Start from every feature function of the seglearn library
d = sgl.feature_functions.all_features()

#Remove the features that are not applicable or don't work:
#  hmean   - all elements have to be positive to calc the harmonic mean -- condition not satisfied
#  gmean   - all elements have to be positive to calc the geometric mean -- condition not satisfied
#  emg_var - EMG var not applicable for data set/problem (IMU data)
for unusable in ('hmean', 'gmean', 'emg_var'):
    del d[unusable]

In [None]:
#These two functions should work but raise a "dividing by zero" error .. trying to fix
for broken in ('corr', 'hist4'):
    del d[broken]  #func not working....

In [None]:
def corr2(X):
    """ computes correlations between all variable pairs in a segmented time series

    Parameters
    ----------
    X : array-like
        Segmented time series. It is promoted with ``np.atleast_3d`` and then
        read as (segments, samples, variables).

    Returns
    -------
    numpy.ndarray
        Array of shape (N, D * (D - 1) / 2) holding, for each of the N
        segments, the upper-triangle (k=1) entries of its D x D correlation
        matrix. When there is only one variable (D == 1), a zero vector of
        shape (N,) is returned instead.

    .. note:: this feature is expensive to compute with the current implementation, and cannot be
    used with univariate time series

    .. note:: a variable that is constant within a segment has zero variance, for which
    ``np.corrcoef`` produces NaN coefficients (with a division warning)
    """
    X = np.atleast_3d(X)
    N = X.shape[0]
    D = X.shape[2]

    if D == 1:
        # The ``np.float`` alias was removed in NumPy 1.24; the builtin
        # ``float`` selects the same float64 dtype on every NumPy version.
        return np.zeros(N, dtype=float)

    trii = np.triu_indices(D, k=1)
    r = np.zeros((N, len(trii[0])))
    for i in range(N):
        # correlate the ith window of every signal; the matrix is DxD
        rmat = np.corrcoef(X[i].transpose())
        r[i] = rmat[trii]
    return r

#Register the altered version under the key of the old corr function
d.update(corr=corr2)

## Pipeline Creation

Creation of sensor/signal selection class

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

#Class for selecting specific Sensors and Signals from the original data
class Sensor_Select(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the selected sensors and signal types.

    Every entry of the data is indexed as a 2-D array of 113 channel columns:
    column 0 is the epoch column, followed by 14 channels for each of the
    sensors A .. H.

    Parameters
    ----------
    sensors : sequence of str
        Sensor labels to keep, any of 'A' .. 'H'.
    signals : sequence of str
        Signal types to keep, any of 'acc', 'gyro', 'mag', 'quat', 'pres'.
    """
    # Index of the first column of each sensor within the 113 channel columns
    # (originally 112 but the epoch column was added, thus 113)
    A = 1
    B = 15
    C = 29
    D = 43
    E = 57
    F = 71
    G = 85
    H = 99

    # Relative indices of each signal type within a sensor segment
    # -> 14 channels within each sensor
    acc = [0, 1, 2]
    gyro = [3, 4, 5]
    mag = [6, 7, 8]
    quat = [9, 10, 11, 12]
    pres = [13]

    def __init__(self, sensors=(), signals=()):
        # Immutable defaults: a mutable default list would be shared by
        # every instance created without arguments.
        print("Entered Init")
        self.sensors = sensors
        self.signals = signals

        # Indices resolved from the selection; rebuilt on every transform
        self.sensors_int = []
        self.signals_int = []

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        print("Entered Fit")
        return self

    def transform(self, X, y=None):
        """Return the entries of ``X`` reduced to the selected columns.

        The result is a list with one array per entry of ``X``. Column 0 (the
        epoch column) is always kept; the selected columns follow in sensor
        order A .. H and, within a sensor, in the order acc, gyro, mag, quat,
        pres.
        """
        print("Entered transform")

        # Rebuild both index lists from scratch. Extending the lists kept on
        # the instance would add the same indices again on every call and
        # duplicate the output columns.
        sensor_starts = zip('ABCDEFGH',
                            [self.A, self.B, self.C, self.D,
                             self.E, self.F, self.G, self.H])
        self.sensors_int = [start for dev, start in sensor_starts
                            if dev in self.sensors]

        signal_offsets = zip(['acc', 'gyro', 'mag', 'quat', 'pres'],
                             [self.acc, self.gyro, self.mag, self.quat, self.pres])
        self.signals_int = [offset for ch, offsets in signal_offsets
                            if ch in self.signals
                            for offset in offsets]

        # Combine sensor and signal indices. Column 0 is included by default
        # because it is the epoch column and must be kept no matter which
        # sensors/signals are selected.
        Filtered_Indices = [0] + [i + j
                                  for i in self.sensors_int
                                  for j in self.signals_int]

        # Retrieve the desired columns from every entry of the data
        return [entry[:, Filtered_Indices] for entry in X]

Running Pipe

In [None]:
import seglearn as sgl
from sklearn.preprocessing import StandardScaler

#Assemble the pype from its main processing steps, in order:
#sensor/signal selection -> interpolation -> segmentation -> feature extraction -> scaling
clf = sgl.Pype([
    ('sensors', Sensor_Select(sensors=list('ABCDEFGH'),
                              signals=['acc', 'gyro', 'mag', 'quat', 'pres'])),
    ('interp', Interp(1. / 25., categorical_target=True)),
    ('segment', sgl.Segment(width=25, overlap=0.3)),
    ('ftr', sgl.FeatureRep(features=d)),
    ('scaler', StandardScaler()),
])



In [None]:
#Run the data through the pipe
# X receives the transformed data and y the matching labels. The cells below
# use y[:, 0] (subject) as the cross-validation group and y[:, 1] (exercise)
# as the classification target.
X, y = clf.fit_transform(data['X'], data['Y'])

  Xt = np.array([sliding_tensor(Xt[i], self.width, self._step, self.order)


## Cross Validation and Grid Searching

In [None]:
#Import important libraries and functions
import itertools
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import seglearn as sgl
from seglearn.transform import FeatureRep, Segment, Interp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.python.keras.layers import Dense, LSTM, Conv1D
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.python.keras.utils import np_utils
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, log_loss
from sklearn.metrics import classification_report, precision_recall_fscore_support
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import f1_score, make_scorer, balanced_accuracy_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit,
                                     StratifiedKFold, GroupShuffleSplit,
                                     GroupKFold, StratifiedShuffleSplit)
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Define 2 main Functions used for Grid Searching and Cross Validation

#1 - The plot_confusion_matrix function takes a confusion matrix and its class
# labels and draws the matrix as an annotated heat map -- originally implemented
# by David Burns in his seglearn library
def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          cmap=plt.cm.Blues):
    """ plots confusion matrix

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are plotted as true labels and columns
        as predicted labels.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool
        When True every row is divided by its sum, so each cell shows the
        fraction of that row.
    cmap : matplotlib colormap
        Colormap of the heat map.
    """
    if normalize:
        # A row that sums to zero is left as all zeros rather than turned
        # into NaN by a division by zero. NaN cells would also make the
        # colour threshold below NaN.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black text
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()


#2 - The plot_grid_search function takes the results of a grid search in which only
# 2 hyperparameters were tuned and plots the relationship between those 2 hyperparameters
# and the model's average CV score
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    """Plot the mean CV test score of a two-parameter grid search.

    Parameters
    ----------
    cv_results : mapping
        Grid-search results; only ``'mean_test_score'`` is read.
    grid_param_1, grid_param_2 : sequence
        The values searched for each of the two parameters.
    name_param_1, name_param_2 : str
        Display names: parameter 1 labels the x-axis, parameter 2 labels the
        legend entries.

    The scores are reshaped to (len(grid_param_2), len(grid_param_1)), which
    assumes that parameter 1 varies fastest in ``cv_results`` -- TODO confirm
    this matches the parameter order of the search being plotted.
    """
    # plotting grid results from David Alvarez on Stack Overflow

    # One row of mean test scores per value of parameter 2
    scores_mean = np.array(cv_results['mean_test_score'])
    scores_mean = scores_mean.reshape(len(grid_param_2), len(grid_param_1))

    _, ax = plt.subplots(1, 1)

    # Param 1 is the X-axis, each value of param 2 is drawn as its own curve
    for val, row in zip(grid_param_2, scores_mean):
        ax.plot(grid_param_1, row, '-o', label=name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid(True)

Cross Validation

In [None]:
def run_model(model, alg_name, notation):
    """Cross-validate ``model`` with subject-grouped folds and report the results.

    Uses the module-level ``X``, ``y`` and ``n_splits``. Column 1 of ``y`` is
    the classification target and column 0 the group used to build the folds.
    Prints the average and per-fold scores and a classification report, plots
    the confusion matrix of the out-of-fold predictions and returns the
    ``cross_validate`` result dict. ``notation`` is not used.
    """
    target = y[:, 1]
    subject_ids = y[:, 0]

    # Use the subject id to group the folds
    folds = list(GroupKFold(n_splits=n_splits).split(X, target, groups=subject_ids))

    ##############################################################
    # Cross validation; the fitted estimator of each fold is kept
    cv_scores = cross_validate(model, X[:, :], target, groups=subject_ids, cv=folds,
                               return_train_score=True, return_estimator=True)

    # Out-of-fold predictions: each fold's estimator predicts its own test rows
    y_pred = np.empty(target.shape[0], dtype=target.dtype.str)
    for fold_model, (_, test_idx) in zip(cv_scores['estimator'], folds):
        y_pred[test_idx] = fold_model.predict(X[test_idx])

    ##############################################################
    # Output results
    print(alg_name)

    pd_cv = pd.DataFrame(cv_scores)

    print("CV Average Scores: ")
    for metric in [col for col in pd_cv.columns if col != 'estimator']:
        average = pd_cv[metric].mean()
        spread = pd_cv[metric].std() * 2
        print("{}: {:.2f} (+/- {:.2f})".format(metric, average, spread))

    print("\nCV Iteration Scores: ")
    print(pd_cv[['fit_time', 'test_score', 'train_score']])
    print('\n')

    print("Classification Report: ")
    print(classification_report(target, y_pred))

    print("Confusion Matrix: ")
    plot_confusion_matrix(confusion_matrix(target, y_pred), np.unique(target))

    return cv_scores

##############################################################################

n_splits = 6
cv_models = {}

# ----- Random Forest ---------------
from sklearn.ensemble import RandomForestClassifier

# max_features='sqrt' is the setting that 'auto' selected for a
# RandomForestClassifier. 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, so the explicit name keeps the same model on every version.
model = RandomForestClassifier(bootstrap=False,
                               max_depth=57,
                               max_features='sqrt',
                               min_samples_leaf=1,
                               min_samples_split=20,
                               n_estimators=100)
    #n_estimators = 100, min_samples_split = 10, min_samples_leaf = 4, bootstrap=False, max_features='sqrt')
name = 'rf'
cv_models[name] = run_model(model, "Random Forest", name)

# # ----- xgboost ------------
# # install xgboost
# # 'pip install xgboost' or https://stackoverflow.com/questions/33749735/how-to-install-xgboost-package-in-python-windows-platform/39811079#39811079
# from xgboost import XGBClassifier

# model = XGBClassifier()
# notation = 'xg'
# cv_models[notation] = run_model(model, "XGBoost", 'xg')

#############################################
#Save cv_models and results in object file for future reference in needed
# np.save('/home/abdallah/Documents/Cross_Validate_Best_3_Models_Segment_exercise_no_hist_corr.npy', cv_models)

Grid Searching

In [None]:
#######################################GRID SEARCHING ###################################################################
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    """Plot the mean CV test score of a two-parameter grid search.

    Parameters
    ----------
    cv_results : mapping
        Grid-search results; only ``'mean_test_score'`` is read.
    grid_param_1, grid_param_2 : sequence
        The values searched for each of the two parameters.
    name_param_1, name_param_2 : str
        Display names: parameter 1 labels the x-axis, parameter 2 labels the
        legend entries.

    The scores are reshaped to (len(grid_param_2), len(grid_param_1)), which
    assumes that parameter 1 varies fastest in ``cv_results`` -- TODO confirm
    this matches the parameter order of the search being plotted.
    """
    # plotting grid results from David Alvarez on Stack Overflow

    # One row of mean test scores per value of parameter 2
    scores_mean = np.array(cv_results['mean_test_score'])
    scores_mean = scores_mean.reshape(len(grid_param_2), len(grid_param_1))

    _, ax = plt.subplots(1, 1)

    # Param 1 is the X-axis, each value of param 2 is drawn as its own curve
    for val, row in zip(grid_param_2, scores_mean):
        ax.plot(grid_param_1, row, '-o', label=name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid(True)

def run_model(model, alg_name, notation):
    """Grid-search the hyper-parameters of ``model`` with subject-grouped CV.

    Uses the module-level ``X``, ``y`` and ``n_splits``. Column 1 of ``y`` is
    the classification target and column 0 the group used to build the folds.
    Prints ``alg_name`` and returns the fitted ``GridSearchCV`` object.
    ``notation`` is only needed by the commented-out Pype variant below.
    """
    ##############################################################
    # Output model name
    print(alg_name)

    # Build the folds on the same arrays that are passed to fit() below.
    # Folds built on the per-recording data['X'] / data['Y'] would carry row
    # indices and groups that do not match the rows of the segmented X / y.
    target = y[:, 1]
    subject_ids = y[:, 0]
    cvk_list = list(GroupKFold(n_splits=n_splits).split(X, target, groups=subject_ids))

    ################# Alternative searches, kept for reference ################
    # Segment hyper-parameters (requires searching over the whole pype
    # instead of the bare model):
    # clf = sgl.Pype([#('sensors', Sensor_Select(['A','B','C','D','E','F','G','H'],['acc','gyro','mag','quat','pres'])),
    #             ('interp', Interp(1. / 25., categorical_target=True)),
    #             ('seg', sgl.Segment()),
    #             ('ftr', sgl.FeatureRep(features=d)),
    #             ('scaler', StandardScaler()),
    #             (notation , model)])
    # par_grid = {'seg__width': [12 ,25, 50, 75, 100, 125],
    #         'seg__overlap': [0., 0.2, 0.4, 0.6]}
    #
    # Random Forest hyper-parameters:
    # par_grid = {'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 2000, num = 6)],   # number of trees
    #               'max_features': ['auto', 'sqrt'],                                                 # features considered at every split
    #               'max_depth' : [int(x) for x in np.linspace(10, 200, num = 5)] + [None],           # maximum number of levels in tree
    #               'min_samples_split': [2, 10, 20, 30],                                             # min samples required to split a node
    #               'min_samples_leaf': [1, 2, 4, 8, 15],                                             # min samples required at each leaf node
    #               'bootstrap': [True, False]}                                                       # method of selecting samples per tree

    ################# XGBoost Hyperparameters #######################
    # NOTE(review): 'reg:squarederror' is a regression objective -- confirm
    # that it is intended for the classifier searched here.
    par_grid = {
        'learning_rate': [0.05, 0.10, 0.20, 0.30],
        'max_depth': [3, 7, 15],
        'min_child_weight': [1, 4, 7],
        'subsample': [0.5, 0.7],
        'gamma': [0.0, 0.2, 0.4],
        'colsample_bytree': [0.3, 0.5, 0.7],
        'n_estimators': [100, 200, 500],
        'objective': ['reg:squarederror']}

    # Grid searching through the parameters
    gs = GridSearchCV(estimator=model, param_grid=par_grid, cv=cvk_list,
                      n_jobs=-1, return_train_score=True)  #scoring='f1_weighted'
    # gs = RandomizedSearchCV(estimator = model, param_distributions = par_grid, n_iter = 100, cv = cvk_list, random_state=42, n_jobs = -1, return_train_score=True)

    gs.fit(X, target, groups=subject_ids)

    return gs

##############################################################################
# Number of subject-grouped folds and the container for the search results
n_splits = 6
cv_models = dict()

# ----- Random Forest ---------------
# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier()
# name = 'rf'
# cv_models[name] = run_model(model, "Random Forest", 'rf')

# # ----- xgboost ------------
# # install xgboost
# # 'pip install xgboost' or https://stackoverflow.com/questions/33749735/how-to-install-xgboost-package-in-python-windows-platform/39811079#39811079
from xgboost import XGBClassifier

notation = 'xg'
model = XGBClassifier()
cv_models[notation] = run_model(model, "XGBoost", notation)

#############################################
# Save cv_models and results in an object file for future reference if needed
np.save(file='/home/abdallah/Documents/Grid_Search_XGBoost_exercise_no_hist_corr.npy', arr=cv_models)