# ML manual k-fold testing
This notebook tests k-fold functions, with the aim of creating a function that stratifies on both categorical and continuous variables (the continuous variables must first be binned into categories).

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interactive
import seaborn as sns

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import StratifiedShuffleSplit
import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config
import cvasl.harmony as har

# Part one: the specific example that mattered to us, splitting on age and sex

### Import data, clean and prep

In [None]:
# Locations of the two cleaned CSV exports (both cohorts share one folder,
# but each keeps its own path variable).
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/'
filepath_top = '../open_work/internal_results/cleaned_pvc2s/'
filename_mri = os.path.join(filepath_mri, 'StrokeMRI_pvc2c.csv')
filename_top = os.path.join(filepath_top, 'TOP_pvc2c.csv')

# Load both cohorts, then discard the first column of each frame.
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)
TOP = TOP.drop(columns=TOP.columns[0])
StrokeMRI = StrokeMRI.drop(columns=StrokeMRI.columns[0])

# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F': 0, 'M': 1}
TOP = TOP.assign(sex=TOP['sex'].map(sex_mapping))
StrokeMRI = StrokeMRI.assign(sex=StrokeMRI['sex'].map(sex_mapping))

# Preview the first rows of the recoded TOP frame.
TOP.head(3)

In [None]:
# Preview the first three rows of the StrokeMRI frame after the sex recoding.
StrokeMRI.head(3)

## add binned column on age

In [None]:
# Bin the continuous 'age' column into four bins (no plot); later cells read
# the result from a 'binned' column.
StrokeMRI = sep.bin_dataset(
    StrokeMRI,
    'age',
    num_bins=4,
    graph=False,
)


## Build ML models based on StrokeMRI

In [None]:
# Modelling frame: everything except the participant identifier.
ml_matrix = StrokeMRI.drop(columns='participant_id')

# Features are every remaining column except the target 'age', as floats.
# NOTE(review): at this point the frame appears to still carry the 'binned'
# column derived from age, so X would include an age-derived feature --
# confirm this is intended.
X = ml_matrix.drop(columns='age').values.astype('float')

# Target: age as a float array.
y = ml_matrix['age'].values.astype('float')


In [None]:
# Display the modelling frame built from StrokeMRI.
ml_matrix

In [None]:
# Check the Python type of a single 'binned' entry (the one at index 0),
# to see what kind of value the binning step produced.
type(ml_matrix['binned'][0])

In [None]:
# Fuse sex and age bin into one stratification label:
#   fuse_bin = sex * (number of distinct age bins) + numeric age bin
n_age_bins = len(ml_matrix['binned'].unique())
ml_matrix['fuse_bin'] = (
    ml_matrix['sex'] * n_age_bins
    + pd.to_numeric(ml_matrix['binned'])
)

### So now we have a category where 0 to 3 are sex category 0 progressing in age, and 4 to 7 are sex category 1 progressing in age

In [None]:
# Run the single-category stratified shuffle split on the fused sex/age label
# with a plain linear regression, printing per-fold information.
linr_k_frame, linr_y_frame, models = sep.stratified_one_category_shuffle_split(
    'linear regression',
    'unharm_mri_linr',
    LinearRegression(),
    ml_matrix,
    X,
    y,
    category=['fuse_bin'],
    printed=True,
)

In [None]:
# Display the first value returned by the split call (named as the per-fold
# results frame).
linr_k_frame

In [None]:
# Pass the per-fold results through sep.avg_k_folds and display the outcome
# (presumably an average across folds -- confirm against the cvasl helper).
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

In [None]:
# Display the second value returned by the split call (named as the y frame).
linr_y_frame

In [None]:
# Take the first entry of the returned models collection and show its first
# element (presumably the fitted estimator of a (model, data) pair -- confirm
# against the cvasl helper's return value).
linr = models[0]
linr[0]

# Now we will do the same thing with TOP but build this into a more general function

In [None]:

def stratified_cat_and_cont_categories_shuffle_split(
        model_name,
        model_file_name,
        scikit_model,
        our_ml_matrix,
        our_x,
        our_y,
        cat_category='sex',
        cont_category='age',
        splits=5,
        test_size_p=0.25,
        printed=False
):
    """
    Evaluate a scikit-learn model over stratified shuffle-split folds,
    stratifying jointly on one categorical and one continuous column.

    The continuous column is binned into 4 bins with ``sep.bin_dataset``
    and fused with the categorical column into a single label
    (``cat * number_of_bins + bin``); that fused label is what
    StratifiedShuffleSplit stratifies on.
    The random state in the StratifiedShuffleSplit is set, so
    the splits should be reproducible.

    Column 0 of ``our_x`` is NOT used as a feature: the model is fitted on
    ``our_x[:, 1:]`` and the column-0 values of each training fold are
    returned next to the model (presumably an identifier column --
    callers should confirm their matrix is laid out that way).

    :param model_name: name of model, used to label result rows
    :type model_name: str
    :param model_file_name: base name recorded per fold as
        ``<model_file_name>.<fold>``
    :type model_file_name: str
    :param scikit_model: estimator instance exposing ``fit``, ``predict``
        and ``score``; it is refitted on every fold
    :type scikit_model: scikit-learn estimator
    :param our_ml_matrix: dataframe holding the stratification columns
    :type our_ml_matrix: `~pd.DataFrame`
    :param our_x: X or feature array for machine learning (2-D, indexed
        positionally)
    :type our_x: `~numpy.ndarray`
    :param our_y: y or label values for machine learning (indexed
        positionally)
    :type our_y: `~numpy.ndarray`
    :param cat_category: categorical variable (column) to be stratified on
        eg. sex; must be numeric because it is multiplied by the bin count
    :type cat_category: str
    :param cont_category: continuous variable (column) to be stratified on
        eg. age
    :type cont_category: str
    :param splits: number of folds desired
    :type splits: int
    :param test_size_p: fraction of the data to put into test
    :type test_size_p: float
    :param printed: printed information on folds option
    :type printed: bool

    :returns: dataframe of per-fold metrics (algorithm, fold, file_name,
        mae, r2, explained_variance), dataframe of y_test versus y_pred
        over all folds, and a list of
        ``(fitted model, column 0 of that fold's training rows)`` tuples
    :rtype: tuple
    """
    # Local import: only needed to snapshot the fitted estimator per fold.
    import copy

    our_ml_matrix = sep.bin_dataset(
        our_ml_matrix, cont_category, num_bins=4, graph=False
    )
    bins = our_ml_matrix['binned']
    # Fused stratification label: one distinct integer per (category, bin).
    our_ml_matrix['fuse_bin'] = (
        our_ml_matrix[cat_category] * len(bins.unique())
        + pd.to_numeric(bins)
    )
    y_split = our_ml_matrix['fuse_bin'].values
    sss = StratifiedShuffleSplit(
        n_splits=splits,
        test_size=test_size_p,
        random_state=12
    )

    X = our_x
    # TODO: (makeda)finish split and put back index so everything is traceable
    y = our_y

    cols = [
        'algorithm',
        'fold',
        'file_name',
        'mae',
        'r2',
        'explained_variance',
    ]
    y_frames = []
    all_mod_results = []
    models = []
    for i, (train_index, test_index) in enumerate(sss.split(X, y_split)):
        # Column 0 is held out of the features (see docstring).
        current_fold_X_train = X[train_index][:, 1:]
        current_fold_y_train = y[train_index]
        current_fold_X_test = X[test_index][:, 1:]
        current_fold_y_test = y[test_index]
        scikit_model.fit(current_fold_X_train, current_fold_y_train)
        current_fold_y_pred = scikit_model.predict(current_fold_X_test)
        if printed:
            print(f"\nFold {i}:")
            print(
                f'Train shapes: X {X[train_index].shape}',
                f' y {y[train_index].shape}'
            )
            unique_train, counts_train = np.unique(
                y_split[train_index], return_counts=True
            )
            print(
                f'Category classes: {unique_train}',
                f'Made of categorical: {our_ml_matrix[cat_category].unique()} ',
                f'and continous binned to: {bins.unique()} ',
                f'percentages: {100*counts_train/y[train_index].shape[0]}',
            )
            print(
                f'\nTest shapes: X {X[test_index].shape}',
                f'  y {y[test_index].shape}'
            )
            unique_test, counts_test = np.unique(
                y_split[test_index], return_counts=True
            )
            print(
                f'Category classes: {unique_test},'
                f'percentages: {100*counts_test/y[test_index].shape[0]}'
            )

        # One metrics row per fold, built directly (no concat onto an empty
        # frame).
        data = [[
            f'{model_name}-{i}',
            i,
            f'{model_file_name}.{i}',
            mean_absolute_error(current_fold_y_test, current_fold_y_pred),
            scikit_model.score(current_fold_X_test, current_fold_y_test),
            metrics.explained_variance_score(
                current_fold_y_test,
                current_fold_y_pred
            )]]
        all_mod_results.append(pd.DataFrame(data, columns=cols))

        y_frames.append(pd.DataFrame(
            {
                'y_test': list(current_fold_y_test),
                'y_pred': list(current_fold_y_pred),
            }))

        # Store a snapshot of this fold's fit: the same estimator object is
        # refitted on the next iteration, so appending it directly would make
        # every entry alias the last fold's model.
        models.append(
            (copy.deepcopy(scikit_model), X[train_index][:, 0])
        )

    df = pd.concat(all_mod_results)
    # Concatenate however many folds were requested, not a fixed five.
    y_frame = pd.concat(y_frames, axis=0)
    return df, y_frame, models


In [None]:
# Modelling frame for TOP: everything except the participant identifier.
ml_matrix = TOP.drop(columns='participant_id')

# Features are every remaining column except the target 'age', as floats.
X = ml_matrix.drop(columns='age').values.astype('float')

# Target: age as a float array.
y = ml_matrix['age'].values.astype('float')


In [None]:
# Run the combined categorical/continuous stratified split on the TOP data
# with the default stratification columns, printing per-fold information.
# NOTE(review): the file-name label still says 'unharm_mri_linr' although the
# data here is TOP -- confirm whether that is intended.
linr_k_frame, linr_y_frame, models = (
    stratified_cat_and_cont_categories_shuffle_split(
        'linear regression',
        'unharm_mri_linr',
        LinearRegression(),
        ml_matrix,
        X,
        y,
        printed=True,
    )
)

# 