# Generalized polynomial based work

This notebook compares two datasets: one is treated as the "base" dataset, and the other is the dataset we compare against it.
The environment is the normal cvasl environment (mrilander).

### import needed libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import glob
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)


import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.insert(0, '../') # path to functions
from cvasl import file_handler as fh # 
from cvasl import mold #
from cvasl import carve
from cvasl import seperated
from cvasl.file_handler import Config

### get data

In [None]:
# set up data pull
# Config is imported from cvasl.file_handler; the directory returned for the
# 'raw_data' key is the root that the CSV paths are joined onto.
config = Config()
root_mri_directory = config.get_directory('raw_data')

## Setting the base and compared datasets
In this example we will use the TOP dataset as our base, and mriStroke as the other dataset.
This is highly problematic for anything stratified by gender, but we will overlook that for now,
as both datasets have about 50% women.

In [None]:
# Paths of the two stitched CSV files, both located under root_mri_directory.
base = os.path.join(root_mri_directory, 'assembled/top_stitched.csv')
compared = os.path.join(root_mri_directory, 'assembled/StrokeMRI_stitched.csv')
# in the future the below should be base_data and the tricks will skip
# `our_top_data` is read from the base path, `dataframe_compared` from the
# compared path.
our_top_data = pd.read_csv(base)
dataframe_compared = pd.read_csv(compared)

### temporary trick to deal with data inconsistency

In [None]:
# Copy the long, unit-suffixed columns of the base dataset to short names
# (presumably so they match the column names of the compared dataset --
# confirm once both files share one format).
our_top_data['GM_vol'] = our_top_data['GM_vol_Liter']
our_top_data['WM_vol'] = our_top_data['WM_vol_Liter']
our_top_data['CSF_vol'] = our_top_data['CSF_vol_Liter']
our_top_data['GM_ICVRatio'] = our_top_data['GM_ICVRatio_ratio GM/ICV']
# NOTE(review): WMH_vol is filled from the (GM+WM)/ICV ratio column, which
# looks like the wrong source -- confirm the intended WMH volume column.
our_top_data['WMH_vol'] = our_top_data['GMWM_ICVRatio_ratio (GM+WM)/ICV']
our_top_data['WMH_count'] = our_top_data['WMH_count_n lesions (integer)']
our_top_data['DeepWM_B'] = our_top_data['DeepWM_B_SD/mean']
our_top_data['DeepWM_L'] = our_top_data['DeepWM_L_SD/mean']
# The '_R' source column gets its own '_R' target; writing it to 'DeepWM_L'
# would overwrite the '_L' values assigned on the previous line.
our_top_data['DeepWM_R'] = our_top_data['DeepWM_R_SD/mean']
our_top_data['ACA_B'] = our_top_data['ACA_B_SD/mean']
our_top_data['ACA_L'] = our_top_data['ACA_L_SD/mean']
our_top_data['ACA_R'] = our_top_data['ACA_R_SD/mean']
our_top_data['MCA_B'] = our_top_data['MCA_B_SD/mean']

In [None]:
# now we find common columns; this will be easier when all is same formatted
# Index.intersection keeps only the column labels present in both dataframes;
# the resulting Index is converted to a plain Python list.
shared_columns = (
        dataframe_compared.columns.intersection(our_top_data.columns)).to_list()

In [None]:
# show the list of column names common to both dataframes
shared_columns

### create base polynomials

In [None]:
# find common columns

In [None]:
# name of the column that every other column is fitted against (the x variable)
special_data_column = 'Age'

In [None]:
# columns that are each fitted against `special_data_column` (the y variables)
other_columns = [
    'GM_vol',
    'WM_vol',
    'CSF_vol',
    'GM_ICVRatio',
    'WMH_vol',
    'WMH_count',
    'DeepWM_B',
    'DeepWM_L',
    'ACA_B',
    'ACA_L',
    'ACA_R',
    'MCA_B',
]

In [None]:
# show the base dataframe
our_top_data

In [None]:
# for s_column in other_columns :
#     print(s_column)
#     seperated.polyfit_and_show(
#         our_top_data,#dataframe_base,
#         special_data_column,
#         s_column,
#         2,
#         color1='purple',
# )
    

In [None]:
# below functions must go into main library

In [None]:


def polyfit_second_degree_to_df(
        dataframe,
        special_column_name,
        other_column_names,
):
    """
    Fit a second-degree polynomial of several columns against one column.

    For every name in ``other_column_names`` a polynomial
    ``a * x**2 + b * x + c`` is fitted with :func:`numpy.polyfit`, where
    ``x`` is the special column and ``y`` is the column of interest.
    Rows that contain a missing value in *any* column of ``dataframe``
    are dropped before fitting.

    :param dataframe: dataframe variable
    :type dataframe: pandas.DataFrame
    :param special_column_name: name of the column to fit against (x)
    :type  special_column_name: str
    :param other_column_names: names of the columns to fit (y), one
        polynomial per name
    :type other_column_names: list of str

    :returns: one row per fitted column, with the columns ``column``,
        ``coefficient_a``, ``coefficient_b`` and ``coefficient_c``
    :rtype: :class:`~pandas.DataFrame`
    """
    degree_poly = 2
    list_as = []
    list_bs = []
    list_cs = []
    list_columns = []
    dataframe = dataframe.dropna()
    # x does not depend on the column of interest, so build it only once
    xscat = np.array(pd.to_numeric(dataframe[special_column_name]))
    for interest_column_name in other_column_names:
        yscat = np.array(pd.to_numeric(dataframe[interest_column_name]))
        # polyfit returns the highest power first: [a, b, c]
        coefficients = np.polyfit(xscat, yscat, degree_poly)
        list_columns.append(interest_column_name)
        list_as.append(coefficients[0])
        list_bs.append(coefficients[1])
        list_cs.append(coefficients[2])
    coefficient_dataframe = pd.DataFrame({
        'column': list_columns,
        'coefficient_a': list_as,
        'coefficient_b': list_bs,
        'coefficient_c': list_cs,
    })

    return coefficient_dataframe


In [None]:
def derived_function(column, a, b, c):
    """
    Evaluate ``a * x**2 + b * x + c`` for every value of a column.

    :param column: iterable of x values
    :param a: coefficient of the squared term
    :param b: coefficient of the linear term
    :param c: constant term

    :returns: the evaluated values, one per input value, in input order
    :rtype: list
    """
    return [a * (x**2) + b* x + c for x in column]


In [None]:
# fit every column of `other_columns` against `special_data_column`
# on the base dataset
cos_dataframe = polyfit_second_degree_to_df(
    dataframe=our_top_data,
    special_column_name=special_data_column,
    other_column_names=other_columns,
)
# show the resulting table of coefficients
cos_dataframe

In [None]:
#our_top_data

In [None]:
#for each column, make an estimated value
# estimated_by = derived_function(our_top_data['Age'], -1.18633655e-05, -2.15086048e-03,  7.95379583e-01)
# len(estimated_by)

In [None]:
# one '<name>_projected' column name for every shared column
projected_columns = [column + '_projected' for column in shared_columns]
# create the projected columns on the base dataframe, initialised to 0
our_top_data[projected_columns] = 0
our_top_data

In [None]:
#cos_dataframe[1]['coefficient_a']
cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_a']

In [None]:
# extract multipliers and make new columns
coef_a = cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_a']
coef_b = cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_b']
coef_c = cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_c']
our_top_data['GM_vol_projected'] = derived_function(our_top_data['Age'], coef_a, coef_b,  coef_c)

In [None]:
#for column in our_top_data[projected_columns].columns:


In [None]:
# Project WM_vol from Age with its fitted coefficients.
# `.loc` with a boolean row mask returns a one-element Series; `.iloc[0]`
# takes the single value so the projection is computed from plain numbers.
coef_a = cos_dataframe.loc[cos_dataframe['column'] == "WM_vol", 'coefficient_a'].iloc[0]
coef_b = cos_dataframe.loc[cos_dataframe['column'] == "WM_vol", 'coefficient_b'].iloc[0]
coef_c = cos_dataframe.loc[cos_dataframe['column'] == "WM_vol", 'coefficient_c'].iloc[0]
our_top_data['WM_vol_projected'] = derived_function(our_top_data['Age'], coef_a, coef_b, coef_c)

In [None]:
# Project WMH_vol from Age with its fitted coefficients.
# `.loc` with a boolean row mask returns a one-element Series; `.iloc[0]`
# takes the single value so the projection is computed from plain numbers.
coef_a = cos_dataframe.loc[cos_dataframe['column'] == "WMH_vol", 'coefficient_a'].iloc[0]
coef_b = cos_dataframe.loc[cos_dataframe['column'] == "WMH_vol", 'coefficient_b'].iloc[0]
coef_c = cos_dataframe.loc[cos_dataframe['column'] == "WMH_vol", 'coefficient_c'].iloc[0]
our_top_data['WMH_vol_projected'] = derived_function(our_top_data['Age'], coef_a, coef_b, coef_c)