# Generalized polynomial based work

This notebook covers comparing two datasets, one of which will be considered the "base" dataset, and one will be the dataset which we compare to.
The environment is the normal cvasl environment (mrilander).

### import needed libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import glob
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)


import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.insert(0, '../') # path to functions
from cvasl import file_handler as fh # 
from cvasl import mold #
from cvasl import carve
from cvasl import seperated
from cvasl.file_handler import Config

### get data

In [2]:
# set up data pull
config = Config()
root_mri_directory = config.get_directory('raw_data')

## Setting the base and comapared datasets
In this example we will use the TOP dataset as our base, and mriStroke as the other dataset.
This is highly problematic for anything stratified by gender, but we will overlook that for now,
as both datasets have about 50% women.

In [3]:
base = os.path.join(root_mri_directory, 'assembled/top_stitched.csv')
compared = os.path.join(root_mri_directory, 'assembled/StrokeMRI_stitched.csv')
# in the future the below should be base_data and the tricks will skip
our_top_data = pd.read_csv(base)
dataframe_compared = pd.read_csv(compared)

### temporary trick to deal with data inconsistency

In [4]:
our_top_data['GM_vol'] = our_top_data['GM_vol_Liter']
our_top_data['WM_vol'] = our_top_data['WM_vol_Liter']
our_top_data['CSF_vol'] = our_top_data['CSF_vol_Liter']
our_top_data['GM_ICVRatio'] = our_top_data['GM_ICVRatio_ratio GM/ICV'] 
our_top_data['WMH_vol'] = our_top_data['GMWM_ICVRatio_ratio (GM+WM)/ICV']
our_top_data['WMH_count'] = our_top_data['WMH_count_n lesions (integer)']
our_top_data['DeepWM_B'] = our_top_data['DeepWM_B_SD/mean']
our_top_data['DeepWM_L'] = our_top_data['DeepWM_L_SD/mean']
our_top_data['DeepWM_L'] = our_top_data['DeepWM_R_SD/mean']
our_top_data['ACA_B']= our_top_data['ACA_B_SD/mean']    
our_top_data['ACA_L']= our_top_data['ACA_L_SD/mean']           
our_top_data['ACA_R']= our_top_data['ACA_R_SD/mean']            
our_top_data['MCA_B']= our_top_data['MCA_B_SD/mean']      

In [5]:
# now we find common columns; this will be easier when all is same formatted
shared_columns = (
        dataframe_compared.columns.intersection(our_top_data.columns)).to_list()

In [6]:
shared_columns

['Unnamed: 0',
 'GM_vol',
 'WM_vol',
 'CSF_vol',
 'GM_ICVRatio',
 'WMH_vol',
 'WMH_count',
 'renumber',
 'DeepWM_B',
 'DeepWM_L',
 'ACA_B',
 'ACA_L',
 'ACA_R',
 'MCA_B',
 'Sex',
 'Age']

### create base polynomials

In [7]:
# find common columns

In [8]:
special_data_column = 'Age'

In [9]:
other_columns = [
    'GM_vol',
     'WM_vol',
     'CSF_vol',
     'GM_ICVRatio',
     'WMH_vol',
     'WMH_count',
     'DeepWM_B',
     'DeepWM_L',
     'ACA_B',
     'ACA_L',
     'ACA_R',
     'MCA_B', ]

In [10]:
our_top_data

Unnamed: 0.1,Unnamed: 0,index_,GM_vol_Liter,session_...,LongitudinalTimePoint_integer,WM_vol_Liter,WMH_count_n lesions (integer),WMH_vol_mL,CSF_vol_Liter,GMWM_ICVRatio_ratio (GM+WM)/ICV,...,CSF_vol,GM_ICVRatio,WMH_vol,WMH_count,DeepWM_B,DeepWM_L,ACA_B,ACA_L,ACA_R,MCA_B
0,0,sub-0001_1,0.71736,ASL_1,1,0.52803,24.0,1.743,0.31812,0.79653,...,0.31812,0.45881,0.79653,24.0,8.7620,8.9388,2.2748,2.3388,2.1197,2.3205
1,1,sub-0002_1,0.72383,ASL_1,1,0.62394,23.0,1.629,0.25673,0.83999,...,0.25673,0.45112,0.83999,23.0,9.0749,10.2840,1.7564,1.7518,1.7082,2.3989
2,2,sub-0019_1,0.71224,ASL_1,1,0.53295,13.0,0.621,0.33594,0.78753,...,0.33594,0.45046,0.78753,13.0,8.8791,8.8246,1.9850,1.9140,2.0353,1.8702
3,3,sub-0020_1,0.67042,ASL_1,1,0.43458,23.0,0.736,0.25974,0.80968,...,0.25974,0.49124,0.80968,23.0,6.2080,6.1139,1.8657,1.8980,1.8358,1.9092
4,4,sub-0022_1,0.63666,ASL_1,1,0.46938,22.0,2.009,0.22144,0.83319,...,0.22144,0.47960,0.83319,22.0,7.4561,7.3614,1.6049,1.5923,1.6123,2.0383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,526,sub-1163_1,0.78201,ASL_1,1,0.49489,18.0,1.431,0.24608,0.83842,...,0.24608,0.51347,0.83842,18.0,8.2103,8.7134,1.9598,1.8279,1.9940,2.1176
527,527,sub-1165_1,0.71537,ASL_1,1,0.48601,18.0,1.425,0.27123,0.81582,...,0.27123,0.48578,0.81582,18.0,6.6858,7.0912,1.4851,1.4708,1.5045,1.9477
528,528,sub-1167_1,0.73473,ASL_1,1,0.53549,20.0,3.935,0.35985,0.77924,...,0.35985,0.45074,0.77924,20.0,4.5665,4.3038,1.8432,1.8513,1.8286,2.0176
529,529,sub-1168_1,0.71094,ASL_1,1,0.60974,26.0,4.170,0.37480,0.77894,...,0.37480,0.41931,0.77894,26.0,9.5267,9.0904,1.7798,1.6765,1.8759,1.9444


In [11]:
# for s_column in other_columns :
#     print(s_column)
#     seperated.polyfit_and_show(
#         our_top_data,#dataframe_base,
#         special_data_column,
#         s_column,
#         2,
#         color1='purple',
# )
    

In [12]:
# below functions must go into main library

In [13]:


def polyfit_second_degree_to_df(
        dataframe,
        special_column_name,
        other_column_names,
):
    """
    This function creates a polynomial for two columns.
    It returns the coefficients
    
    :param dataframe: dataframe variable
    :type dataframe: pandas.dataFrame
    :param special_column_name: string of column you want to graph against
    :type  special_column_name: str
    :param other_column_name: string of column you want to graph
    :type other_column_name: str
    :param degree_poly: either 1,2 or 3 only
    :type  degree_poly: int


    :returns: coeffiects
    :rtype: :class:`~numpy.ndarray`
    """
    list_as = []
    list_bs = []
    list_cs = []
    list_columns = []
    dataframe = dataframe.dropna()
    for interest_column_name in other_column_names:
        xscat = np.array(pd.to_numeric(dataframe[special_column_name]))
        yscat = np.array(pd.to_numeric(dataframe[interest_column_name]))
        coefficients = np.polyfit(xscat, yscat, 2 ) #2 = degree_poly
        list_columns.append(interest_column_name)
        list_as.append(coefficients[0])
        list_bs.append(coefficients[1])
        list_cs.append(coefficients[2])
    d = {'column':list_columns,'coefficient_a':list_as, 'coefficient_b':list_bs, 'coefficient_c':list_cs}
    coefficien_dataframe = pd.DataFrame(d)
   
    return coefficien_dataframe


In [14]:
def derived_function(column, a, b, c):
    return a * (column**2) + b * column + c


In [15]:
cos_dataframe = polyfit_second_degree_to_df(
        our_top_data,#dataframe_base,
        special_data_column,
        other_columns,
)
cos_dataframe

Unnamed: 0,column,coefficient_a,coefficient_b,coefficient_c
0,GM_vol,-1.2e-05,-0.002151,0.79538
1,WM_vol,-8.5e-05,0.006655,0.406386
2,CSF_vol,6e-06,0.00248,0.203789
3,GM_ICVRatio,1.9e-05,-0.003429,0.556799
4,WMH_vol,-1.6e-05,-0.000725,0.851483
5,WMH_count,0.003235,-0.182226,24.661956
6,DeepWM_B,-0.002006,0.10191,6.774059
7,DeepWM_L,-0.002127,0.113351,6.488748
8,ACA_B,6.6e-05,-0.005784,1.973782
9,ACA_L,9.1e-05,-0.00869,2.049764


In [16]:
projected_columns = []
coefficients = ['coefficient_a', 'coefficient_b', 'coefficient_c']
for column in our_top_data[shared_columns].columns:
    projected_columns.append(column + '_projected')
    row = cos_dataframe[cos_dataframe['column'] == column]
    if row.empty:
        # The columns that appear "weird" below (eg. `Series([], dtype: float64)`)
        # are the columns not found in `cos_dataframe`, so they don't have associated coefficients..
        print('skipping', column)
        continue
    a, b, c = row[coefficients].values.flatten().tolist()
    our_top_data[column + '_projected'] = derived_function(our_top_data['Age'], a, b, c)
our_top_data

skipping Unnamed: 0
skipping renumber
skipping Sex
skipping Age


Unnamed: 0.1,Unnamed: 0,index_,GM_vol_Liter,session_...,LongitudinalTimePoint_integer,WM_vol_Liter,WMH_count_n lesions (integer),WMH_vol_mL,CSF_vol_Liter,GMWM_ICVRatio_ratio (GM+WM)/ICV,...,CSF_vol_projected,GM_ICVRatio_projected,WMH_vol_projected,WMH_count_projected,DeepWM_B_projected,DeepWM_L_projected,ACA_B_projected,ACA_L_projected,ACA_R_projected,MCA_B_projected
0,0,sub-0001_1,0.71736,ASL_1,1,0.52803,24.0,1.743,0.31812,0.79653,...,0.322716,0.443130,0.790570,22.854760,7.411559,7.396210,1.846598,1.844713,1.823944,1.985095
1,1,sub-0002_1,0.72383,ASL_1,1,0.62394,23.0,1.629,0.25673,0.83999,...,0.307362,0.452970,0.800926,22.427466,7.734274,7.710629,1.848708,1.851013,1.822728,2.000969
2,2,sub-0019_1,0.71224,ASL_1,1,0.53295,13.0,0.621,0.33594,0.78753,...,0.290004,0.465604,0.811855,22.150655,7.972657,7.931341,1.855560,1.864434,1.824292,2.028135
3,3,sub-0020_1,0.67042,ASL_1,1,0.43458,23.0,0.736,0.25974,0.80968,...,0.261105,0.490517,0.828053,22.219716,8.044643,7.952605,1.878449,1.902960,1.834446,2.097050
4,4,sub-0022_1,0.63666,ASL_1,1,0.46938,22.0,2.009,0.22144,0.83319,...,0.305082,0.454536,0.802410,22.378311,7.773433,7.747980,1.849331,1.852386,1.822751,2.003966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,526,sub-1163_1,0.78201,ASL_1,1,0.49489,18.0,1.431,0.24608,0.83842,...,0.253190,0.498257,0.832018,22.363795,7.987625,7.876662,1.887430,1.917334,1.839011,2.121521
527,527,sub-1165_1,0.71537,ASL_1,1,0.48601,18.0,1.425,0.27123,0.81582,...,0.294477,0.462189,0.809121,22.200222,7.924574,7.888686,1.853323,1.860312,1.823579,2.020163
528,528,sub-1167_1,0.73473,ASL_1,1,0.53549,20.0,3.935,0.35985,0.77924,...,0.294649,0.462060,0.809014,22.202443,7.922529,7.886838,1.853244,1.860162,1.823556,2.019869
529,529,sub-1168_1,0.71094,ASL_1,1,0.60974,26.0,4.170,0.37480,0.77894,...,0.328174,0.439918,0.786740,23.045868,7.272795,7.258820,1.846697,1.843670,1.824935,1.981203


In [23]:
shared_columns_new =  ['GM_vol',
 'WM_vol',
 'CSF_vol',
 'GM_ICVRatio',
 'WMH_vol',
 'WMH_count',
 'DeepWM_B',
 'DeepWM_L',
 'ACA_B',
 'ACA_L',
 'ACA_R',
 'MCA_B',]

In [24]:
difference_columns = []
for column in our_top_data[shared_columns_new].columns:
    difference_columns.append(column+ '_diff')
    our_top_data[column + '_diff'] = our_top_data[column] - our_top_data[column + '_projected']
our_top_data    

Unnamed: 0.1,Unnamed: 0,index_,GM_vol_Liter,session_...,LongitudinalTimePoint_integer,WM_vol_Liter,WMH_count_n lesions (integer),WMH_vol_mL,CSF_vol_Liter,GMWM_ICVRatio_ratio (GM+WM)/ICV,...,WMH_count_diff,renumber_diff,DeepWM_B_diff,DeepWM_L_diff,ACA_B_diff,ACA_L_diff,ACA_R_diff,MCA_B_diff,Sex_diff,Age_diff
0,0,sub-0001_1,0.71736,ASL_1,1,0.52803,24.0,1.743,0.31812,0.79653,...,1.145240,sub-0001_1,1.350441,1.542590,0.428202,0.494087,0.295756,0.335405,1,43.49
1,1,sub-0002_1,0.72383,ASL_1,1,0.62394,23.0,1.629,0.25673,0.83999,...,0.572534,sub-0002_1,1.340626,2.573371,-0.092308,-0.099213,-0.114528,0.397931,0,38.30
2,2,sub-0019_1,0.71224,ASL_1,1,0.53295,13.0,0.621,0.33594,0.78753,...,-9.150655,sub-0019_1,0.906443,0.893259,0.129440,0.049566,0.211008,-0.157935,1,32.30
3,3,sub-0020_1,0.67042,ASL_1,1,0.43458,23.0,0.736,0.25974,0.80968,...,0.780284,sub-0020_1,-1.836643,-1.838705,-0.012749,-0.004960,0.001354,-0.187850,0,21.97
4,4,sub-0022_1,0.63666,ASL_1,1,0.46938,22.0,2.009,0.22144,0.83319,...,-0.378311,sub-0022_1,-0.317333,-0.386580,-0.244431,-0.260086,-0.210451,0.034334,0,37.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,526,sub-1163_1,0.78201,ASL_1,1,0.49489,18.0,1.431,0.24608,0.83842,...,-4.363795,sub-1163_1,0.222675,0.836738,0.072370,-0.089434,0.154989,-0.003921,0,19.06
527,527,sub-1165_1,0.71537,ASL_1,1,0.48601,18.0,1.425,0.27123,0.81582,...,-4.200222,sub-1165_1,-1.238774,-0.797486,-0.368223,-0.389512,-0.319079,-0.072463,0,33.86
528,528,sub-1167_1,0.73473,ASL_1,1,0.53549,20.0,3.935,0.35985,0.77924,...,-2.202443,sub-1167_1,-3.356029,-3.583038,-0.010044,-0.008862,0.005044,-0.002269,1,33.92
529,529,sub-1168_1,0.71094,ASL_1,1,0.60974,26.0,4.170,0.37480,0.77894,...,2.954132,sub-1168_1,2.253905,1.831580,-0.066897,-0.167170,0.050965,-0.036803,1,45.31


In [None]:
# #our_top_data[['Age', 'GM_vol', 'GM_vol_projected']].sort_values(by='Age')
# plt.scatter(our_top_data['Age'],our_top_data[ 'GM_vol'])
# plt.scatter(our_top_data['Age'],our_top_data[ 'GM_vol_projected'])

In [None]:
# #cos_dataframe[1]['coefficient_a']
# cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_a']

In [None]:
# # extract multipliers and make new columns
# coef_a = cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_a']
# coef_b = cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_b']
# coef_c = cos_dataframe.loc[cos_dataframe['column'] == "GM_vol", 'coefficient_c']
# our_top_data['GM_vol_projected'] = derived_function(our_top_data['Age'], coef_a, coef_b,  coef_c)

In [None]:
#for column in our_top_data[projected_columns].columns:


In [None]:
# coef_a = cos_dataframe.loc[cos_dataframe['column'] == "WM_vol", 'coefficient_a']
# coef_b = cos_dataframe.loc[cos_dataframe['column'] == "WM_vol", 'coefficient_b']
# coef_c = cos_dataframe.loc[cos_dataframe['column'] == "WM_vol", 'coefficient_c']
# our_top_data['WM_vol_projected'] = derived_function(our_top_data['Age'], coef_a, coef_b,  coef_c)

In [None]:
# coef_a = cos_dataframe.loc[cos_dataframe['column'] == "WMH_vol", 'coefficient_a']
# coef_b = cos_dataframe.loc[cos_dataframe['column'] == "WMH_vol", 'coefficient_b']
# coef_c = cos_dataframe.loc[cos_dataframe['column'] == "WMH_vol", 'coefficient_c']
# our_top_data['WMH_vol_projected'] = derived_function(our_top_data['Age'], coef_a, coef_b,  coef_c)