# Polynomial Harmonization algorithm

## Import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import glob
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew

import sys
sys.path.insert(0, '../') # path to our library functions

from cvasl import seperated


### get data

In [None]:
# Build the paths to the StrokeMRI and TOP CSV files. Both currently sit in
# the same results folder, but each dataset keeps its own directory variable.
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/'
filepath_top = '../open_work/internal_results/cleaned_pvc2s/'

filename_mri = os.path.join(filepath_mri, 'StrokeMRI_pvc2c.csv')
filename_top = os.path.join(filepath_top, 'TOP_pvc2c.csv')

In [None]:
# Load both datasets from their CSV files into dataframes.
StrokeMRI = pd.read_csv(filename_mri)
TOP = pd.read_csv(filename_top)

In [None]:
# Discard the first column of each dataframe before any further processing.
# NOTE(review): presumably this is an index column written when the CSVs were
# saved -- confirm against the files.
StrokeMRI = StrokeMRI.drop(columns=StrokeMRI.columns[0])
TOP = TOP.drop(columns=TOP.columns[0])

In [None]:
# Notebook cell output: display the TOP dataframe for inspection.
TOP

In [None]:
# Notebook cell output: display the StrokeMRI dataframe for inspection.
StrokeMRI

In [None]:
# Names of the columns for which polynomial coefficients are computed.
poly_columns = [
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_ivc_ratio',
    'gmwm_ivc_ratio',
    'wmh_vol',
    'wmh_count',
    'deepwm_b_cov',
    'aca_b_cov',
    'mca_b_cov',
    'pca_b_cov',
    'totalgm_b_cov',
    'deepwm_b',
    'aca_b',
    'mca_b',
    'pca_b',
    'totalgm_b',
]

In [None]:


def polyfit_second_degree_to_df(
        dataframe,
        special_column_name,
        other_column_names,
):
    """
    Fit a second-degree polynomial of several columns against one column.

    Rows that contain a missing value in any column of ``dataframe`` are
    dropped first. Then, for every name in ``other_column_names``, the
    polynomial ``y = a * x**2 + b * x + c`` is fitted with ``numpy.polyfit``,
    where ``x`` is ``special_column_name`` and ``y`` is the listed column.

    :param dataframe: dataframe variable
    :type dataframe: pandas.DataFrame
    :param special_column_name: name of the column to fit against (x values)
    :type special_column_name: str
    :param other_column_names: columns to get polynomial coefficients on
    :type other_column_names: list

    :returns: one row per fitted column, with the columns ``column``,
        ``coefficient_a``, ``coefficient_b`` and ``coefficient_c``
        (highest power first)
    :rtype: :class:`~pandas.DataFrame`
    """
    degree_poly = 2
    cleaned = dataframe.dropna()
    column_names = list(other_column_names)

    # The x values are identical for every fit, so convert them only once
    # instead of once per fitted column.
    xscat = np.array(pd.to_numeric(cleaned[special_column_name]))

    fits = [
        np.polyfit(
            xscat,
            np.array(pd.to_numeric(cleaned[interest_column_name])),
            degree_poly,
        )
        for interest_column_name in column_names
    ]

    # numpy.polyfit returns the coefficients ordered from highest power down.
    coefficient_dataframe = pd.DataFrame({
        'column': column_names,
        'coefficient_a': [fit[0] for fit in fits],
        'coefficient_b': [fit[1] for fit in fits],
        'coefficient_c': [fit[2] for fit in fits],
    })

    return coefficient_dataframe


In [None]:
# Fit each column in poly_columns against age for the TOP dataset.
TOP_polys = polyfit_second_degree_to_df(
    dataframe=TOP,
    special_column_name='age',
    other_column_names=poly_columns,
)

In [None]:
# Fit each column in poly_columns against age for the StrokeMRI dataset,
# then display the resulting coefficient table.
MRI_polys = polyfit_second_degree_to_df(
    dataframe=StrokeMRI,
    special_column_name='age',
    other_column_names=poly_columns,
)
MRI_polys

In [None]:
# col_list = []
# transformer_list = []
# poly_columns = ['coefficient_a','coefficient_b','coefficient_c']
# poly_columns_top = {
#     original: 'top_{}'.format(original) for original in poly_columns
# }
# poly_columns_top_renamed = list(poly_columns_top.values())
# poly_columns_mri = {
#     original: 'mri_{}'.format(original) for original in poly_columns
# }
# poly_columns_mri_renamed = list(poly_columns_mri.values())
# top_polys_selection = TOP_polys[poly_columns]
# mri_polys_selection = MRI_polys[poly_columns]
# together = top_polys_selection.copy()
# together.rename(columns=poly_columns_top, inplace=True)
# together = together.assign(**{
#     col_name: MRI_polys[col_name] for col_name in poly_columns
# })
# together.rename(columns=poly_columns_mri, inplace=True)

# def tf_matrix(row):
#     return seperated.generate_transformation_matrix(row[poly_columns_top_renamed], row[poly_columns_mri_renamed])

# transformers = together.apply(tf_matrix, axis=1)
# transformers

# Homemade harmonization algorithm #1

1. take the polynomials of TOP and StrokeMRI
2. assume we want to harmonize all values as if they had been shot on the StrokeMRI machine
3. take the difference between each TOP point and the TOP polynomial (at the same age)
4. make the harmonized TOP dataset by adding the differences to the StrokeMRI polynomial (at the same age)
5. test the harmonized dataset for 'ML harmony'

In [None]:
# our_top_data = TOP
# shared_columns = poly_columns
# cos_dataframe= cos_dataframe = polyfit_second_degree_to_df(
#         our_top_data,#dataframe_base,
#         'age',
#         poly_columns,
# )
# cos_dataframe

In [None]:
# Notebook cell output: display the StrokeMRI coefficient table.
MRI_polys

In [None]:
def derived_function(column, a, b, c):
    """
    Evaluate the quadratic ``a * column**2 + b * column + c``.

    :param column: value(s) at which the polynomial is evaluated
    :param a: coefficient of the squared term
    :param b: coefficient of the linear term
    :param c: constant term

    :returns: the polynomial evaluated at ``column``
    """
    squared_term = a * (column**2)
    linear_term = b * column
    return squared_term + linear_term + c


# For every fitted column, evaluate TOP's own polynomial at each participant's
# age and store the result in a new '<column>_projected' column.
coefficients = ['coefficient_a', 'coefficient_b', 'coefficient_c']
for column in TOP[poly_columns].columns:
    fitted = TOP_polys.loc[TOP_polys['column'] == column]
    if fitted.empty:
        # No coefficients were fitted for this column, so there is nothing
        # to project; report it and move on.
        print('skipping', column)
        continue
    a, b, c = fitted[coefficients].to_numpy().ravel().tolist()
    TOP[column + '_projected'] = derived_function(TOP['age'], a, b, c)
TOP

In [None]:
#TOP['age']

In [None]:

# For every fitted column: take each TOP value's offset from TOP's own
# polynomial ('<column>_diff') and add it to the StrokeMRI polynomial
# evaluated at the same age ('<column>_mri_projected').
coefficients = ['coefficient_a', 'coefficient_b', 'coefficient_c']
for column in TOP[poly_columns].columns:
    mri_fit = MRI_polys.loc[MRI_polys['column'] == column, coefficients]
    a_mri, b_mri, c_mri = mri_fit.to_numpy().ravel().tolist()
    residual = TOP[column] - TOP[column + '_projected']
    TOP[column + '_diff'] = residual
    TOP[column + '_mri_projected'] = residual + derived_function(TOP['age'], a_mri, b_mri, c_mri)


In [None]:
# Notebook cell output: display the TOP dataframe for inspection.
TOP

In [None]:
# Keep the identifying columns plus the '<name>_mri_projected' column of every
# entry in poly_columns. Building the names from poly_columns (instead of a
# second hand-written list) keeps this selection in step with the columns
# that were fitted; the resulting names and their order are unchanged.
harmonized_value_columns = [name + '_mri_projected' for name in poly_columns]
TOP_mri_homemade_harmonized = TOP[
    ['participant_id', 'age', 'sex'] + harmonized_value_columns
]
# Notebook cell output: number of rows in the selection.
len(TOP_mri_homemade_harmonized)

In [None]:
# Strip the '_mri_projected' suffix so each harmonized column carries its
# original name again. The mapping is built from poly_columns rather than
# written out by hand, so it cannot drift from the list of fitted columns;
# the resulting mapping is the same as the previous literal one.
TOP_mri_homemade_harmonized = TOP_mri_homemade_harmonized.rename(columns={
    name + '_mri_projected': name for name in poly_columns
})
# Notebook cell output: number of rows (renaming does not change it).
len(TOP_mri_homemade_harmonized)

In [None]:
# Notebook cell output: display the harmonized TOP dataframe.
TOP_mri_homemade_harmonized 

In [None]:
# Save the harmonized TOP table to the working directory. The dataframe index
# is written as the first CSV column (the pandas default, spelled out here).
TOP_mri_homemade_harmonized.to_csv(
    'TOP_mri_homemade_harmonized.csv',
    index=True,
)