# Assembling the TOP dataset

### Import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import glob
import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

## Read data into pandas dataframe

How do we define which files should be used together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will use the first option, approved by Mathijs on 26th June 2023

In [None]:
experiment_folder= 'TOP'

In [None]:
# Look up the raw-data root through the project Config and report whether the
# experiment's folder can be found underneath it.
config = Config()
root_directory = config.get_directory('raw_data')
experiment_path = os.path.join(root_directory, experiment_folder)
folder_message = (
    "this folder exists, we will take tsv from here"
    if os.path.isdir(experiment_path)
    else "this folder does not seem to exist, try typing again"
)
print(folder_message)

In [None]:
# Collect the full path of every .tsv file in the experiment folder.
# os.listdir returns names in arbitrary order, so they are sorted here to keep
# the list (and anything later selected from it by position) reproducible
# from one run or machine to the next.
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

tsv_files = [
    os.path.join(tsv_path, file_name)
    for file_name in sorted(os.listdir(tsv_path))
    if file_name.endswith('.tsv')
]
tsv_files

In [None]:
# Load each selected file into its own dataframe: tab-separated, with the
# first line used as the header. The list keeps the order of tsv_files.
read_options = {'sep': '\t', 'header': [0]}
dataframes = [pd.read_csv(tsv_file, **read_options) for tsv_file in tsv_files]
# Take the third dataframe as a sample to look at (needs at least three files).
sample_df = dataframes[2]
cols = sample_df.columns
sample_df

## Below we will see how many different columns should be in our final dataframe, except sex and age, which we add from another frame

In [None]:
# Gather every distinct column name across the dataframes, in order of first
# appearance: dict keys keep insertion order and silently drop repeats.
list_all_column = list(dict.fromkeys(
    column_name
    for frame in dataframes
    for column_name in frame.columns.to_list()
))
len(list_all_column)

In [None]:
#len(list_all_column)

### Below we take all the dataframes and put them into tsv files in a specific folder

In [None]:
# Write every dataframe to its own numbered TSV file (1.tsv, 2.tsv, ...) in
# the 'stitchy' results folder. The dataframe index is written as well, since
# that is the to_csv default.
filepath = '../open_work/internal_results/stitchy'
# Create the folder once, before the loop; exist_ok avoids a separate
# check-then-create step.
os.makedirs(filepath, exist_ok=True)
for numbr, frame in enumerate(dataframes, start=1):
    filename = os.path.join(filepath, str(numbr))
    frame.to_csv(filename + '.tsv', sep="\t")

## Copy identical columns from any file

we could also read it from the data, but if it's always the same, we can just define it here

In [None]:
# Folder that holds the numbered TSV files to be compared.
tsv_folder_made = '../open_work/internal_results/stitchy'
# NOTE(review): presumably returns the names of the columns whose content is
# identical across all TSV files in the folder -- confirm in cvasl.seperated.
identical_columns = sep.check_identical_columns(tsv_folder_made)

In [None]:
identical_columns

In [None]:
#sample_df.columns

In [None]:
stitched = sample_df[identical_columns].copy()

In [None]:
n_identical = stitched.shape[1]

In [None]:
stitched

### Here is where we add the different parts to stitched

In [None]:
# Add the columns of every dataframe that come after the first n_identical
# positions to the stitched frame.
# NOTE(review): this relies on the shared columns being the first n_identical
# columns of every dataframe, and on all frames having matching row indexes
# (the assignment aligns on index) -- confirm against the input files.
for df in dataframes:
    for col in df.columns[n_identical:]:
        # A name already added from an earlier dataframe is overwritten here.
        stitched[col] = df[col]

stitched.columns

In [None]:
len(stitched.columns)

## Here we should get rid of second visits, but what we see is that session was not in the common columns. We will get rid of all second time points and of participants whose ID ends in _2, and mention this to the scientists

In [None]:
#stitched.columns

In [None]:
# Keep everything except the first row of the stitched frame.
# NOTE(review): presumably the first row is not participant data (e.g. a
# units/description row) -- confirm against the raw TSV files.
stitched = stitched[1:]
#stitched

In [None]:
stitched['session'].unique()

In [None]:
stitched['LongitudinalTimePoint'].unique()

### Because they are all on the first visit and first session, we can write this into the patient ID later

## So in this dataset we have one longitudinal timepoint, and one type of session. no need to filter down away from this

In [None]:
stitched.columns

In [None]:
# Load the age/sex table with its first CSV column as the index, then expose
# that index as a string-valued 'renumber' column to merge on later.
sexage_path = os.path.join(root_directory, 'age_data', 'Age_sex_TOP.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)
sexage_df['renumber'] = sexage_df.index.to_series().apply(str)
sexage_df

## Here we take the patient ID and align it with our other frame's index

In [None]:
stitched = stitched.reset_index(drop=False)
stitched

In [None]:
len(stitched.participant_id.unique())

In [None]:
stitched['renumber'] = stitched['participant_id']

In [None]:
sexage_df = sexage_df.reset_index(drop=True)
sexage_df

In [None]:
#stitched.columns

In [None]:
# Join the stitched measurements with the age/sex table on 'renumber'.
# This is an inner join (the pandas default), so rows without a matching
# 'renumber' value in both tables are left out of the result.
result = pd.merge(stitched, sexage_df, how="inner", on="renumber")
result

In [None]:
result

In [None]:
# Guard against repeated column names: keep only the first occurrence of each.
is_repeated_name = result.columns.duplicated()
result = result.loc[:, ~is_repeated_name].copy()

In [None]:
result

## Conform file to new standard

### example of new standard (from M.D. on 23/08/2023)

In [None]:
standard_path = '../researcher_interface/sample_sep_values/showable_standard.csv'


In [None]:
standard = pd.read_csv(standard_path)
set_standard = set(standard.columns.to_list())

In [None]:
# Lower-case the result's column names before comparing them with the
# column names of the standard.
result.columns = result.columns.str.lower()
set_results = set(result.columns.to_list())

# Column names present in both the result and the standard.
z = set_results & set_standard

In [None]:
# # show values that are not as in template, may need recoding
# set_results - z

In [None]:
#standard.columns

In [None]:
#result.columns

In [None]:
# Move 'participant_id' to the first column position and append the fixed
# session/run suffix to every id.
first_column = result.pop('participant_id')
result.insert(0, 'participant_id', first_column + '_ses-1_run-1')

In [None]:
#standard.columns

In [None]:
# Copy 'session' into a 'session_id' column and place it second in the frame.
result['session_id'] = result['session']
result.insert(1, 'session_id', result.pop('session_id'))
# Every row belongs to the TOP site; the run id is taken from the
# longitudinal time point column.
result['site'] = "TOP"
result['run_id'] = result['longitudinaltimepoint']

In [None]:
#standard.columns

In [None]:
# Add the columns named by the new standard, in this order.
# Each entry is (standard column name, source column in `result`); a source
# of None means no source column is assigned here, and the column is filled
# with a real missing value (np.nan) instead of the text 'NaN', so it is
# numeric from the start.
# Fix: 'gmwm_icv_ratio' is taken from 'gmwm_icvratio'; it used to be a copy of
# the grey-matter-only ratio 'gm_icvratio'.
standard_column_sources = [
    ('gm_icv_ratio', 'gm_icvratio'),
    ('gmwm_icv_ratio', 'gmwm_icvratio'),
    ('cbf_gm_pvc0', 'totalgm_b'),
    ('cbf_gm_pvc2', None),
    ('cbf_wm_pvc0', 'deepwm_b'),
    ('cbf_wm_pvc2', None),
    ('cbf_aca_pvc0', 'aca_b'),
    ('cbf_mca_pvc0', 'mca_b'),
    ('cbf_pca_pvc0', 'pca_b'),
    ('cbf_aca_pvc2', None),
    ('cbf_mca_pvc2', None),
    ('cbf_pca_pvc2', None),
    ('cov_gm_pvc0', None),
    ('cov_gm_pvc2', None),
    ('cov_wm_pvc0', None),
    ('cov_wm_pvc2', None),
    ('cov_aca_pvc0', None),
    ('cov_mca_pvc0', None),
    ('cov_pca_pvc0', None),
    ('cov_aca_pvc2', None),
    ('cov_mca_pvc2', None),
    ('cov_pca_pvc2', None),
]
for standard_name, source_name in standard_column_sources:
    if source_name is None:
        result[standard_name] = np.nan
    else:
        result[standard_name] = result[source_name]


     

In [None]:
result.columns


In [None]:
# we added a bunch of columns 
len(result.columns)

In [None]:
# set_results= set(result.columns.to_list())

# z = set_results.intersection(set_standard) 
# new_result_columns = []
# for listable in z:
#     new_result_columns.append(listable)
# new_results = result[new_result_columns]
# new_results.columns

In [None]:
# standard.columns[:33]

In [None]:
# new_results = new_results[standard.columns[:33]]
# new_results

In [None]:
# Recode sex from 0/1 to 'F'/'M' in a new frame; values that are not keys of
# the mapping become NaN (Series.map behaviour).
sex_mapping = {0: 'F', 1: 'M'}
recoded_sex = result['sex'].map(sex_mapping)
results = result.assign(sex=recoded_sex)
results

## here need to reorder the columns again

In [None]:
standard.columns

In [None]:
results.columns

In [None]:
len(results.columns)

In [None]:
# Columns to move, listed in their target order; they end up at positions
# 2..12, directly after the first two columns.
leading_columns = [
    'run_id',
    'age',
    'sex',
    'site',
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_icv_ratio',
    'gmwm_icv_ratio',
    'wmh_vol',
    'wmh_count',
]
# The two ratio columns are re-inserted under an 'ivc' spelling.
# NOTE(review): 'ivc' vs 'icv' may be a typo or may be the spelling the
# standard expects -- confirm against the standard's column names.
reinserted_names = {
    'gm_icv_ratio': 'gm_ivc_ratio',
    'gmwm_icv_ratio': 'gmwm_ivc_ratio',
}

# Take all of them out first, so that insert positions are not shifted by
# columns that are still waiting to be moved.
popped_columns = {name: results.pop(name) for name in leading_columns}
for position, (name, column) in enumerate(popped_columns.items(), start=2):
    results.insert(position, reinserted_names.get(name, name), column)

results

In [None]:
# Remove these columns from the final table (raises if any of them is absent).
obsolete_columns = [
    'index',
    'subjectnlist',
    'session',
    'longitudinaltimepoint',
    'tp',
    'gmwm_icvratio',
    'gm_icvratio',
]
results = results.drop(columns=obsolete_columns)

In [None]:
# Print, for every column, the Python type of the value stored at row label 9.
for column, values in results.items():
    print(column, type(values[9]))

## now take last 20 columns, and put them after wmh count
not done yet

In [None]:
# From the seventh column onwards, convert every column to a numeric dtype;
# anything that cannot be parsed as a number becomes NaN.
for column in results.columns[6:]:
    coerced = pd.to_numeric(results[column], errors='coerce')
    results[column] = coerced


In [None]:
results

In [None]:

def highlight_cols(s):
    """
    Return the CSS declaration for a red cell background.

    :param s: cell value; not used, the same declaration is always returned
    :returns: the string 'background-color: red'
    :rtype: str
    """
    return 'background-color: {}'.format('red')




def find_outliers_by_list(dataframe, column_list):
    """
    Return the rows that are outliers in at least one of the listed columns.

    A row counts as an outlier for a column when its value lies more than two
    standard deviations away from that column's mean, in either direction.
    Rows are collected column by column, so a row that is an outlier in
    several columns appears once for each of them.

    :param dataframe: whole dataframe on dataset
    :type dataframe: ~pandas.DataFrame
    :param column_list: list of relevant columns
    :type column_list: list
    :returns: dataframe of outliers (empty when column_list is empty)
    :rtype: ~pandas.DataFrame
    """
    outlier_frames = []
    for column_n in column_list:
        values = dataframe[column_n]
        # Measure the distance from the mean, so that unusually low values
        # are caught as well as unusually high ones.
        distance_from_mean = (values - values.mean()).abs()
        outlier_frames.append(dataframe[distance_from_mean > 2 * values.std()])

    # pd.concat raises on an empty list; hand back an empty frame with the
    # same columns instead.
    if not outlier_frames:
        return dataframe.iloc[0:0]
    return pd.concat(outlier_frames)


def check_sex_dimorph_expectations(dataframe):
    """
    Compare mean grey and white matter volume between the sexes.

    Prints the ratio of female to male rows, then for each tissue type either
    a confirmation that the male mean is larger or a caution that it is not.

    :param dataframe: frame with a 'sex' column coded 'F'/'M' and with
        'gm_vol' and 'wm_vol' columns
    :type dataframe: ~pandas.DataFrame
    :returns: the input dataframe when the female mean is at least as large
        as the male mean for grey or white matter, otherwise 0
    """
    is_female = dataframe['sex'] == 'F'
    is_male = dataframe['sex'] == 'M'
    ladies = dataframe[is_female]
    men = dataframe[is_male]
    print('You have', len(ladies)/len(men), 'times as many ladies than men')

    female_gm = ladies.gm_vol.mean()
    male_gm = men.gm_vol.mean()
    if female_gm < male_gm:
        print('As expected men have larger grey matter')

    female_wm = ladies.wm_vol.mean()
    male_wm = men.wm_vol.mean()
    if female_wm < male_wm:
        print('As expected men have larger white matter')

    # ">=" is tested on its own rather than as "not <": the two differ when a
    # mean is NaN, in which case neither message is printed.
    gm_unexpected = female_gm >= male_gm
    wm_unexpected = female_wm >= male_wm
    if gm_unexpected:
        print(
            'Caution, average female grey matter may be \
                  at similar or larger size than men'
        )
    if wm_unexpected:
        print(
            'Caution, average female white matter may be \
                  at similar or larger size than men'
        )

    if gm_unexpected or wm_unexpected:
        return dataframe
    return 0

In [None]:
check_sex_dimorph_expectations(results)

In [None]:
len(results.columns.to_list())

In [None]:
# maybe examine graphs?
sep.relate_columns_graphs(results, 'age')

In [None]:
find_outliers_by_list(results, results.columns.to_list()[6:])

## Save off file

In [None]:
# Save the conformed table as CSV in the internal results folder, creating
# the folder first when it does not exist yet.
filepath = '../open_work/internal_results'
if not os.path.exists(filepath):
    os.makedirs(filepath)
filename = os.path.join(filepath, 'top_stitched_conformed.csv')
results.to_csv(filename)