# Assembling the StrokeMRI dataset

In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config


## Read data into pandas dataframe

How do we define which files should be stitched together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will use the first option.

In [None]:
# Name of the sub-folder, inside the configured raw-data directory, whose .tsv files are stitched together
experiment_folder= 'StrokeMRI'

In [None]:
# Look up the raw-data root from the project configuration and report
# whether the experiment folder is present beneath it.
config = Config()
root_directory = config.get_directory('raw_data')
if not os.path.isdir(os.path.join(root_directory, experiment_folder)):
    print("this folder does not seem to exist, try typing again")
else:
    print("this folder exists, we will take tsv from here")

In [None]:
# Collect the full path of every .tsv file in the experiment folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the file order (and therefore which file is read first and the order
# in which columns get stitched) reproducible between runs and machines.
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

tsv_files = [
    os.path.join(tsv_path, file_name)
    for file_name in sorted(os.listdir(tsv_path))
    if file_name.endswith('.tsv')
]
tsv_files

In [None]:
# Parse every TSV as tab-separated text with a two-row header; the first
# column of each file becomes the row index.
dataframes = [
    pd.read_csv(tsv_file, sep='\t', header=[0, 1], index_col=0)
    for tsv_file in tsv_files
]
# The first file read serves as the reference for the shared columns
sample_df = dataframes[0]
cols = sample_df.columns
sample_df

## Copy identical columns from any file

We could also derive this from the data, but if it is always the same, we can simply define it here.

In [None]:
# how many identical columns are there in the files?
# NOTE(review): the inline remark "columns A - L" spans 12 spreadsheet columns,
# while the value is 11; presumably column A is the one consumed as the row
# index when the files are read — confirm.
n_identical = 11  # columns A - L

# Start the stitched table from a copy of the leading (shared) columns of the reference file
stitched = sample_df[cols[:n_identical]].copy()
# Keep the row labels available as a regular column as well
stitched['renumber'] = stitched.index


# Caution: until there is a check that these columns really are identical across files, scientists need to confirm that the above steps are legitimate.

In [None]:
# Would be nice to add a test here to double-check that the identical columns are actually identical

## Add unique columns from files

In [None]:
# Copy every column beyond the shared leading ones from each file into `stitched`.
# NOTE(review): a column label that occurs in more than one file is overwritten
# by the file processed later, and the assignment presumably relies on every
# file having the same row labels as `stitched` — confirm.
for df in dataframes:
    for col in df.columns[n_identical:]:
        stitched[col] = df[col]

# Display the resulting column labels
stitched.columns

## Add sex and age data

In [None]:
#sexage_df


In [None]:
# Load the age/sex table; its first column becomes the row index
sexage_path = os.path.join(root_directory, 'age_data', 'Age_Sex_StrokeMRI.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)
# Store the row labels, converted to text, in a 'renumber' column
sexage_df['renumber'] = sexage_df.index.map(str)

#sexage_df['renumber']
# if TP is 2, append '02_1' to the ID; if TP is 1, append '01_1'

In [None]:
# Translate the time point (TP) into its label suffix, then append that suffix
# to 'renumber'. Rows whose TP is neither 1 nor 2 get no suffix (NaN).
sexage_df['add_column'] = sexage_df['TP'].map({1: "01_1", 2: "02_1"})
sexage_df['renumber'] += sexage_df['add_column']
sexage_df

Now we need to reformat the participant ID

In [None]:
# Replace both row indexes with a fresh 0..n-1 range so the two tables can be
# placed side by side.
# NOTE(review): rows are paired purely by position here, not by the 'renumber'
# label — confirm that both tables list participants in the same order.
stitched, sexage_df = (
    table.reset_index(drop=True) for table in (stitched, sexage_df)
)
result = pd.concat([stitched, sexage_df], axis="columns", join="inner")
result

In [None]:
# for col in sexage_df:
#     stitched[col] = sexage_df[col]

# Flatten the column labels to a single level: two-level (tuple) labels keep
# only their first element. Plain string labels — presumably the ones that
# come from the single-header age/sex table — are kept whole, because
# indexing a string with [0] would cut it down to its first character.
result.columns = [c[0] if isinstance(c, tuple) else c for c in result.columns]
result.columns

## save off file

In [None]:
 
# Write the combined table to a CSV file (the path is relative to the current working directory)
filepath = '../open_work/internal_results/StrokeMRI_stitched.csv' 
result.to_csv(filepath)  

# Look at columns

In [None]:
# Show the column labels of the combined table
result.columns

In [None]:
def concat_double_header(dataframe_dub):
    """Return a copy of a two-level-header dataframe with flat column names.

    Each new column name is the first header level, an underscore, and the
    second header level. The input dataframe is left unchanged.

    :param dataframe_dub: dataframe whose column labels have two string levels
    :returns: copy of the dataframe with single-level column names
    """
    flattened = dataframe_dub.copy()
    joined_names = []
    for header in flattened.columns:
        joined_names.append(header[0] + "_" + header[1])
    flattened.columns = joined_names
    return flattened

In [None]:
# Flatten the two-level headers of the stitched table and show the new column names
lo = concat_double_header(stitched)
lo.columns

In [None]:
# `dataframe` is not defined anywhere at notebook level (it is only a local
# name inside concat_double_header), so the original call raised NameError.
# NOTE(review): `lo`, the flattened table built from `stitched`, is presumably
# the intended input since its column names use the "<level0>_<level1>" form
# that the 'Age_' prefix suggests — confirm.
sep.relate_columns_graphs(lo, 'Age_')

In [None]:
# Load a results table from disk for inspection.
# NOTE(review): 'top_stitched.csv' is not written anywhere in this notebook — confirm where it comes from.
topper =pd.read_csv('../open_work/internal_results/top_stitched.csv')
topper