# Assembling the Insight 46 dataset

### Import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import glob
import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config


## Read data into pandas dataframe

How do we define which files should be used together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will use the first option, approved by Mathijs on 26th June 2023

In [None]:
experiment_folder= 'Insight46'

In [None]:
# Look up the configured raw-data root and report whether the experiment
# folder named above is present underneath it.
config = Config()
root_directory = config.get_directory('raw_data')
if not os.path.isdir(os.path.join(root_directory, experiment_folder)):
    print("this folder does not seem to exist, try typing again")
else:
    print("this folder exists, we will take tsv from here")

In [None]:
# Collect the full path of every .tsv file in the experiment folder.
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

# os.listdir returns entries in arbitrary order; sort them so that the file
# picked up later as the "first" dataframe is the same on every run/machine.
tsv_files = sorted(
    os.path.join(tsv_path, file)
    for file in os.listdir(tsv_path)
    if file.endswith('.tsv')
)
tsv_files

# Add a step to get rid of second visits #TODO

In [None]:
# Load each TSV into its own dataframe: tab-separated, the first two rows
# form a two-level column header, and the first column becomes the index.
dataframes = [
    pd.read_csv(tsv_file, sep='\t', header=[0, 1], index_col=0)
    for tsv_file in tsv_files
]
# Keep the first frame (and its column labels) at hand for inspection.
sample_df = dataframes[0]
cols = dataframes[0].columns
sample_df

## Copy identical columns from any file

we could also read it from the data, but if it's always the same, we can just define it here

In [None]:
# new method
## will be recoded with function call from main branch

def check_identical_columns(tsv_path: str) -> list:
    """
    Return the columns that are exactly duplicated across all TSV files.

    Every ``*.tsv`` file directly inside ``tsv_path`` is read with a two-row
    column header and its first column as index. A column is reported when
    its label is present in every file and its content is equal (as judged
    by ``pandas.Series.equals``) in every file.

    :param tsv_path: path to the folder holding the TSV files
    :returns: list of column labels, in the order in which they appear in
        the first file (files are taken in sorted path order); an empty
        list when the folder contains no TSV files
    """
    # glob gives no ordering guarantee; sort so the reference file is stable.
    tsv_files = sorted(glob.glob(os.path.join(tsv_path, '*.tsv')))
    if not tsv_files:
        return []

    dataframes = [
        pd.read_csv(file, sep='\t', header=[0, 1], index_col=0)
        for file in tsv_files
    ]
    key_df, *rest_dfs = dataframes

    # Labels that occur in every file.
    shared_columns = set(key_df.columns).intersection(
        *(frame.columns for frame in rest_dfs)
    )

    # Walk the reference file's own column order (not the set) so the result
    # is reproducible; dict.fromkeys drops repeated labels, keeping order.
    return [
        column
        for column in dict.fromkeys(key_df.columns)
        if column in shared_columns
        and all(frame[column].equals(key_df[column]) for frame in rest_dfs)
    ]

identical_columns = check_identical_columns(tsv_path)

In [None]:
identical_columns

In [None]:
stitched = sample_df[identical_columns].copy()

In [None]:
n_identical = stitched.shape[1]

In [None]:
stitched['renumber'] = stitched.index

In [None]:
#stitched.shape

### Here is where we add the different parts to stitched

In [None]:
# Add every column that is not one of the identical (shared) columns to the
# stitched frame. Columns are selected by label rather than by position, so
# this does not rely on the identical columns being the leading columns of
# every file. Assignment aligns on the row index; when several files carry
# the same label, the last file read wins.
_identical_labels = set(identical_columns)
for df in dataframes:
    for col in df.columns:
        if col not in _identical_labels:
            stitched[col] = df[col]

stitched.columns

# Here we note that we have 391 columns... a hand check shows this is possible: lots of brain areas (left, right and both). But this needs discussion with the scientists.

## Here we should get rid of second visits, but we see that session was not among the common columns. We will remove all second time points and all participants whose ID ends in _2, and mention this to the scientists.

In [None]:
#stitched

In [None]:
stitched = stitched.loc[stitched[('LongitudinalTimePoint',           'integer')] == 'TimePoint_1']

In [None]:
stitched[(              'session',               '...')].unique()

In [None]:
## In this dataset we see that there were no double sessions. We got lucky.

In [None]:
stitched.columns

In [None]:
# Load the age/sex table (first CSV column as index) and keep a string copy
# of that index in a 'renumber' column.
sexage_path = os.path.join(root_directory, 'age_data', 'Age_sex_Insight46.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)
sexage_df['renumber'] = sexage_df.index.map(str)

#sexage_df['renumber']
# tp 2 then + '2_1' if 1 then 1_1
sexage_df

## Here we take the patient ID and align it with our other frame's index by adding the prefix "sub_" and the suffix "_1"

In [None]:
sexage_df['renumber'] = "sub_" +sexage_df['renumber'] + "_1"
sexage_df

In [None]:
sexage_df =  sep.recode_sex(sexage_df, 'sex')
sexage_df['sex'] = sexage_df['sex_encoded']
sexage_df

In [None]:
#len(sexage_df.renumber.unique())

In [None]:
#len(sexage_df.renumber)

# In the brain-imaging frame, a patient ID may end in _1 or _2. The age/sex data, however, contains only one row per patient, so for Insight46 we only deal with first visits.

In [None]:
# Join the imaging columns with the age/sex columns on participant ID.
# Resetting both indexes and concatenating column-wise would pair rows purely
# by position, which silently attaches age/sex to the wrong participant
# whenever the two tables differ in row order or in which participants they
# contain. The 'renumber' column of each frame holds the ID, so use it as the
# alignment key instead.
stitched_ids = stitched['renumber']
if isinstance(stitched_ids, pd.DataFrame):
    # With two-level column labels the lookup can come back as a one-column
    # frame; reduce it to a Series.
    stitched_ids = stitched_ids.iloc[:, 0]

imaging_by_id = stitched.copy()
imaging_by_id.index = stitched_ids.astype(str).to_numpy()
sexage_by_id = sexage_df.copy()
sexage_by_id.index = sexage_df['renumber'].astype(str).to_numpy()

if imaging_by_id.index.intersection(sexage_by_id.index).empty:
    # Fail loudly rather than write an empty or misaligned dataset.
    raise ValueError(
        "no participant IDs in common between the imaging data and the "
        "age/sex data; check the format of the 'renumber' columns"
    )

# join="inner" keeps only the participants present in both tables.
result = pd.concat([imaging_by_id, sexage_by_id], axis=1, join="inner")
result = result.reset_index(drop=True)

# Leave both input frames with a plain positional index, as before.
stitched = stitched.reset_index(drop=True)
sexage_df = sexage_df.reset_index(drop=True)
result

In [None]:
# Flatten the column labels by keeping only element 0 of each label. For a
# tuple label (two-level header) that is the top-level name; for a plain
# string label it is the string's first character.
# NOTE(review): the labels contributed by sexage_df look like plain strings,
# so they would be cut down to a single letter here — confirm this is
# intended before relying on the resulting names.
result.columns = [c[0]  for c in result.columns]
result.columns

In [None]:
for n in result.columns:
    print(n)

In [None]:
# Give the single-letter labels "a" and "s" descriptive names.
# NOTE(review): these one-letter labels presumably result from the c[0]
# flattening of plain-string column names in the earlier cell — confirm.
result = result.rename(columns={"a": "Age", "s": "Sex"})


In [None]:
result = result.loc[:,~result.columns.duplicated()].copy()

## Save off file

In [None]:
result.columns

In [None]:

# Write the stitched dataset to the internal results folder.
filepath = '../open_work/internal_results'
filename = os.path.join(filepath, 'Insight46_stitched.csv')
# Create the folder (and any missing parents) if needed; exist_ok=True makes
# this a no-op when it already exists and avoids a check-then-create race.
os.makedirs(filepath, exist_ok=True)
result.to_csv(filename)

In [None]:
result

In [None]:
result.LongitudinalTimePoint.unique()