# Assembling the TOP dataset

In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import seaborn
import glob

import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

## Read data into pandas dataframe

How do we define which files should be stitched together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we use the first option: all `.tsv` files in the folder.

In [None]:
# Identify files
experiment_folder = 'TOP'

# Resolve the raw-data root through the project configuration and point at
# the folder of the experiment we want to assemble.
config = Config()
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

# os.listdir returns entries in arbitrary order; sort so that the file order
# (and therefore which file ends up first in the list) is reproducible.
tsv_files = sorted(
    os.path.join(tsv_path, file_name)
    for file_name in os.listdir(tsv_path)
    if file_name.endswith('.tsv')
)
tsv_files


In [None]:
# Load every TSV file as a dataframe: tab separated, the first two rows form
# a two-level column header and the first column becomes the row index.
dataframes = [
    pd.read_csv(tsv_file, sep='\t', header=[0, 1], index_col=0)
    for tsv_file in tsv_files
]

# Keep the first frame (and its column labels) around as a reference sample.
sample_df = dataframes[0]
cols = sample_df.columns
sample_df


## Copy identical columns from any file

We could also read this from the data, but if it is always the same, we can just define it here.

In [None]:
# new method
## will be recoded with function call from main branch

def check_identical_columns(tsv_path):
    """
    Return the columns that are exactly duplicated across all TSV files
    in a folder, both in name and in values.

    Every ``*.tsv`` file directly inside ``tsv_path`` is read with a two-row
    column header and its first column as index. A column is reported only
    when its label occurs in every file and its content in every file equals
    (``equals``) the content in the first file.

    :param tsv_path: path to the folder that holds the TSV files
    :type tsv_path: str

    :returns: labels of the identical columns, in the order in which they
        appear in the first file (files are taken in sorted path order)
    :rtype: list
    :raises ValueError: if the folder contains no ``.tsv`` files
    """
    # glob yields paths in arbitrary order; sort so the reference file (and
    # with it the order of the result) is the same on every run.
    tsv_files = sorted(glob.glob(os.path.join(tsv_path, '*.tsv')))
    if not tsv_files:
        raise ValueError(f"no .tsv files found in {tsv_path!r}")

    dataframes = [
        pd.read_csv(file, sep='\t', header=[0, 1], index_col=0)
        for file in tsv_files
    ]
    key_df, *rest_dfs = dataframes

    # labels that are present in every file
    shared_columns = set(key_df.columns)
    for frame in rest_dfs:
        shared_columns.intersection_update(frame.columns)

    # Walk the labels in the order of the first file instead of iterating the
    # set, whose order is not stable; dict.fromkeys drops repeated labels
    # while keeping that order.
    return [
        column
        for column in dict.fromkeys(key_df.columns)
        if column in shared_columns
        and all(frame[column].equals(key_df[column]) for frame in rest_dfs)
    ]

# Collect the column labels that are identical (name and values) in all TSV files of tsv_path.
identical_columns = check_identical_columns(tsv_path)

In [None]:
# Show the labels of the columns found to be identical across files.
identical_columns

In [None]:
# Start the stitched table from a copy of the identical columns of the first loaded file.
stitched = sample_df[identical_columns].copy()

In [None]:
# Number of identical columns, i.e. the column count of the freshly built stitched table.
n_identical = len(stitched.columns)

In [None]:
# Show how many identical columns were found.
n_identical

In [None]:
# stitched = sample_df[cols[:n_identical]].copy()
# stitched


In [None]:
# (Re)build the stitched table from the identical columns of the first loaded
# file, so the stitching below starts from a clean copy.
stitched = sample_df[identical_columns].copy()
stitched

### Note these notebooks assume identical columns are all adjacent.

## Add unique columns from files

In [None]:
# Add every column that is not one of the identical columns. Columns are
# selected by label rather than by position, so the identical columns do not
# have to be the leading columns of each file. If several files carry the
# same non-identical label, the last file in `dataframes` wins.
identical_labels = set(identical_columns)
for df in dataframes:
    for col in df.columns:
        if col not in identical_labels:
            stitched[col] = df[col]

stitched

### Check that each participant did only one session per visit

In [None]:
# Distinct values found in the ('session', '...') column of the stitched table.
stitched[('session', '...')].unique()

## Add sex and age data

In [None]:
# Turn the row index (read from the first column of the TSV files) into an
# ordinary column so it can serve as a key later on.
stitched = stitched.reset_index()

In [None]:
# NOTE(review): concat_double_header presumably flattens the two-level column
# header into single labels joined by '_' (labels such as 'index_' and
# 'session_...' are used afterwards) — confirm in cvasl.seperated.
stitched = sep.concat_double_header(stitched)
# Expose the former row index under the name 'ID'.
stitched['ID']  = stitched['index_']

In [None]:
# Load the age/sex table for TOP from the 'age_data' folder under the raw-data root.
sexage_path = os.path.join(root_directory, 'age_data', 'Age_Sex_TOP.csv')
sexage_df = pd.read_csv(sexage_path)
sexage_df

In [None]:
# Inspect the column labels of the age/sex table.
sexage_df.columns

In [None]:
# Join the age/sex table with the stitched data on 'ID'. This is an inner
# join, so only IDs present in both tables are kept.
result = sexage_df.merge(stitched, how='inner', on='ID')

In [None]:
# for col in sexage_df[2:]:
#     stitched[col] = sexage_df[col]

In [None]:
# Show the merged table.
result

In [None]:
# Distinct session values in the merged table (flattened column label).
result['session_...'].unique()

In [None]:
# We got lucky: everyone had a single session. Check the distinct
# longitudinal time points as well.
result['LongitudinalTimePoint_integer'].unique()

## Save the stitched file

In [None]:
 
filepath = '../open_work/internal_results/top_stitched.csv'
# to_csv does not create missing parent folders, so make sure the output
# folder exists before writing.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
result.to_csv(filepath)