# Assembling the TOP dataset

### Import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import glob
import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

## Read data into pandas dataframe

How do we define which files should be used together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will use the first option, approved by Mathijs on 26th June 2023.

In [None]:
# Name of the sub-folder, inside the configured raw-data directory, to read from.
experiment_folder = 'TOP'

In [None]:
# Look up the configured raw-data directory and report whether the
# experiment sub-folder is present in it.
config = Config()
root_directory = config.get_directory('raw_data')
experiment_path = os.path.join(root_directory, experiment_folder)
folder_message = (
    "this folder exists, we will take tsv from here"
    if os.path.isdir(experiment_path)
    else "this folder does not seem to exist, try typing again"
)
print(folder_message)

In [None]:
# Collect the full path of every .tsv file in the experiment folder.
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

# os.listdir returns entries in arbitrary order; sort them so that the
# position-based steps below (dataframes[2], the numbered copies written to
# disk) refer to the same file on every run and on every machine.
tsv_files = sorted(
    os.path.join(tsv_path, file)
    for file in os.listdir(tsv_path)
    if file.endswith('.tsv')
)
tsv_files

# TODO: add a step to get rid of second visits

In [None]:
# Load every TSV file into its own dataframe, keeping the order of tsv_files.
dataframes = []
for tsv_file in tsv_files:
    dataframes.append(pd.read_csv(tsv_file, sep='\t', header=[0]))

# The third file serves as the reference frame for the steps that follow.
sample_df = dataframes[2]
cols = sample_df.columns
sample_df

In [None]:
# Write each dataframe to its own numbered TSV file (1.tsv, 2.tsv, ...) in a
# single folder; that folder is later handed to sep.check_identical_columns.
filepath = '../open_work/internal_results/stitchy'
# Create the folder once, up front; exist_ok=True avoids the separate
# existence check and the check-then-create race that comes with it.
os.makedirs(filepath, exist_ok=True)
for numbr, frame in enumerate(dataframes, start=1):
    filename = os.path.join(filepath, str(numbr))
    frame.to_csv(filename + '.tsv', sep="\t")

## Copy identical columns from any file

We could also read it from the data, but if it is always the same, we can just define it here.

In [None]:
# Same folder the numbered TSV copies were written to.
tsv_folder_made = '../open_work/internal_results/stitchy'
# NOTE(review): presumably returns the columns that are identical in every
# file of the folder — confirm against cvasl.seperated.check_identical_columns.
identical_columns = sep.check_identical_columns(tsv_folder_made)

In [None]:
identical_columns

In [None]:
sample_df.columns

In [None]:
#identical_columns

In [None]:
# Start the stitched frame from an independent copy of the reference frame,
# restricted to what identical_columns selects, so later column assignments
# do not touch sample_df.
stitched = sample_df[identical_columns].copy()

In [None]:
# Number of columns the stitched frame starts out with.
n_identical = len(stitched.columns)

In [None]:
#stitched['renumber'] = stitched.index

In [None]:
#stitched.shape

In [None]:
#Here we need to add back in patient ID

### Here is where we add the different parts to stitched

In [None]:
# Append the file-specific columns of every dataframe to the stitched frame.
# The first n_identical columns of each file are skipped by position.
# NOTE(review): this assumes every file stores the shared columns first —
# confirm. A column name that occurs in more than one file is overwritten
# by the later file.
for frame in dataframes:
    extra_columns = frame.columns[n_identical:]
    for column_name in extra_columns:
        stitched[column_name] = frame[column_name]

stitched.columns

## Here we should get rid of second visits, but what we see is that session was not in the common columns. We will get rid of all second time points, and of people ending in _2, and mention this to the scientists.

In [None]:
#stitched

In [None]:
#stitched = stitched.loc[stitched[('LongitudinalTimePoint',           'integer')] == 'TimePoint_1']

In [None]:
stitched

In [None]:
stitched.columns

In [None]:
# Drop the first row of the stitched frame.
# NOTE(review): presumably that row holds a second header line (units/types)
# from the TSV files rather than a participant — confirm.
stitched = stitched.iloc[1:]

In [None]:
stitched['session'].unique()

In [None]:
#stitched.columns

In [None]:
stitched['LongitudinalTimePoint'].unique()

## So in this dataset we have one longitudinal timepoint and one type of session, so there is no need to filter anything out.

In [None]:
stitched.columns

In [None]:
# Load the age/sex table; the first CSV column becomes the index.
sexage_path = os.path.join(root_directory, 'age_data', 'Age_sex_TOP.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)

# Expose the index values as a string column named 'renumber', the key used
# for the merge with the stitched frame further down.
sexage_df['renumber'] = sexage_df.index
sexage_df['renumber'] = sexage_df['renumber'].map(str)

sexage_df

## Here we take the patient ID and align it with our other frame's index

In [None]:
# Move the current row index into an ordinary column (read back below as
# stitched['index']) and renumber the rows from zero.
stitched = stitched.reset_index()
stitched

In [None]:
#stitched = sep.concat_double_header(stitched)

In [None]:
stitched.renumber

In [None]:
#print(type(stitched['renumber'][9]))

In [None]:
# Cast the merge key to pandas' string dtype; the matching 'renumber' column
# of sexage_df is built from str values, so both sides then hold text.
stitched['renumber'] = stitched['renumber'].astype('string')

In [None]:
stitched['index']

In [None]:
# Replace the index with a fresh 0..n-1 range. The old index is discarded
# (drop=True); its values were already copied into the 'renumber' column.
sexage_df = sexage_df.reset_index(drop=True)
sexage_df

In [None]:
stitched.columns

In [None]:
# Join the stitched measurements with the age/sex table on the shared
# 'renumber' key. The join is an inner one (the pandas default), so rows
# whose key is missing from either table do not appear in the result.
result = pd.merge(stitched, sexage_df, how="inner", on="renumber")
result

In [None]:
result

In [None]:
result

In [None]:
result.columns

In [None]:
# Keep only the first occurrence of every column name and detach the result
# from the merged frame with an explicit copy.
is_repeated_name = result.columns.duplicated()
result = result.loc[:, ~is_repeated_name].copy()

## Conform file to new standard

### example of new standard (from M.D. on 23/08/2023)

In [None]:
# Sub-folder of the raw-data directory that holds the example of the new standard.
standard_folder = 'standard'

In [None]:
# Full path of the example file that shows the new standard layout.
standard_filename = 'participants_CBA.csv'
standard_path = os.path.join(root_directory, standard_folder, standard_filename)

In [None]:
# Load the example of the new standard so its layout can be inspected.
standard = pd.read_csv(standard_path)
standard

In [None]:
# Rename every column to its lower-case form.
# NOTE(review): names that differ only in case would collide after this
# step — confirm none exist.
result.columns = result.columns.map(str.lower)
result

## Save off file

In [None]:
# Write the merged, conformed table to the internal results folder as CSV.
filepath = '../open_work/internal_results'
filename = os.path.join(filepath, 'top_stitched_conformed.csv')
# exist_ok=True creates the folder when it is missing without a separate
# existence check, which could race with another process.
os.makedirs(filepath, exist_ok=True)
result.to_csv(filename)