# Assembling the Insight 46 dataset

### Import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config


## Read data into pandas dataframe

How do we define which files should be used together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will use the first option.

In [None]:
# Name of the sub-folder that holds this experiment's files.
experiment_folder = "Insight46"

In [None]:
# Ask the project Config for its 'raw_data' directory and report whether the
# experiment folder can be found inside it. Nothing is raised here; the cell
# only prints a hint for the person running the notebook.
config = Config()
root_directory = config.get_directory('raw_data')
experiment_path = os.path.join(root_directory, experiment_folder)
if not os.path.isdir(experiment_path):
    print("this folder does not seem to exist, try typing again")
else:
    print("this folder exists, we will take tsv from here")

In [None]:
# Collect the full path of every .tsv file that sits directly in the
# experiment folder.
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

# os.listdir returns entries in arbitrary order. Sorting makes the file order
# (and therefore which file ends up first in the list, and the order in which
# columns are stitched later on) the same on every run and every machine.
tsv_files = sorted(
    os.path.join(tsv_path, file_name)
    for file_name in os.listdir(tsv_path)
    if file_name.endswith('.tsv')
)
tsv_files

In [None]:
# Read every TSV file into its own dataframe. The first two rows of each file
# form a two-level column header and the first column becomes the row index.
dataframes = [
    pd.read_csv(tsv_file, sep='\t', header=[0, 1], index_col=0)
    for tsv_file in tsv_files
]

# Fail with an explicit message instead of a bare IndexError on the next line
# when the folder contained no .tsv files at all.
if not dataframes:
    raise FileNotFoundError(
        "no .tsv files were found in the experiment folder; nothing to read"
    )

# The first dataframe serves as the sample whose columns are inspected below.
sample_df = dataframes[0]
cols = sample_df.columns
sample_df

## Copy identical columns from any file

We could also determine the number of identical columns from the data itself, but if it is always the same, we can simply define it here.

In [None]:
# Number of leading columns that are assumed to be the same in every file.
# NOTE(review): the original note says "columns A - L", which is 12 letters;
# presumably column A is the one consumed as the row index when the files are
# read, leaving 11 -- confirm.
n_identical = 11

# Start the combined table from a copy of those shared columns, taken from the
# sample dataframe.
shared_labels = cols[:n_identical]
stitched = sample_df[shared_labels].copy()

# Keep the row index available as an ordinary column as well.
stitched["renumber"] = stitched.index

# Caution: until there is a check that these really are the identical columns, scientists need to confirm that the above steps are legitimate.

In [None]:
# Append the file-specific columns (everything after the shared leading
# columns) of every dataframe to the combined table. Values are aligned on the
# row index; a column label that occurs in more than one file is overwritten
# by the file processed last.
for frame in dataframes:
    file_specific = frame.columns[n_identical:]
    for label in file_specific:
        stitched[label] = frame[label]

stitched.columns

# Here we note that we have 391 columns. A hand check shows this is plausible: there are many brain areas, each with left, right and both. This still needs discussion with the scientists.

In [None]:
# Load the age/sex table; its first column becomes the row index.
sexage_path = os.path.join(root_directory, 'age_data', 'Age_sex_Insight46.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)

# Store the index values, converted to strings, in a 'renumber' column.
index_as_text = sexage_df.index.to_series().apply(str)
sexage_df['renumber'] = index_as_text

# NOTE(review): original author note -- "tp 2 then + '2_1' if 1 then 1_1".
# Presumably this describes the suffix to use per time point; confirm.
sexage_df

# Here we assume that we only have time point 1 on this dataset. This must be checked with scientists

In [None]:
# Wrap every identifier as "sub_<id>_1": prefix first, then the suffix.
sexage_df['renumber'] = sexage_df['renumber'].radd("sub_").add("_1")
sexage_df

In [None]:
# Replace both row indexes with a plain 0..n-1 range so the two tables line up
# purely by row position, then place them side by side, keeping only the row
# positions present in both.
# NOTE(review): rows are paired by position, not by the 'renumber' column that
# both tables carry. This presumably assumes both sources list the same
# subjects in the same order -- confirm, or merge on 'renumber' instead.
stitched, sexage_df = (
    frame.reset_index(drop=True) for frame in (stitched, sexage_df)
)
result = pd.concat(objs=[stitched, sexage_df], axis="columns", join="inner")
result

In [None]:
# Flatten the column labels by keeping only element 0 of each label. For tuple
# labels (the two header rows read from the TSV files) that is the top-level
# name.
# NOTE(review): a label that is a plain string rather than a tuple is cut down
# to its first character by the same indexing. Presumably that is why a later
# cell renames "a" and "s" -- confirm this is intended.
top_level_labels = [label[0] for label in result.columns]
result.columns = top_level_labels
result.columns

In [None]:
# Print every column label on its own line so the full list can be inspected
# without truncation.
for column_label in result.columns:
    print(column_label)

In [None]:
# Give the one-letter columns readable names.
# NOTE(review): "a" and "s" are presumably the age and sex columns after the
# column labels were shortened in an earlier cell -- confirm.
readable_names = {"a": "Age", "s": "Sex"}
result = result.rename(columns=readable_names)


## Save off file

In [None]:
# Write the stitched table to CSV. The path is relative to the directory the
# notebook is run from, and the row index is written as the first column.
filepath = '../open_work/internal_results/Insight46_stitched.csv'
result.to_csv(path_or_buf=filepath)

In [None]:
# Display the column labels of the saved table as a final check.
result.columns