# Exploring TSV files

### Imports

In [None]:
import os       # using operating system dependent functionality (folders)
import glob
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
# import copy     # Can Copy and Deepcopy files so original file is untouched.
# from ipywidgets import IntSlider, Output
import ipywidgets as widgets
# from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.insert(0, '../') # path to functions
from cvasl import file_handler as fh # 
from cvasl import mold #
from cvasl import carve
from cvasl.file_handler import Config

### Configure data

In [None]:
# Load the project configuration (called with no arguments; where the
# configuration is looked up is not visible in this notebook).
config = Config.from_file()
# Directory registered under the 'raw_data' key; it is the root of the TSV search below.
root_mri_directory = config.get_directory('raw_data')

### Load tsv files

In [None]:
# Recursively collect every .tsv file under the raw-data root.
# NOTE(review): glob does not guarantee any ordering of its results, yet later
# cells index this list by position (tsv_files[0], tsv_files[1], name_file[:-1])
# — confirm the order is stable on the machine where this runs.
tsv_pattern = os.path.join(root_mri_directory, '**/*.tsv')
tsv_files = glob.glob(tsv_pattern, recursive=True)

### Check tsv files
Optional: these cells can be commented out.

In [None]:
# Read the first discovered TSV, using its first two rows as a two-level column header.
dataframe_example = pd.read_csv(tsv_files[0], sep='\t', header=[0,1])

In [None]:
# Display the loaded table.
dataframe_example

In [None]:
# Display the two-level column index.
dataframe_example.columns

In [None]:
## check tsv file diversity
#tsv_files

### Adding age to analysis

Now we want to take the csv file with age and add it to the analysis.
For now we will hard-code the path, until we decide whether this should be part of the initialized setup on the config file

In [None]:
# Hard-coded, machine-specific location of the age CSV files.
age_csv_place = "C:/Projects/brainspin/age_data"
# Recursively collect every .csv file below that folder.
age_csv_pattern = os.path.join(age_csv_place, '**/*.csv')
age_csv_files = glob.glob(age_csv_pattern, recursive=True)
# NOTE(review): the three reads below need at least three CSV files to be found,
# and the renaming in the next cell depends on which file lands at which
# position; glob does not guarantee an order — confirm.
age_dataframe_example1 = pd.read_csv(age_csv_files[0])
age_dataframe_example2 = pd.read_csv(age_csv_files[1])
age_dataframe_example3 = pd.read_csv(age_csv_files[2])

In [None]:
# superset of ages....must normalize column names and values
age_dataframe_example1 = age_dataframe_example1.rename(columns={"ageatscandate_i46p1": "age","participant ID": "participant_id"})
age_dataframe_example2 = age_dataframe_example2.rename(columns={"Age": "age","ID": "participant_id", "Sex":"sex"})
age_dataframe_example3 = age_dataframe_example3.rename(columns={"Age": "age","ID": "participant_id", "Sex":"sex"})
age_dataframe_example3 = age_dataframe_example3.drop("TP", axis=1)

In [None]:
# stack on top of each otehr
frames = [age_dataframe_example1, age_dataframe_example2, age_dataframe_example3]
super_age_set = pd.concat(frames)
super_age_set

### Analysis

Without the subject ages we cannot analyse anything except how subjects progress over time points and how the various parameters predict each other. But let's scan all the tsv files we were given to see if any of them include age.

#### Correlations within datasets:

So now we can make a super-dataset of all the datasets, and see if these correlations hold. 

In [None]:
# Read the second discovered TSV with a single header row (pandas default).
# NOTE(review): the first example was read with header=[0,1]; here only one
# header row is used, so a second header row, if present, ends up as data —
# confirm this difference is intended.
dataframe_example2 = pd.read_csv(tsv_files[1], sep='\t')
# Show its column names as a plain list.
dataframe_example2.columns.to_list()

More elements than the first... let's see what the two tsv files have in common:

In [None]:
# Column labels of the two example tables as sets, ready for set comparisons.
# NOTE(review): if dataframe_example still holds the table read with
# header=[0,1], its labels are tuples while those of dataframe_example2 are
# plain strings, so the two sets are unlikely to share any element — confirm.
first = set(dataframe_example.columns.to_list())
second = set(dataframe_example2.columns.to_list())

In [None]:
# not_common2 =  list(set(dataframe_example2.columns.to_list()) - set(dataframe_example.columns.to_list()))

In [None]:
# shared = list(first.intersection(second))
# shared

Not a lot...let's look at what we have in common in all or most of the tsv

In [None]:
# Record, for every TSV found, its path and how many columns it has.
name_file, longlesses = [], []
for file in tsv_files:
    dataframe_example = pd.read_csv(file, sep='\t')
    name_file.append(file)
    longlesses.append(dataframe_example.shape[1])
# Two-row table with one column per file: row 0 holds the paths,
# row 1 holds the column counts.
data_tsv = pd.DataFrame([name_file, longlesses])
print(longlesses)

In [None]:
# Display the per-file table of paths and column counts.
data_tsv

So, as a guess, there may be about 15 features common to most of the files. We need to exclude the last file on the list.

In [None]:
#print(name_file[:-1])

In [None]:
# Every collected path except the last one.
set_of_relevant_files = name_file[:-1]

In [None]:
# For every relevant TSV, record its path, its column count, and which of
# its columns also occur in `second` (the column-name set built earlier
# from the second example TSV).
name_file = []
longlesses = []
intersections = []
len_intersections = []
for file in set_of_relevant_files:
    # Read each file once and derive both the column count and the
    # column names from that single read.
    dataframe_example = pd.read_csv(file, sep='\t')
    columns = dataframe_example.columns.to_list()
    intersection = set(columns).intersection(second)
    name_file.append(file)
    longlesses.append(len(columns))
    intersections.append(intersection)
    len_intersections.append(len(intersection))
# One column per file; rows are: 0 = path, 1 = column count,
# 2 = size of the overlap with `second`, 3 = the overlapping names (a set).
data_tsv = pd.DataFrame([name_file, longlesses, len_intersections, intersections])

In [None]:
# Display the per-file table, now including the overlap rows.
data_tsv

So we will have twelve or thirteen common elements we can compare. Let's look at them and hope they are about the same in each file.

In [None]:
# data_tsv[0][3]

In [None]:
#data_tsv[2][3]

In [None]:
#data_tsv[4][3]

In [None]:
#data_tsv[4][3].intersection(data_tsv[2][3]).intersection(data_tsv[0][3]).intersection(data_tsv[5][3]).intersection(data_tsv[3][3]).intersection(data_tsv[6][3]).intersection(data_tsv[7][3])

OK, so more or less we should have the above values in every group of tsv in our supergroup.
Let's check if we do

In [None]:
# Column names present in the overlap sets (row 3 of data_tsv) of the files
# at positions 0, 2, 3, 4, 5, 6 and 7; position 1 is not part of the chain.
# Despite its name, the result is a set.
selected_file_positions = (0, 2, 3, 4, 5, 6, 7)
list_elements = set.intersection(
    *(data_tsv[position][3] for position in selected_file_positions)
)

In [None]:
#list_elements

In [None]:
# Keep every TSV whose columns include all of the shared column names.
name_file = []
longlesses = []
good_files = []
required_columns = set(list_elements)
for file in tsv_files:
    dataframe_example = pd.read_csv(file, sep='\t')
    if required_columns <= set(dataframe_example.columns):
        good_files.append(file)
# print(good_files)        

In [None]:
# Number of files that contain every shared column.
len(good_files)

In this set, the common columns are approximately:
 'CSF_vol',
 'GMWM_ICVRatio',
 'GM_ICVRatio',
 'GM_vol',
 'LongitudinalTimePoint',
 'MeanMotion',
 'Site',
 'SubjectNList',
 'WMH_count',
 'WMH_vol',
 'WM_vol',
 'participant_id',
 'session'

However, we need something that extracts the common set from any group of tsv columns.

In [None]:
# Ask the cvasl helper for the column information of the selected files.
# NOTE(review): the return type is not visible here; it is unpacked with *
# in the next cell and passed to list() further down — confirm what it holds.
a = fh.extract_common_columns(good_files)

In [None]:
# Pass the unpacked elements of `a` to the cvasl intersection helper.
# NOTE(review): good_columns_sets is not referenced again in the cells
# visible here — confirm whether it was meant to feed good_columns_list.
good_columns_sets = fh.intersect_all(*a)

Now we make our super tsv file

In [None]:
# List used below to select columns from each table.
# NOTE(review): this is built from `a`, not from good_columns_sets computed
# just before — confirm which of the two is intended.
good_columns_list = list(a)
# Display the list.
good_columns_list

Here we see that every file should have participant_id. 

In [None]:
# Re-read each selected file with its two-row header and keep only the
# columns named in good_columns_list.
tabs_together = []
for file in good_files:
    tabs_together.append(
        pd.read_csv(file, sep='\t', header=[0, 1])[good_columns_list]
    )

In [None]:
# Number of trimmed tables collected.
len(tabs_together)

Now, if the tsv files were different we could stack 8 tabular elements and make a supercomparator, but we seem to have the same tsv over and over, because the first ten with the same columns are identical... weird; check with the scientists. Looking at the names, they all came from the same day. Let's check all dates.

So we have three kinds, duplicated many times over; we must discuss this with the scientists. Until then, let's reduce and combine.

In [None]:
# Column index of the first trimmed table.
tabs_together[0].columns

In [None]:
# Number of tables before de-duplication.
len(tabs_together)

In [None]:
# Presumably removes duplicated tables via the cvasl helper.
# NOTE(review): how two tables are judged equal is not visible here — confirm.
properly_different_dataframes = fh.unduplicate_dfs(tabs_together)

In [None]:
# result = result.droplevel(1, axis=1)
# Number of tables returned by the helper.
len(properly_different_dataframes)

In [None]:
# Flatten the two-level column header of every de-duplicated table by
# dropping the second header level.
flattened_dataframes = [
    frame.droplevel(1, axis=1) for frame in properly_different_dataframes
]
# Only the last table is carried forward, which is what the loop that rebound
# its own loop variable effectively did. The name `file` is kept because the
# next cell reads it. Indexing with [-1] fails immediately when no table is
# left, instead of silently leaving a stale `file` from an earlier loop.
file = flattened_dataframes[-1]
file

In [None]:
# Remove the LongitudinalTimePoint column from the flattened table.
result_no_str = file.drop(columns='LongitudinalTimePoint')
result_no_str

In [None]:
# Drop the TP column from the combined age table before merging.
# errors="ignore" keeps this cell safe to re-run: without it a second
# execution raises KeyError because TP has already been removed.
super_age_set = super_age_set.drop(columns="TP", errors="ignore")
super_age_set

In [None]:
# Here merge with ages
final_df = pd.merge(result_no_str,super_age_set)

In [None]:
final_df

In [None]:
%matplotlib inline
sns.heatmap(final_df.corr(), annot = True)

So we see a good correlation between grey matter and white matter volumes, and therefore, unsurprisingly, a good correlation between the GMWM_ICV ratio and the GM_ICV ratio. We also see a strong negative correlation between CSF volume and the GMWM_ICV ratio (and also the GM_ICV ratio). These things show our datasets seem to reflect the expected reality.
One next step is to correlate with age.

But let's keep seeing how we can look for things in our tsv.
Below is an example

In [None]:
# Presumably reports which of the collected TSV files contain the listed
# columns — confirm against cvasl.file_handler.
fh.find_where_column(tsv_files, ['CSF_vol', 'WM_vol'])

Now a coding scientist can continue by finding sets of tsv files for specific questions.