# Exploring TSV files

### Imports

In [None]:
import os       # using operating system dependent functionality (folders)
import glob
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # Can Copy and Deepcopy files so original file is untouched.
from ipywidgets import IntSlider, Output
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.insert(0, '../') # path to functions
from cvasl import file_handler as fh # 
from cvasl import mold #
from cvasl import carve
from cvasl.file_handler import Config

### Configure data

In [None]:
# Load the cvasl configuration (from_file is called with no arguments) and
# look up the directory registered under the 'raw_data' key.
config = Config.from_file()
root_mri_directory = config.get_directory('raw_data')

### Load tsv files

In [None]:
# '**' only matches nested folders because recursive=True is passed below,
# so this collects every .tsv path at any depth under the raw-data directory.
tsv_pattern = os.path.join(root_mri_directory, '**/*.tsv')
tsv_files = glob.glob(tsv_pattern, recursive=True)


### check tsv files

In [None]:
root_mri_directory

In [None]:
tsv_files

In [None]:
# Read the first discovered file as tab-separated values; it serves as the
# worked example for the cells below.
dataframe_example = pd.read_csv(tsv_files[0], sep='\t')

In [None]:
dataframe_example

In [None]:
dataframe_example.columns

In [None]:
## check tsv file diversity
tsv_files

### Analysis

Without the subject ages we cannot do an analysis on anything except how subjects progress over time points, and how various parameters predict each other. But let's scan all the tsv files we were given to see if we have ones with age.

#### Correlations within datasets:

Let's use one dataframe example:

In [None]:
dataframe_example.columns

In [None]:
# Keep only the columns used for the correlation analysis below
# (presumably the numeric ones, going by the variable name — confirm).
dataframe_example_nums_only = dataframe_example[[ 
    'SubjectNList',
    'Site', 'GM_vol', 'WM_vol', 'CSF_vol', 'GM_ICVRatio', 'GMWM_ICVRatio',
    'WMH_vol', 'WMH_count', 'MeanMotion', ]]

In [None]:
dataframe_example_nums_only

In [None]:
# Pairwise correlation of the selected columns. The [1:] slice drops the first
# row before correlating — presumably it is not a data row (e.g. a units
# row); TODO confirm.
dataframe_example_nums_only[1:].corr()

In [None]:
%matplotlib inline
# Draw the same correlation matrix (first row dropped) as an annotated heatmap.
sns.heatmap(dataframe_example_nums_only[1:].corr(), annot = True)

So we do see some correlations in this particular dataset that are strong and expected. The area of brain to ICV ratio negatively correlates with the CSF volume, as should be expected. White matter and grey matter correlate pretty well. White matter hyperintensities in count correlate somewhat with white matter hyperintensity volume. And deep WM-L correlates between the sides of the brain and both. Basically everything we would expect.

So now we can make a super-dataset of all the datasets, and see if these correlations hold. 

In [None]:
# Load the second discovered file and list its column names for comparison
# with the first example.
dataframe_example2 = pd.read_csv(tsv_files[1], sep='\t')
dataframe_example2.columns.to_list()

More elements than first...let's see what we have in common between the two tsv:

In [None]:
# Column names of the two example files as sets, so they can be compared with
# set operations. `second` is reused further down as the reference column set.
first = set(dataframe_example.columns.to_list())
second = set(dataframe_example2.columns.to_list())

In [None]:
# Columns present in the second example file but absent from the first.
not_common2 =  list(set(dataframe_example2.columns.to_list()) - set(dataframe_example.columns.to_list()))

In [None]:
# Columns that appear in both example files.
shared = list(first.intersection(second))
shared

Not a lot...let's look at what we have in common in all or most of the tsv

In [None]:
# For every discovered TSV file, record its path and its number of columns.
# NOTE: the loop rebinds the notebook-global `dataframe_example`, so after this
# cell it holds the LAST file read, not tsv_files[0].
name_file = []
longlesses = []
for file in tsv_files:
    dataframe_example = pd.read_csv(file, sep='\t')
    longness = len(dataframe_example.columns)
    name_file.append(file)
    longlesses.append(longness)
# Built from a list of two lists, so the frame has one COLUMN per file:
# row 0 holds the paths and row 1 the column counts.
data_tsv = pd.DataFrame([name_file, longlesses])
print(longlesses)

In [None]:
data_tsv

So there may be 15 common features on most files, as a guess. We need to not look at the last file on the list.

In [None]:
#print(name_file[:-1])

In [None]:
# Every recorded file except the last one in the list.
set_of_relevant_files = name_file[:-1]

In [None]:
# For each relevant file, record its path, its column count, and the columns
# it shares with `second` (the column set of the second example file).
name_file = []
longlesses = []
intersections = []
len_intersections = []
for file in set_of_relevant_files:
    dataframe_example = pd.read_csv(file, sep='\t')
    longness = len(dataframe_example.columns)
    name_file.append(file)
    longlesses.append(longness)
    # NOTE(review): this reads the same file a second time; the column list
    # could be taken from dataframe_example instead. Left as-is because both
    # names are notebook globals.
    dataframe_example2 = pd.read_csv(file, sep='\t')
    columns = dataframe_example2.columns.to_list()
    intersection = set(columns).intersection(second)
    intersections.append(intersection)
    len_intersections.append(len(intersection))
# One COLUMN per file; rows are 0: path, 1: column count,
# 2: number of shared columns, 3: the set of shared column names.
data_tsv = pd.DataFrame([name_file, longlesses, len_intersections, intersections])

In [None]:
data_tsv

So we will have twelve or thirteen common elements we can compare. Let's look and hope they are about the same.

In [None]:
# Column 0 is the first file and row 3 is the `intersections` row, so this
# shows the set of column names the first file shares with `second`.
data_tsv[0][3]

In [None]:
# Below cells no longer relevant, experiment

In [None]:
#data_tsv[2][3]

In [None]:
#data_tsv[4][3]

In [None]:
#data_tsv[4][3].intersection(data_tsv[2][3]).intersection(data_tsv[0][3]).intersection(data_tsv[5][3]).intersection(data_tsv[3][3]).intersection(data_tsv[6][3]).intersection(data_tsv[7][3])

OK, so more or less we should have the above values in every group of tsv in our supergroup.
Let's check if we do

In [None]:
#list_elements = data_tsv[4][3].intersection(data_tsv[2][3]).intersection(data_tsv[0][3]).intersection(data_tsv[5][3]).intersection(data_tsv[3][3]).intersection(data_tsv[6][3]).intersection(data_tsv[7][3])

In [None]:
#list_elements

In [None]:
# name_file = []
# longlesses = []
# good_files = []
# for file in tsv_files:
#     dataframe_example = pd.read_csv(file, sep='\t')
#     if set(list_elements).issubset(set(dataframe_example.columns.to_list())):
#                                    good_files.append(file)
       
# print(good_files)        

In [None]:
#len(good_files)

Here we must say in this set, the set of common elements is about:
 'CSF_vol',
 'GMWM_ICVRatio',
 'GM_ICVRatio',
 'GM_vol',
 'LongitudinalTimePoint',
 'MeanMotion',
 'Site',
 'SubjectNList',
 'WMH_count',
 'WMH_vol',
 'WM_vol',
 'participant_id',
 'session'
 
 However we need something to extract the common set from any group of tsv columns

In [None]:
# def make_columns(list_tsv_files):
#     columns_list = []
#     for file in list_tsv_files:
#         dataframe_example = pd.read_csv(file, sep='\t')
#         columns= dataframe_example.columns.to_list()
#         columns_list.append(columns)
#     return columns_list

# a = fh.make_columns(good_files)

In [None]:

# def intersect_all(*sets):
#     result, *rest = sets
#     for remaining in rest:
#         result = set(result).intersection(remaining)
#     return result

# good_columns_sets = fh.intersect_all(*a)

Now we make our super tsv file

In [None]:
# good_columns_list = list(good_columns_sets)
# good_columns_list

In [None]:
# tabs_together = []
# for file in good_files:
#     print(file)
#     tabular = pd.read_csv(file, sep='\t')
#     tabularnow = tabular[good_columns_list]
#     tabs_together.append(tabularnow)

In [None]:
# tabs_together

Now, if the tsvs were different we could stack 8 elements of tabular and make a supercomparator, but we seem to have the same tsv over and over, because the first ten with the same columns are the same. Weird; check with the scientists. Looking at the names, they all came from the same day. Let's check all dates.

In [None]:
# more_files = []
# for file in tsv_files:
#     dataframe_example = pd.read_csv(file, sep='\t')
#     columns = dataframe_example.columns.to_list()
#     if len(columns) > 5:
#         more_files.append(file)
# b = fh.make_columns(more_files)       

In [None]:
# newer_columns_sets = fh.intersect_all(*b)

In [None]:
def extract_common_columns(list_tsv_files):
    """
    Return the result of intersecting the column lists of the given tsv files.

    The work is delegated to ``fh.make_columns`` (called on the file list)
    and ``fh.intersect_all`` (called with each of its results as a separate
    argument). Presumably these match the draft versions commented out
    earlier in this notebook, i.e. the result is the set of column names
    common to every file; confirm against ``cvasl.file_handler``.
    """
    per_file_columns = fh.make_columns(list_tsv_files)
    return fh.intersect_all(*per_file_columns)
    

In [None]:
# fh.extract_common_columns(more_files)

In [None]:
#newer_columns_sets_list = list(newer_columns_sets)

In [None]:
# tabs_together = []
# for file in more_files:
#     print(file)
#     tabular = pd.read_csv(file, sep='\t')
#     tabularnow = tabular[newer_columns_sets_list]
#     tabs_together.append(tabularnow)

In [None]:
#tabs_together

So we have three kinds, many times duplicated over; we must discuss this with the scientists. Until then, let's reduce and combine.

In [None]:
#tabs_together[0]

In [None]:
#len(tabs_together)

In [None]:
#tabs_together[0].equals(tabs_together[10])

In [None]:
def unduplicate_dataframes(list_of_dataframes):
    """
    Return the distinct dataframes from a list, in first-occurrence order.

    Each frame is compared (with ``DataFrame.equals``) against every frame
    already kept, so a duplicate is dropped wherever it appears in the
    list, not only when it sits next to its twin. An empty input gives an
    empty list.

    :param list_of_dataframes: iterable of pandas dataframes (may be empty)
    :returns: list with one representative of each distinct dataframe
    """
    core = []
    for frame in list_of_dataframes:
        # Dataframes are not hashable, so a set cannot be used; compare
        # against the frames kept so far instead.
        if not any(frame.equals(kept) for kept in core):
            core.append(frame)
    return core
        
    

In [None]:
def unduplicate_dfs(list_of_dataframes):
    """
    This function takes a list of dataframes
    and returns only the dataframes that are not duplicates of each other.

    Every frame is checked (with ``DataFrame.equals``) against all frames
    already kept, so duplicates are removed even when they are not adjacent.
    The first occurrence of each distinct frame is kept, in input order.
    An empty input returns an empty list.

    :param list_of_dataframes: iterable of pandas dataframes (may be empty)
    :returns: list of the distinct dataframes
    """
    core = []
    for frame in list_of_dataframes:
        already_kept = any(frame.equals(kept) for kept in core)
        if not already_kept:
            core.append(frame)
    return core
        

In [None]:
#properly_different_dataframes = fh.unduplicate_dfs(tabs_together)

In [None]:
#check =  unduplicate_dfs(tabs_together)

In [None]:
#check

In [None]:
# bad_lines_out = []
# for frame in properly_different_dataframes:
#     frame = frame[1:]
#     bad_lines_out.append(frame)

In [None]:
# result = pd.concat(bad_lines_out)

In [None]:
# result

In [None]:
# result_no_str= result.drop('LongitudinalTimePoint', axis=1)

In [None]:
# %matplotlib inline
# sns.heatmap(result_no_str.corr(), annot = True)

So we see a good correlation between grey matter and white matter volumes, and therefore, unsurprisingly, a good correlation between the GMWM_ICVRatio and the GM_ICVRatio. We also see a strong negative correlation between CSF volume and GMWM_ICVRatio (and also GM_ICVRatio). These things show our datasets seem to be reflecting expected reality.
The next step is to correlate with age.