# Divide the scan into different brain sections 

In [2]:
import os
import numpy as np
current_dir = os.getcwd()

# let's see how it goes
npy_file_path = os.path.join(current_dir, 'ihb.npy')

all_data = np.load(npy_file_path)

In [3]:
all_data.shape

(320, 10, 246)

In [4]:
import pandas as pd
num_nans = [np.isnan(all_data[i]).sum() for i in range(all_data.shape[0])]
nan_values = pd.Series(num_nans)
nan_values.value_counts()

0      160
460    160
Name: count, dtype: int64

In [5]:
g1 = [all_data[i] for i in range(all_data.shape[0]) if np.isnan(all_data[i]).sum() == 0]
g2 = [all_data[i] for i in range(all_data.shape[0]) if np.isnan(all_data[i]).sum() == 460]

g2_nan_distribution = np.zeros(shape=(len(g2), len(g2)))

for i1, element1 in enumerate(g2):
	for i2, element2 in enumerate(g2):
		g2_nan_distribution[i1][i2] = all(np.sum(np.isnan(element1), axis=1) == np.sum(np.isnan(element2), axis=1))

g2 = [scan[:, :-46] for scan in g2]

# The first step: group scans of the same scan but with different smoothing

In [6]:
# we know for a fact that we have 160 samples belonging to 20 subjects each having

# let's calculate the auto correlation of each time sequence in each of scan
from typing import Union, List
from scipy.signal import correlate2d, correlate

def autocorrelation_stats(scan: np.ndarray, aggregate:bool=True) -> Union[List, float]:	
	assert len(scan) == 10
	auto_correlations =  [float(correlate(scan[i:], scan[:-i])) for i in range(1, 6)]
	if aggregate:
		return np.mean(auto_correlations)
	return auto_correlations

def build_ac_pairs(scans: List[np.ndarray]) -> set:
	auto_corrs = np.zeros(shape=(len(scans), len(scans)))

	for i1, element1 in enumerate(scans):
		for i2, element2 in enumerate(scans):
			auto_corrs[i1][i2] = correlate2d(element1, element2, "valid").item()	
	
	# for each row, row[0] represents the closest index to scan[i] in terms of auto correlation
	# row[1] represents the same index
	paired_scans_by_ac = np.argsort(auto_corrs, axis=-1)[:, -2:]

	pairs = set()

	for i in range(len(scans)):
		assert paired_scans_by_ac[i, 1] == i, "check the code"
		closest_scan_index = paired_scans_by_ac[i, 0]
		if paired_scans_by_ac[closest_scan_index, 0] == i and (closest_scan_index, i) not in pairs:
			pairs.add((i, closest_scan_index)) 

	return pairs

In [7]:
g1_pairs, g2_pairs = build_ac_pairs(g1), build_ac_pairs(g2)

In [8]:
len(g1_pairs), len(g2_pairs)

(80, 80)

# Next step: grouping the different segments of the same scan sequence

In [9]:
def unified_segment_rep(scans: List[np.ndarray], pairs_indices: set) -> List[np.ndarray]:
	avg_segments = []
	for i1, i2 in pairs_indices:
		s1, s2  = scans[i1], scans[i2]
		if s1.shape != s2.shape:
			raise ValueError("Make sure the code is correct. found pairs with different shapes")
		avg_segments.append((s1 + s2) / 2)
	return avg_segments

In [10]:
avg_g1, avg_g2 = unified_segment_rep(g1, g1_pairs), unified_segment_rep(g2, g2_pairs)

In [11]:
# let's assume that the different scan segments correspond to consecutive time stamps (a far-stretched assumption, but why not ?)
# the whole idea here is that if we have two consecutive  segments s1 and s2 from an original sequence "S". Then we can see if the auto correlation between [s1, s2] and s1, s2 and [s2, s1], s1 and s2 
# for each sequence to "s1" to find the best sequence "s2", we need "n" operations with a total of n^2: pretty much nothing when (n = 80)

In [19]:
def find_best_next_segment(seg_index: int, segments: List[np.ndarray]):
	max_ac_corr = -float('inf')
	best_index = None
	best_order = None

	for i in range(len(segments)):
		if i == seg_index: 
			continue

		# # compound segment
		other_seg = segments[i]
		# # build the bigger sequence
		compound_seg1 = np.concatenate([segments[seg_index], other_seg], axis=0)
		compound_seg2 = np.concatenate([other_seg, segments[seg_index]], axis=0)

		seg_index_compound = np.concatenate([segments[seg_index], segments[seg_index]], axis=0)
		other_seg_compound = np.concatenate([other_seg, other_seg], axis=0)

		c1 = correlate2d(compound_seg1, seg_index_compound, "valid").item()
		c2 = correlate2d(compound_seg2, other_seg_compound, "valid").item()

		# c1 = correlate2d(compound_seg1, compound_seg1, "valid")
		# c2 = correlate2d(compound_seg2, compound_seg2, "valid")
		corr = max(c1, c2)

		if corr > max_ac_corr:
			max_ac_corr = corr 
			best_index = i
			best_order = [seg_index, i] if c1 > c2 else [i, seg_index]

	return best_index, best_order 

In [20]:
# s1, s2, s3 = np.concatenate([avg_g1[0], avg_g1[1]], axis=0),np.concatenate([avg_g1[1], avg_g1[2]], axis=0),np.concatenate([avg_g1[0], avg_g1[2]], axis=0)
# correlate2d(s1, s1, "valid").item(),correlate2d(s2, s2, "valid").item(),correlate2d(s3, s3, "valid").item()

In [21]:
# try to find the pairs somehow
pairs_avg_g1 = set()
for i in range(len(avg_g1)):
	j, _= find_best_next_segment(i, segments=avg_g1)	
	pairs_avg_g1.add((i, j))

In [23]:
count = 0
for i in range(len(avg_g1)):
	j, _ = find_best_next_segment(i, segments=avg_g1)	
	if (i, j) in pairs_avg_g1 and (j, i) in pairs_avg_g1:
		count += 1

print(count)

30


In [29]:
np.sort(np.abs(avg_g1[0]).mean(axis=0))[-5:], avg_g1[0].std(axis=1)

(array([1.6043775 , 1.61858961, 1.78668952, 2.00848914, 3.94706962]),
 array([0.96732131, 0.97947739, 0.97260987, 0.97247485, 0.93905281,
        0.97468263, 0.96306932, 0.95401796, 0.97552129, 0.97530294]))