# Divide the scan into different brain sections 

In [2]:
from itertools import chain
import os
import numpy as np
current_dir = os.getcwd()

# let's see how it goes
npy_file_path = os.path.join(current_dir, 'ihb.npy')

all_data = np.load(npy_file_path)

In [3]:
all_data.shape

(320, 10, 246)

In [4]:
import pandas as pd
num_nans = [np.isnan(all_data[i]).sum() for i in range(all_data.shape[0])]
nan_values = pd.Series(num_nans)
nan_values.value_counts()

0      160
460    160
Name: count, dtype: int64

In [5]:
g1 = [all_data[i] for i in range(all_data.shape[0]) if np.isnan(all_data[i]).sum() == 0]
g2 = [all_data[i] for i in range(all_data.shape[0]) if np.isnan(all_data[i]).sum() == 460]

g2_nan_distribution = np.zeros(shape=(len(g2), len(g2)))

for i1, element1 in enumerate(g2):
	for i2, element2 in enumerate(g2):
		g2_nan_distribution[i1][i2] = all(np.sum(np.isnan(element1), axis=1) == np.sum(np.isnan(element2), axis=1))

g2 = [scan[:, :-46] for scan in g2]

# The first step: group scans of the same scan but with different smoothing

In [6]:
# we know for a fact that we have 160 samples belonging to 20 subjects each having

# let's calculate the auto correlation of each time sequence in each of scan
from typing import Union, List
from scipy.signal import correlate2d, correlate

def autocorrelation_stats(scan: np.ndarray, aggregate:bool=True) -> Union[List, float]:	
	assert len(scan) == 10
	auto_correlations =  [float(correlate(scan[i:], scan[:-i])) for i in range(1, 6)]
	if aggregate:
		return np.mean(auto_correlations)
	return auto_correlations

def build_ac_pairs(scans: List[np.ndarray]) -> set:
	auto_corrs = np.zeros(shape=(len(scans), len(scans)))

	for i1, element1 in enumerate(scans):
		for i2, element2 in enumerate(scans):
			auto_corrs[i1][i2] = correlate2d(element1, element2, "valid").item()	
	
	# for each row, row[0] represents the closest index to scan[i] in terms of auto correlation
	# row[1] represents the same index
	paired_scans_by_ac = np.argsort(auto_corrs, axis=-1)[:, -2:]

	pairs = set()

	for i in range(len(scans)):
		assert paired_scans_by_ac[i, 1] == i, "check the code"
		closest_scan_index = paired_scans_by_ac[i, 0]
		if paired_scans_by_ac[closest_scan_index, 0] == i and (closest_scan_index, i) not in pairs:
			pairs.add((i, closest_scan_index)) 

	return pairs

In [7]:
g1_pairs, g2_pairs = build_ac_pairs(g1), build_ac_pairs(g2)

In [8]:
len(g1_pairs), len(g2_pairs)

(80, 80)

In [9]:
def unified_segment_rep(scans: List[np.ndarray], pairs_indices: set) -> List[np.ndarray]:
	avg_segments = []
	for i1, i2 in pairs_indices:
		s1, s2  = scans[i1], scans[i2]
		if s1.shape != s2.shape:
			raise ValueError("Make sure the code is correct. found pairs with different shapes")
		avg_segments.append((s1 + s2) / 2)
	return avg_segments

In [10]:
avg_g1, avg_g2 = unified_segment_rep(g1, g1_pairs), unified_segment_rep(g2, g2_pairs)

In [11]:
def compute_auto_corr_concatenation(seg1: np.ndarray, seg2: np.ndarray):
	assert seg1.shape == seg2.shape, "both segments same shape"
	c_seg1 = np.concatenate([seg1, seg2], axis=0)
	c_seg2 = np.concatenate([seg2, seg1], axis=0)

	assert c_seg1.shape[0] == 2 * seg1.shape[0] and c_seg1.shape[1] == seg1.shape[1], "concatenation correct"
	assert c_seg2.shape[0] == 2 * seg2.shape[0] and c_seg2.shape[1] == seg2.shape[1], "concatenation correct"

	c1 = np.mean([correlate2d(c_seg1[i:i + len(seg1), :], seg1, "valid").item() for i in range(len(seg1))])
	c2 = np.mean([correlate2d(c_seg2[i:i + len(seg1), :], seg2, "valid").item() for i in range(len(seg1))])
	return max(c1, c2)


# Next step: grouping the different segments of the same scan sequence

### Group 1

In [12]:
def find_consecutive_time_stamps(corrs: np.ndarray, k_closest:int):
	closest_groups = np.argsort(corrs, axis=-1)[:, -k_closest:-1]	

	possible_pairs = set()

	for i in range(len(corrs)):
		possible_neighbors = closest_groups[i]
		for pn in possible_neighbors:
			if i in closest_groups[pn]:
				possible_pairs.add((i, pn))

	return possible_pairs

In [13]:
# compute the mean and variance of each brain feature
g1_feats_mean = np.concatenate([np.mean(x, axis=0, keepdims=True) for x in avg_g1]).T
corr_g1 = np.corrcoef(g1_feats_mean, rowvar=False)
pairs_g1 = find_consecutive_time_stamps(corr_g1, 2)

In [14]:
len(pairs_g1), len(set(chain.from_iterable(pairs_g1)))

(62, 62)

In [15]:
from itertools import chain
paired_indices = set(chain.from_iterable(pairs_g1))
non_paired_indices = set(list(range(80))).difference(paired_indices)
non_paired_indices_list = sorted(list(non_paired_indices))
non_paired_indices_set = non_paired_indices
non_paired_indices_set

{2, 4, 6, 10, 13, 20, 21, 31, 34, 41, 45, 48, 55, 60, 65, 68, 71, 76}

In [20]:
# apply the same idea on non correlated points
non_paired_feats = g1_feats_mean[:, non_paired_indices_list]
non_paired_samples_corr = np.corrcoef(non_paired_feats, rowvar=False)
non_paired_samples_corr.shape

(18, 18)

In [None]:

paired_second_batch = find_consecutive_time_stamps(non_paired_samples_corr, 2)
paired_second_batch

In [18]:
pairs_g1.update(paired_second_batch)

In [19]:
set(list(range(80))).difference(set(chain.from_iterable(pairs_g1)))

pairs_g1.update([(20, 21), (31, 34), (41, 45), (48, 55), (60, 65), (68, 71), 76])

{20, 21, 31, 34, 41, 45, 48, 55, 60, 65, 68, 71, 76}

In [67]:
closest_by_mean = np.argsort(corr_g1, axis=-1)[:, -4:-1]
closest_by_mean = closest_by_mean[non_paired_indices_list, :]
closest_by_mean

array([[15, 14, 36],
       [66, 19, 43],
       [41, 11, 21],
       [70, 58, 13],
       [67, 10, 44],
       [ 1, 45, 43],
       [ 7,  6, 66],
       [70, 48, 61],
       [30, 53, 33],
       [ 6, 11, 10],
       [ 8, 43, 20],
       [31, 78, 32],
       [77, 36, 51],
       [53, 58, 68],
       [50, 66, 23],
       [ 0, 60, 25],
       [73, 32, 76],
       [17, 66, 12]], dtype=int64)

In [68]:
_best_pairing = []
for row_index, row in enumerate(closest_by_mean):	
	for j in closest_by_mean[row_index]:
		if j in non_paired_indices_set:
			_best_pairing.append((non_paired_indices_list[row_index], j))

_best_pairing

[(6, 41),
 (6, 21),
 (10, 13),
 (13, 10),
 (20, 45),
 (21, 6),
 (31, 48),
 (41, 6),
 (41, 10),
 (45, 20),
 (48, 31),
 (60, 68),
 (68, 60),
 (71, 76)]

In [69]:
_repeated_pairing = []
# add the new pairs to the existing pairs
for (i1, i2) in _best_pairing:
    if (i2, i1) in _best_pairing:
        _repeated_pairing.append((i1, i2))
        
print(_repeated_pairing)
_repeated_pairing.remove((6, 41))
_repeated_pairing.remove((41, 6))


[(6, 41), (6, 21), (10, 13), (13, 10), (20, 45), (21, 6), (31, 48), (41, 6), (45, 20), (48, 31), (60, 68), (68, 60)]


In [70]:
pairs_g1.update(_repeated_pairing)
len(pairs_g1), len(set(chain.from_iterable(pairs_g1)))

(72, 72)

In [71]:
set(list(range(80))).difference(set(chain.from_iterable(pairs_g1)))
random_pairs = [(2, 4), (34, 41), (55, 65), (71, 76)]
pairs_g1.update(random_pairs)

### Group 2

In [72]:
g2_feats_mean = np.concatenate([np.mean(x, axis=0, keepdims=True) for x in avg_g2]).T
# corr_feat_var = np.corrcoef(feats_var)
corr_g2 = np.corrcoef(g2_feats_mean, rowvar=False)
# pairs_with_var = find_consecutive_time_stamps(corr_feat_var, 2)
pairs_g2 = find_consecutive_time_stamps(corr_g2, 2)

In [73]:
paired_indices = set(chain.from_iterable(pairs_g2))
non_paired_indices = set(list(range(80))).difference(paired_indices)
non_paired_indices_list = sorted(list(non_paired_indices))
non_paired_indices_set = non_paired_indices
non_paired_indices_set

{1, 2, 4, 18, 28, 37, 39, 41, 43, 46, 49, 54, 57, 60, 69, 73, 76, 78}

In [74]:
closest_by_mean = np.argsort(corr_g1, axis=-1)[:, -4:-1]
closest_by_mean = closest_by_mean[non_paired_indices_list, :]
closest_by_mean

array([[20, 44, 42],
       [15, 14, 36],
       [66, 19, 43],
       [73, 75, 27],
       [76, 33, 59],
       [79, 78, 67],
       [59,  0, 61],
       [ 6, 11, 10],
       [20,  4, 19],
       [22, 47,  8],
       [43, 15, 38],
       [46, 12, 56],
       [14, 23, 64],
       [53, 58, 68],
       [ 7, 66, 52],
       [32, 33, 79],
       [17, 66, 12],
       [37, 67, 44]], dtype=int64)

In [75]:
_best_pairing = []
for row_index, row in enumerate(closest_by_mean):	
	for j in closest_by_mean[row_index]:
		if j in non_paired_indices_set:
			_best_pairing.append((non_paired_indices_list[row_index], j))

_best_pairing

[(4, 43), (18, 73), (28, 76), (37, 78), (43, 4), (49, 43), (54, 46), (78, 37)]

In [76]:
_repeated_pairing = []
# add the new pairs to the existing pairs
for (i1, i2) in _best_pairing:
    if (i2, i1) in _best_pairing:
        _repeated_pairing.append((i1, i2))
        
print(_repeated_pairing)
pairs_g2.update(_repeated_pairing)
# _repeated_pairing.remove((6, 41))
# _repeated_pairing.remove((41, 6))

[(4, 43), (37, 78), (43, 4), (78, 37)]


In [51]:
set(list(range(80))).difference(set(chain.from_iterable(pairs_g2)))
# random_pairs = [(2, 4), (34, 41), (55, 65), (71, 76)]
# pairs_g2.update(random_pairs)

{1, 2, 18, 28, 39, 41, 46, 49, 54, 57, 60, 69, 73, 76}

# Fourier Transform

In [45]:
import numpy.linalg as la
from scipy.fft import fft2

g1_fft = np.concatenate([np.mean(np.abs(fft2(x)), axis=0, keepdims=True) for x in avg_g1], axis=0)
g2_fft = np.concatenate([np.mean(np.abs(fft2(x)), axis=0, keepdims=True) for x in avg_g2], axis=0)


In [46]:
def get_fft_weights(x: np.ndarray, keepdims: bool=True):
    return np.mean(np.abs(fft2(x)), axis=0, keepdims=keepdims)

def get_top_freqs(x: np.ndarray, top_k:int) -> List[int]:
	fft_w = get_fft_weights(x, keepdims=False)
	if top_k is None:
		return np.argsort(fft_w).tolist()
	
	return np.argsort(fft_w, )[-top_k:].tolist()
    

In [47]:
from typing import Optional
from sklearn.metrics import dcg_score

y1, y2 = get_top_freqs(avg_g1[0], 15), get_top_freqs(avg_g1[1], 15)

dcg_score(y1, y2)

ValueError: Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead

In [21]:
def find_best_segment_by_fft(seg_index: int, segments: List[np.ndarray], num_freqs:Optional[int]):
	max_ac_corr = -float('inf')
	best_index = None

	top_freqs = get_top_freqs(segments[seg_index], top_k=num_freqs)

	for i in range(len(segments)):
		if i == seg_index: 
			continue
		# compound segment
		other_seg = segments[i]
		seg_weights = get_fft_weights(segments[seg_index])
		other_seg_weights = get_fft_weights(other_seg)
		corr = dcg_score(seg_weights, other_seg_weights)

		if corr > max_ac_corr:
			max_ac_corr = corr 
			best_index = i

	return best_index 



In [22]:
pairs_avg_g1 = set()
for i in range(len(avg_g1)):
	j= find_best_segment_by_fft(i, segments=avg_g1, num_freqs=None)	
	pairs_avg_g1.add((i, j))

In [None]:
count = 0
for i in range(len(avg_g1)):
	j = find_best_segment_by_fft(i, segments=avg_g1, num_freqs=20)	
	if (i, j) in pairs_avg_g1 and (j, i) in pairs_avg_g1:
		count += 1

print(count)

# simple submission

In [32]:
# g1_2d = np.concatenate([np.mean(x, axis=0, keepdims=True) for x in avg_g1], axis=0)
# g2_2d = np.concatenate([np.mean(x, axis=0, keepdims=True) for x in avg_g2], axis=0)

In [33]:
from sklearn.manifold import TSNE 
tsne = TSNE(n_components=2, random_state=0)
x_reduced = tsne.fit_transform(g1_fft)

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 8))
plt.scatter(x_reduced[:, 0], x_reduced[:, 1])
plt.show()