## Statistical Data Analysis

In [1]:
from dataloader import get_loader
# Build the loader on the dataset root and load subjects 1 through 8 in one call.
root = '/Volumes/Datasets/inner_speech/derivatives/'
creater = get_loader(root)
subject_ids = list(range(1, 9))
xn, yn = creater.load_multiple_subjects(subject_ids)

In [15]:
# Dimensions of the recordings array returned by the loader.
xn.shape

(1240, 128, 1153)

In [16]:
# Dimensions of the companion array yn (presumably per-recording labels/metadata — confirm against the loader).
yn.shape

(1240, 4)

## Data Loader strategy

### 1. Load all data of a subject from session 1 and session 2 (don't load session 3, because it is noisy)
### 2. Select only inner-speech trials (`Y[:, 2] == 1`)
### 3. Stack all subjects' recordings along the batch axis (axis 0), giving shape (n_recording * n_subjects, 128, 1153)

## Clustering

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [40]:
# Re-create the loader from the same root as the first cell.
creater = get_loader(root)
# Unused alternative kept for reference: calling the loader object directly with a subject list.
#loader = creater([1, 2])

In [44]:
# Loads subjects 1-8 again (same call as the first cell); xn and yn are overwritten.
xn, yn = creater.load_multiple_subjects([1, 2, 3, 4, 5, 6, 7, 8]) 

In [45]:
# Flatten each recording into a single feature vector (one row per recording).
# The row count is taken from xn itself instead of hard-coding 128*1153, so a loader
# that returns a different channel or sample count cannot silently mix recordings
# across rows.
X = xn.reshape(xn.shape[0], -1)
print(X.shape)

(1240, 147584)


### PCA transform

In [None]:
# Fit PCA on the flattened recordings; a fractional n_components asks for as many
# components as are needed to reach that share of explained variance.
var = 0.98
pca = PCA(n_components=var)
pca.fit(X)

PCA(n_components=0.98)

In [None]:
# Report the dimensionality before and after the reduction. The threshold is read
# from `var` so the message stays correct if the variance target is changed.
print(f"Number of components before PCA  = {X.shape[1]}")
print(f"Number of components after PCA {var} = {pca.n_components_}")

Number of components before PCA  = 147584
Number of components after PCA 0.98 = 151


In [None]:
# Project the flattened recordings onto the fitted principal components.
Clus_dataSet = pca.transform(X)
print(f"Dimension of our data after PCA = {Clus_dataSet.shape}")


Dimension of our data after PCA = (160, 151)


### K-means clustering

In [None]:
# Cluster the PCA-reduced recordings into 4 groups.
# k-means++ seeding and the n_init restarts are random, so the seed is pinned to keep
# the cluster assignments reproducible across kernel restarts.
k_means = KMeans(init="k-means++", n_clusters=4, n_init=10, random_state=0)
k_means.fit(Clus_dataSet)

KMeans(n_clusters=4)

In [None]:
# Assign every PCA-reduced recording to one of the fitted clusters and display the cluster ids.
k_means_labels = k_means.predict(Clus_dataSet)
k_means_labels

array([3, 3, 2, 3, 3, 2, 1, 1, 2, 3, 1, 3, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3,
       1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 1, 1, 3,
       1, 1, 0, 1, 1, 3, 3, 3, 3, 3, 3, 1, 1, 2, 3, 3, 1, 3, 1, 1, 3, 3,
       3, 3, 3, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 3, 3, 1, 3, 1, 3, 1, 1,
       3, 3, 3, 2, 1, 1, 3, 3, 3, 3, 1, 1, 3, 2, 1, 3, 1, 1, 1, 2, 1, 1,
       1, 3, 2, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 3, 2, 1, 1, 3,
       1, 2, 3, 1, 3, 3, 1, 3, 1, 3, 3, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1,
       1, 3, 3, 3, 1, 3], dtype=int32)

In [None]:
# Displays column 1 of yn (presumably the per-recording class id, to compare against
# the cluster ids — confirm against the loader).
yn[:, 1]

array([2, 2, 4, 4, 4, 1, 3, 4, 4, 2, 1, 2, 4, 4, 2, 3, 1, 3, 1, 4, 2, 1,
       2, 2, 4, 3, 3, 1, 2, 3, 3, 1, 2, 3, 4, 1, 3, 1, 1, 3, 4, 1, 3, 1,
       2, 2, 4, 1, 3, 4, 4, 1, 3, 2, 2, 2, 4, 2, 2, 3, 4, 2, 2, 1, 3, 4,
       4, 1, 3, 1, 3, 3, 1, 1, 1, 2, 3, 4, 4, 3, 2, 2, 3, 3, 4, 1, 1, 3,
       2, 2, 4, 2, 3, 1, 4, 1, 1, 1, 3, 2, 2, 2, 4, 3, 2, 3, 1, 2, 3, 4,
       3, 4, 4, 1, 4, 4, 3, 4, 1, 1, 2, 2, 4, 2, 1, 4, 3, 3, 1, 1, 1, 1,
       4, 1, 2, 3, 3, 3, 2, 3, 2, 3, 4, 4, 1, 4, 1, 1, 4, 2, 3, 2, 2, 1,
       4, 3, 4, 4, 2, 3])

# Frequency Analysis

In [None]:
# Sort data based on classes
labels = ["Up", "Down", "Right", "Left"]
# `classes` is not defined in any of the cells shown, so it is derived here to keep
# this cell runnable on a fresh kernel.
# NOTE(review): assumes column 1 of yn holds the class id in 1..4 — confirm against the loader.
classes = yn[:, 1]
# sorted_data[k] holds the recordings whose class id is k+1, in the same order as `labels`.
sorted_data = [xn[classes == class_id, :, :] for class_id in range(1, 5)]

In [None]:
# Time Plots
# One subplot per (channel, class) showing that channel's trial-averaged signal.
t = np.arange(1153) / fs
n = 4 # play with this and figsize to get better images
fig, axs = plt.subplots(nrows=128, ncols=4, figsize=(4*n,128*n))
for col, (label, trials) in enumerate(zip(labels, sorted_data)):
    # Average over the recordings of this class; rows of avg_trial are channels.
    avg_trial = np.mean(trials, axis=0)
    for ch, ax in enumerate(axs[:, col]):
        ax.plot(t, avg_trial[ch, :])
        ax.set_title('Channel: {} Class: {}'.format(ch, label))
plt.show()

In [None]:
# Average and Combined Time Plots
# Per class: every channel's trial-averaged trace in blue, plus the mean over
# channels in red.
t = np.arange(1153) / fs
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(10,20))
for ax, label, trials in zip(axs, labels, sorted_data):
    avg_trial = np.mean(trials, axis=0)
    # Transposed so that each column (one channel) is drawn as its own blue line.
    ax.plot(t, avg_trial.T, 'b')
    ax.plot(t, np.mean(avg_trial, axis=0), 'r', label='Average')
    ax.set_title('Class: {}'.format(label))
    ax.legend()

In [None]:
# Spectrograms
# One magnitude spectrogram per (channel, class), computed on the trial-averaged signal.
n = 4
# specgram documents noverlap as an integer number of points; NFFT/2 is a float and
# can fail in recent Matplotlib releases, so use integer division.
noverlap = NFFT // 2
# NOTE(review): Fs is hard-coded to 254 here while the time plots use `fs` — confirm they agree.
fig, axs = plt.subplots(nrows=128, ncols=4, figsize=(4*n,128*n))
for i in range(0,4):
    label = labels[i]
    avg_trial = np.mean(sorted_data[i], axis=0)
    for j in range(0,128):
        pxx, freq, t, cax = axs[j,i].specgram(avg_trial[j,:], Fs=254, cmap="rainbow", mode='magnitude', NFFT=NFFT, noverlap=noverlap)
        axs[j,i].set_title('Channel: {} Class: {}'.format(j,label))
#fig.colorbar(cax)
plt.show()

In [None]:
# Average Spectrograms
# Average of all channels
# specgram documents noverlap as an integer number of points, so use integer division.
noverlap = NFFT // 2
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(10,20))
for i in range(0,4):
    label = labels[i]
    avg_trial = np.mean(sorted_data[i], axis=0)
    # Mean over channels of the trial-averaged signal.
    avg = np.mean(avg_trial, axis=0)
    # Plot the channel average, not avg_trial[j,:]: `j` is never set in this cell and
    # would select whichever channel an earlier cell's loop happened to end on.
    pxx, freq, t, cax = axs[i].specgram(avg, Fs=254, cmap="rainbow", mode='magnitude', NFFT=NFFT, noverlap=noverlap)
    axs[i].set_title('Class: {}'.format(label))
#fig.colorbar(cax)
plt.show()

**^ This doesn't actually look useful, although the strongest frequency components are at the lower end of the spectrum, which makes sense.**

In [None]:
# Power Spectral Density Plots
# One PSD per (channel, class), computed on the trial-averaged signal.
n = 5
# psd documents noverlap as an integer number of points; NFFT/2 is a float and can
# fail in recent Matplotlib releases, so use integer division.
noverlap = NFFT // 2
fig, axs = plt.subplots(nrows=128, ncols=4, figsize=(6*n,150*n))
for i in range(0,4):
    label = labels[i]
    avg_trial = np.mean(sorted_data[i], axis=0)
    for j in range(0,128):
        pxx, freqs = axs[j,i].psd(avg_trial[j,:], Fs=254, NFFT=NFFT, noverlap=noverlap, scale_by_freq=False)
        axs[j,i].set_title('Channel: {} Class: {}'.format(j,label))

plt.show()

In [None]:
# Average PSDs
# Average of all channels
# psd documents noverlap as an integer number of points, so use integer division.
noverlap = NFFT // 2
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(10,20))
for i in range(0,4):
    label = labels[i]
    avg_trial = np.mean(sorted_data[i], axis=0)
    # PSD of every channel in blue. The per-channel spectra are gathered in a list so
    # the cell does not depend on a `pxx` left over from an earlier cell to size an array.
    channel_psds = []
    for j in range(0,128):
        pxx, freqs = axs[i].psd(avg_trial[j,:], Fs=254, NFFT=NFFT, noverlap=noverlap, scale_by_freq=False, c='b')
        channel_psds.append(pxx)
    pows = np.asarray(channel_psds)  # one row of PSD values per channel

    # PSD of the channel-averaged signal in red.
    avg = np.mean(avg_trial, axis=0)
    # Alternative kept for reference: average the per-channel PSDs instead.
    #avg = np.mean(pows, axis=0)
    #axs[i].plot(freqs, avg, 'r', label="Average")
    axs[i].psd(avg, Fs=254, NFFT=NFFT, noverlap=noverlap, scale_by_freq=False, c='r', label="Average")
    # The panel overlays all channels, so the title names only the class
    # (a channel index here would just be the last loop value).
    axs[i].set_title('Class: {}'.format(label))
    axs[i].legend()

plt.show()