In [1]:
# Load libraries

import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap
import warnings
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import HoverTool, ColumnDataSource
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms

sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})
warnings.filterwarnings("ignore")

# Set parameters
bird_dir = '/home/akapoor/Dropbox (University of Oregon)/Kapoor_Ananya/01_Projects/01_a_Rotations/Gardner_Lab/Canary_Data/llb3/'
audio_files = bird_dir+'llb3_songs'
directory = bird_dir+ 'llb3_data_matrices/Python_Files'
analysis_path = '/home/akapoor/Dropbox (University of Oregon)/Kapoor_Ananya/01_Projects/01_b_Canary_SSL/Canary_SSL_Repo/'

# Parameters we set
num_spec = 10
window_size = 100
stride = 10

# Define the folder name
folder_name = f'{analysis_path}Num_Spectrograms_{num_spec}_Window_Size_{window_size}_Stride_{stride}'

# Create the folder if it doesn't already exist
if not os.path.exists(folder_name+"/Plots/Window_Plots"):
    os.makedirs(folder_name+"/Plots/Window_Plots")
    print(f'Folder "{folder_name}" created successfully.')
else:
    print(f'Folder "{folder_name}" already exists.')


Folder "/home/akapoor/Dropbox (University of Oregon)/Kapoor_Ananya/01_Projects/01_b_Canary_SSL/Canary_SSL_Repo/Num_Spectrograms_10_Window_Size_100_Stride_10" already exists.


In [2]:
# =============================================================================
# # If you are loading the results from a previous analysis, run the following lines of code
# =============================================================================

stacked_windows = np.load(folder_name+'/stacked_windows.npy') # An array of all the mini-spectrograms
stacked_labels_for_window = np.load(folder_name+'/stacked_labels_for_window.npy') # The syllable labels for each time point in each mini-spectrogram
embedding = np.load(folder_name+'/UMAP_Embedding.npy') # The pre-computed UMAP embedding (2 dimensional)
masked_frequencies = np.load(analysis_path+'/masked_frequencies_lowthresh_500_highthresh_7000.npy') # The frequencies we want to use for analysis. Excluding unnecessarily low and high frequencies
stacked_window_times = np.load(folder_name+'/stacked_window_times.npy') # The onsets and ending of each mini-spectrogram
    
# open the file for reading in binary mode
with open(folder_name+'/category_colors.pkl', 'rb') as f:
    # load the dictionary from the file using pickle.load()
    category_colors = pickle.load(f)   
    
# Each syllable is given a unique color. Each mini-spectrogram will have an average syllable color associated with it. This is the average RGB value across all unique syllables in the mini-spectrogram
mean_colors_per_minispec = np.load(folder_name+'/mean_colors_per_minispec.npy')

In [2]:
# =============================================================================
# # If you're running the analysis for the first time 
# =============================================================================

files = os.listdir(directory)
all_songs_data = [element for element in files if '.npz' in element] # Get the file paths of each numpy file from Yarden's data
all_songs_data.sort()
os.chdir(directory)

# For each spectrogram we will extract
# 1. Each timepoint's syllable label
# 2. The spectrogram itself
stacked_labels = [] 
stacked_specs = []
for i in np.arange(num_spec):
    # Extract the data within the numpy file. We will use this to create the spectrogram
    dat = np.load(all_songs_data[i])
    spec = dat['s']
    times = dat['t']
    frequencies = dat['f']
    labels = dat['labels']
    labels = labels.T


    # Let's get rid of higher order frequencies
    mask = (frequencies<7000)&(frequencies>500)
    masked_frequencies = frequencies[mask]

    subsetted_spec = spec[mask.reshape(mask.shape[0],),:]
    
    stacked_labels.append(labels)
    stacked_specs.append(subsetted_spec)

    
stacked_specs = np.concatenate((stacked_specs), axis = 1)
stacked_labels = np.concatenate((stacked_labels), axis = 0)

# Get a list of unique categories (syllable labels)
unique_categories = np.unique(stacked_labels)

# Create a dictionary that maps categories to random colors
category_colors = {category: np.random.rand(3,) for category in unique_categories}

spec_for_analysis = stacked_specs.T
window_labels_arr = []
embedding_arr = []
# Find the exact sampling frequency (the time in miliseconds between one pixel [timepoint] and another pixel)
dx = np.diff(times)[0,0]

# We will now extract each mini-spectrogram from the full spectrogram
stacked_windows = []
# Find the syllable labels for each mini-spectrogram
stacked_labels_for_window = []
# Find the mini-spectrograms onset and ending times 
stacked_window_times = []

# The below for-loop will find each mini-spectrogram (window) and populate the empty lists we defined above.
for i in range(0, spec_for_analysis.shape[0] - window_size + 1, stride):
    # Find the window
    window = spec_for_analysis[i:i + window_size, :]
    # Get the window onset and ending times
    window_times = dx*np.arange(i, i + window_size)
    # We will flatten the window to be a 1D vector
    window = window.reshape(1, window.shape[0]*window.shape[1])
    # Extract the syllable labels for the window
    labels_for_window = stacked_labels[i:i+window_size, :]
    # Reshape the syllable labels for the window into a 1D array
    labels_for_window = labels_for_window.reshape(1, labels_for_window.shape[0]*labels_for_window.shape[1])
    # Populate the empty lists defined above
    stacked_windows.append(window)
    stacked_labels_for_window.append(labels_for_window)
    stacked_window_times.append(window_times)

# Convert the populated lists into a stacked numpy array
stacked_windows = np.stack(stacked_windows, axis = 0)
stacked_windows = np.squeeze(stacked_windows)

stacked_labels_for_window = np.stack(stacked_labels_for_window, axis = 0)
stacked_labels_for_window = np.squeeze(stacked_labels_for_window)

stacked_window_times = np.stack(stacked_window_times, axis = 0)

# For each mini-spectrogram, find the average color across all unique syllables
mean_colors_per_minispec = np.zeros((stacked_labels_for_window.shape[0], 3))
for i in np.arange(stacked_labels_for_window.shape[0]):
    list_of_colors_for_row = [category_colors[x] for x in stacked_labels_for_window[i,:]]
    all_colors_in_minispec = np.array(list_of_colors_for_row)
    mean_color = np.mean(all_colors_in_minispec, axis = 0)
    mean_colors_per_minispec[i,:] = mean_color
    
# Perform a UMAP embedding on the dataset of mini-spectrograms
reducer = umap.UMAP()
embedding = reducer.fit_transform(stacked_windows)

# Let's save all the numpy arrays
np.save(folder_name+'/stacked_windows.npy', stacked_windows)
np.save(folder_name+'/stacked_labels_for_window.npy', stacked_labels_for_window)
np.save(analysis_path+'/masked_frequencies_lowthresh_500_highthresh_7000.npy', masked_frequencies)
np.save(folder_name+'/stacked_window_times.npy', stacked_window_times)
np.save(folder_name+'/mean_colors_per_minispec.npy', mean_colors_per_minispec)

# open a file for writing in binary mode
with open(folder_name+'/category_colors.pkl', 'wb') as f:
    # write the dictionary to the file using pickle.dump()
    pickle.dump(category_colors, f)



In [None]:
# The below function will save an image for each mini-spectrogram. This will be used for understanding the UMAP plot.
def embeddable_image(data, window_times, iteration_number):
    
    data.shape = (window_size, int(data.shape[0]/window_size))
    data = data.T 
    window_times = window_times.reshape(1, window_times.shape[0])
    plt.pcolormesh(window_times, masked_frequencies, data, cmap='jet')
    # let's save the plt colormesh as an image.
    plt.savefig(folder_name+'/Plots/Window_Plots/'+f'Window_{iteration_number}.png')
    plt.close()
    
    
for i in np.arange(stacked_windows.shape[0]):
    if i%10 == 0:
        print(f'Iteration {i} of {stacked_windows.shape[0]}')
    data = stacked_windows[i,:]
    window_times = stacked_window_times[i,:]
    embeddable_image(data, window_times, i)

# UMAP Alone

In [None]:
# Perform a UMAP embedding on the dataset of mini-spectrograms
reducer = umap.UMAP()
embedding = reducer.fit_transform(stacked_windows)
np.save(folder_name+'/UMAP_Embedding.npy', embedding)

In [None]:
# The below function will save an image for each mini-spectrogram. This will be used for understanding the UMAP plot.
def embeddable_image(data, window_times, iteration_number):
    
    data.shape = (window_size, int(data.shape[0]/window_size))
    data = data.T 
    window_times = window_times.reshape(1, window_times.shape[0])
    plt.pcolormesh(window_times, masked_frequencies, data, cmap='jet')
    # let's save the plt colormesh as an image.
    plt.savefig(folder_name+'/Plots/Window_Plots/'+f'Window_{iteration_number}.png')
    plt.close()
    
    
for i in np.arange(stacked_windows.shape[0]):
    if i%10 == 0:
        print(f'Iteration {i} of {stacked_windows.shape[0]}')
    data = stacked_windows[i,:]
    window_times = stacked_window_times[i,:]
    embeddable_image(data, window_times, i)


In [4]:
# Specify an HTML file to save the Bokeh image to.
output_file(filename=f'{folder_name}/Plots/umap.html')

# Convert the UMAP embedding to a Pandas Dataframe
spec_df = pd.DataFrame(embedding, columns=('x', 'y'))


# Create a ColumnDataSource from the data. This contains the UMAP embedding components and the mean colors per mini-spectrogram
source = ColumnDataSource(data=dict(x = embedding[:,0], y = embedding[:,1], colors=mean_colors_per_minispec))


# Create a figure and add a scatter plot
p = figure(width=800, height=600, tools=('pan, box_zoom, hover, reset'))
p.scatter(x='x', y='y', size = 7, color = 'colors', source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = """
    <div>
        <h3>@x, @y</h3>
        <div>
            <img
                src="@image" height="100" alt="@image" width="100"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
    </div>
"""

p.add_tools(HoverTool(tooltips="""
"""))


# Set the image path for each data point
source.data['image'] = []
for i in np.arange(spec_df.shape[0]):
    source.data['image'].append(f'{folder_name}/Plots/Window_Plots/Window_{i}.png')

show(p)

save(p)


'/home/akapoor/Dropbox (University of Oregon)/Kapoor_Ananya/01_Projects/01_b_Canary_SSL/Canary_SSL_Repo/Num_Spectrograms_10_Window_Size_100_Stride_10/Plots/umap.html'

# Noise Contrastive Estimation

In [23]:
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

# Let's shuffle the stacked_windows and stacked_labels_for_window

# Shuffle the indices of stacked_windows
shuffled_indices = np.random.permutation(stacked_windows.shape[0])

# Shuffle array1 using the shuffled indices
stacked_windows_NCE = stacked_windows[shuffled_indices,:]

# Shuffle array2 using the same shuffled indices
stacked_labels_for_window_NCE = stacked_labels_for_window[shuffled_indices,:]

In [24]:
# %% Define neural network

class TweetyNetCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=1, out_channels=32, kernel_size=(5, 5), stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=(8, 1), stride=(8, 1))
        self.conv2 = nn.Conv2d(
            in_channels=32, out_channels=64, kernel_size=(5, 5), stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=(8, 1), stride=(8, 1))
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(9408, 1000)
        self.fc2 = nn.Linear(1000, 100)
        self.fc3 = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool2(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        # x = self.relu(x)
        x = self.fc2(x)
        # x = self.relu(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

cnn_model = TweetyNetCNN()
cnn_model = cnn_model.double()
# cnn_model = cnn_model.double().to(device)

optimizer = optim.Adam(cnn_model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.BCELoss()


In [25]:
stacked_windows_tensor = stacked_windows_NCE
stacked_windows_tensor.shape = (stacked_windows_NCE.shape[0], 1, 100, 151)
stacked_windows_tensor = torch.tensor(stacked_windows_NCE).double()


actual_labels = np.max(stacked_labels_for_window_NCE, axis=1)
actual_labels = torch.tensor(actual_labels)

# Determine the split point for testing data
split_point = int(0.7 * stacked_windows_tensor.shape[0])

# Split the data into training and testing sets
train_data, test_data = stacked_windows_tensor[:split_point,:,:,:], stacked_windows_tensor[split_point:,:,:,:]
train_labels, test_labels = actual_labels[:split_point], actual_labels[split_point:]

# Convert the numpy arrays to PyTorch tensors
train_data_tensor = torch.tensor(train_data)
test_data_tensor = torch.tensor(test_data)
train_labels_tensor = torch.tensor(train_labels)  # Assuming integer labels
test_labels_tensor = torch.tensor(test_labels)

# Create TensorDatasets for training and testing sets
train_dataset = TensorDataset(train_data_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_data_tensor, test_labels_tensor)

# Define batch size for DataLoader
batch_size = 32

# Create DataLoaders for training and testing sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle test data


# Choose a random 64 indices from our total data

# batch_indices = np.random.randint(0, stacked_windows_tensor.shape[0], batch_size)

# x = stacked_windows_tensor[batch_indices, :,:,:]
# y = actual_labels[batch_indices]

# # Create a TensorDataset to combine features and labels
# dataset = TensorDataset(stacked_windows_tensor, actual_labels)

# train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# # Extract the first batch
# for batch in train_loader:
#     first_batch = batch
#     break  # Exit the loop after the first iteration

# x = first_batch[0]
# y = first_batch[1]

positive_sample_index_list = []
negative_samples_indices_list = []


unique_syllables = torch.unique(actual_labels)

indices_dict = {int(element): np.where(actual_labels == element)[0] for element in unique_syllables}
num_negative_samples_each = 5


Extract the embedding from the non-trained model

In [26]:
cnn_model = cnn_model.to('cpu')
model_embedding_arr_untrained = torch.empty((0,1000)).to('cpu')
for batch_idx, (data, targets) in enumerate(train_loader):
    x = cnn_model.conv1(data.to('cpu'))
    x = cnn_model.relu(x)
    x = cnn_model.pool1(x)
    x = cnn_model.conv2(x)
    x = cnn_model.relu(x)
    x = cnn_model.pool2(x)
    x = torch.flatten(x, 1)
    x = cnn_model.fc1(x)
    model_embedding_arr_untrained = torch.concatenate((model_embedding_arr_untrained, x))

In [27]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.054430GB
torch.cuda.memory_reserved: 20.580078GB
torch.cuda.max_memory_reserved: 20.580078GB


In [28]:
reducer = umap.UMAP()
umap_embedding_untrained = reducer.fit_transform(model_embedding_arr_untrained.detach().numpy())

In [29]:
mean_colors_per_minispec_training = mean_colors_per_minispec[shuffled_indices[:split_point],:]
mean_colors_per_minispec_testing = mean_colors_per_minispec[shuffled_indices[split_point:],:]

In [30]:
# Specify an HTML file to save the Bokeh image to.
output_file(filename=f'{folder_name}/Plots/umap_of_TweetyNet_Embed_NCE_untrained.html')

# Convert the UMAP embedding to a Pandas Dataframe
spec_df = pd.DataFrame(umap_embedding_untrained, columns=('x', 'y'))


# Create a ColumnDataSource from the data. This contains the UMAP embedding components and the mean colors per mini-spectrogram
source = ColumnDataSource(data=dict(x = umap_embedding_untrained[:,0], y = umap_embedding_untrained[:,1], colors=mean_colors_per_minispec_training))


# Create a figure and add a scatter plot
p = figure(width=800, height=600, tools=('pan, box_zoom, hover, reset'))
p.scatter(x='x', y='y', size = 7, color = 'colors', source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = """
    <div>
        <h3>@x, @y</h3>
        <div>
            <img
                src="@image" height="100" alt="@image" width="100"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
    </div>
"""

p.add_tools(HoverTool(tooltips="""
"""))


# Set the image path for each data point
source.data['image'] = []
for i in np.arange(spec_df.shape[0]):
    source.data['image'].append(f'{folder_name}/Plots/Window_Plots/Window_{shuffled_indices[:split_point][i]}.png')

show(p)
save(p)

'/home/akapoor/Dropbox (University of Oregon)/Kapoor_Ananya/01_Projects/01_b_Canary_SSL/Canary_SSL_Repo/Num_Spectrograms_10_Window_Size_100_Stride_10/Plots/umap_of_TweetyNet_Embed_NCE_untrained.html'

In [31]:
dat.shape

torch.Size([14, 1, 100, 151])

In [32]:
cnn_model = TweetyNetCNN()
cnn_model = cnn_model.double().to(device)

In [33]:
# %% Train Neural network 

cnn_model.train()
cnn_model = cnn_model.to(device)
batch_size = 64
num_epoch = 10
mean_batch_loss_per_epoch_list = []
for epoch in np.arange(num_epoch):
    total_batch_loss_list = []
    for batch_idx, (data, targets) in enumerate(train_loader):
        batch_loss = 0
        for anchor_index in np.arange(data.shape[0]):
            anchor_label = targets[anchor_index]
            # Sample a positive sample from our total dataset
            indices_of_positve_samples = torch.where(actual_labels == anchor_label)[0]
            positive_sample_index = torch.randint(0, indices_of_positve_samples.shape[0], size=(1,))
            positive_sample_index_list.append(positive_sample_index.item())

            # Sample negative samples from our total dataset. We will use a 
            # weighted probability distribution. There will be a 0.0001 probability
            # that we will sample from the positive class and a (1-0.001)/(K-1)
            # probability that we sample from the remaining K-1 classes

            # Create a tensor with custom probabilities
            epsilon = 0.0001
            probs = torch.zeros((1, unique_syllables.shape[0]))
            probs[:,:] = (1-epsilon)/(probs.shape[1]-1)
            index_of_anchor_label = torch.where(unique_syllables == anchor_label)
            probs[:,int(index_of_anchor_label[0])] = epsilon


            # Number of samples to generate
            num_samples = unique_syllables.shape[0] - 1

            # Sample indices based on the custom probabilities
            sampled_labels = torch.multinomial(probs, num_samples, replacement=False)

            # Now let's randomly sample an index value from each sampled label

            random_samples = {key: np.random.choice(values, num_negative_samples_each) for key, values in indices_dict.items() if key in sampled_labels}
            indices_of_negative_samples = np.array(list(random_samples.values()))
            negative_samples_indices_list.append(indices_of_negative_samples)
            a = np.stack(indices_of_negative_samples)
            indices_of_negative_samples = np.stack(indices_of_negative_samples).reshape(a.shape[0]*a.shape[1],)


            # Now let's extract the positive and negative samples' spectrogram 
            # slices

            positive_sample = stacked_windows_tensor[positive_sample_index, :,:,:]
            negative_samples = stacked_windows_tensor[indices_of_negative_samples, :,:,:]

            dat = torch.concatenate((positive_sample, negative_samples))

            artificial_labels = torch.zeros((1,1 + (unique_syllables.shape[0]-1)*num_negative_samples_each))
            artificial_labels[:,0] = 1

            # Get the number of rows in the tensors
            num_rows = dat.shape[0]

            # Generate a random permutation of indices
            shuffled_indices_pseudo = torch.randperm(num_rows)

            # Shuffle both tensors based on the same indices
            dat = dat[shuffled_indices_pseudo,:,:,:].to(device)
            artificial_labels = artificial_labels[:,shuffled_indices_pseudo].to(device)

            pred_probs = cnn_model(dat)

            # h_t = 1/(1+torch.exp(-1*(torch.log(pred_probs) - torch.log(probs).T)))
            # torch.sum(artificial_labels.T*torch.log(h_t) + (1-artificial_labels.T)*torch.log(h_t))

            loss = criterion(pred_probs, artificial_labels.T.double())
            batch_loss+=loss

        total_batch_loss_list.append(batch_loss.item())
        # print(f'Batch Loss: {batch_loss.item():.4f}')
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
    mean_batch_loss_per_epoch = np.mean(total_batch_loss_list)
    mean_batch_loss_per_epoch_list.append(mean_batch_loss_per_epoch)
    print("=================================================================")
    print(f'Epoch {epoch}, Mean Batch Loss: {mean_batch_loss_per_epoch:.4f}')
    print("=================================================================")


Epoch 0, Mean Batch Loss: 20.9646
Epoch 1, Mean Batch Loss: 20.9647
Epoch 2, Mean Batch Loss: 20.9646
Epoch 3, Mean Batch Loss: 20.9646
Epoch 4, Mean Batch Loss: 20.9647
Epoch 5, Mean Batch Loss: 20.9648
Epoch 6, Mean Batch Loss: 20.9645
Epoch 7, Mean Batch Loss: 20.9646
Epoch 8, Mean Batch Loss: 20.9644
Epoch 9, Mean Batch Loss: 20.9647


In [14]:
torch.cuda.memory_summary(device=None, abbreviated=False)




In [34]:
cnn_model.to('cpu').eval()
model_embedding_arr = torch.empty((0,1000)).to('cpu')
for batch_idx, (data, targets) in enumerate(train_loader):
    x = cnn_model.conv1(data.to('cpu'))
    x = cnn_model.relu(x)
    x = cnn_model.pool1(x)
    x = cnn_model.conv2(x)
    x = cnn_model.relu(x)
    x = cnn_model.pool2(x)
    x = torch.flatten(x, 1)
    x = cnn_model.fc1(x)
    model_embedding_arr = torch.concatenate((model_embedding_arr, x))

In [36]:
reducer = umap.UMAP()
embedding_umap = reducer.fit_transform(model_embedding_arr.cpu().detach().numpy())

In [16]:
mean_colors_per_minispec_training = mean_colors_per_minispec[shuffled_indices[:split_point],:]
mean_colors_per_minispec_testing = mean_colors_per_minispec[shuffled_indices[split_point:],:]

UMAP Representation of Untrained Model

In [44]:
# Specify an HTML file to save the Bokeh image to.
output_file(filename=f'{folder_name}/Plots/umap_of_TweetyNet_Embed_NCE_trained.html')

# Convert the UMAP embedding to a Pandas Dataframe
spec_df = pd.DataFrame(embedding_umap, columns=('x', 'y'))


# Create a ColumnDataSource from the data. This contains the UMAP embedding components and the mean colors per mini-spectrogram
source = ColumnDataSource(data=dict(x = embedding_umap[:,0], y = embedding_umap[:,1], colors=mean_colors_per_minispec_training))


# Create a figure and add a scatter plot
p = figure(width=800, height=600, tools=('pan, box_zoom, hover, reset'))
p.scatter(x='x', y='y', size = 7, color = 'colors', source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = """
    <div>
        <h3>@x, @y</h3>
        <div>
            <img
                src="@image" height="100" alt="@image" width="100"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
    </div>
"""

p.add_tools(HoverTool(tooltips="""
"""))


# Set the image path for each data point
source.data['image'] = []
for i in np.arange(spec_df.shape[0]):
    source.data['image'].append(f'{folder_name}/Plots/Window_Plots/Window_{shuffled_indices[:split_point][i]}.png')

show(p)
save(p)

'/home/akapoor/Dropbox (University of Oregon)/Kapoor_Ananya/01_Projects/01_b_Canary_SSL/Canary_SSL_Repo/Num_Spectrograms_10_Window_Size_100_Stride_10/Plots/umap_of_TweetyNet_Embed_NCE_trained.html'

In [42]:
folder_name

'/home/akapoor/Dropbox (University of Oregon)/Kapoor_Ananya/01_Projects/01_b_Canary_SSL/Canary_SSL_Repo/Num_Spectrograms_10_Window_Size_100_Stride_10'

In [None]:
# Specify an HTML file to save the Bokeh image to.
output_file(filename='umap_of_TweetyNet_Embed_NCE.html')

# Convert the UMAP embedding to a Pandas Dataframe
spec_df = pd.DataFrame(embedding_umap, columns=('x', 'y'))


# Create a ColumnDataSource from the data. This contains the UMAP embedding components and the mean colors per mini-spectrogram
source = ColumnDataSource(data=dict(x = embedding_umap[:,0], y = embedding_umap[:,1], colors=mean_colors_per_minispec_training))


# Create a figure and add a scatter plot
p = figure(width=800, height=600, tools=('pan, box_zoom, hover, reset'))
p.scatter(x='x', y='y', size = 7, color = 'colors', source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = """
    <div>
        <h3>@x, @y</h3>
        <div>
            <img
                src="@image" height="100" alt="@image" width="100"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
    </div>
"""

p.add_tools(HoverTool(tooltips="""
"""))


# Set the image path for each data point
source.data['image'] = []
for i in np.arange(spec_df.shape[0]):
    source.data['image'].append(f'{folderpath_song}/Plots/Window_Plots/Window_{shuffled_indices[split_point:]}.png')

show(p)

# save(p)
