# Demo Notebook

This notebook demonstrates how to analyse DeepLabCut output, using example data. 
We begin by importing our package, called "dlc" for now. 

In [1]:
import dlc

#### Loading data
As an example, we will first work on a single file. Later, we'll try batch processing. We define the path where our file is, and with the function
`dlc.load_data.read_data` we will load our `.h5` file. 

In [None]:
# Directory that contains the DeepLabCut `.h5` output used in this example.
h5_path = '/Users/annateruel/ca2+img-anna-2023-01-30/videos'

# The previous call was not valid Python: the first argument was empty and a comma
# was missing before `5`.
# NOTE(review): the signature of `dlc.load_data.read_data` is not visible from this
# notebook. The argument order below (directory, file, 5, 30) is the minimal repair
# that keeps every original value — confirm it against the function definition.
data = dlc.load_data.read_data(
    h5_path,
    '/Users/annateruel/ca2+img-anna-2023-01-30/videos/0_neutro2DLC_resnet50_ca2+imgJan30shuffle1_500000.h5',
    5,
    30,
)

Since on this data file we have performed a tracking on the cage, we will define a list of bodyparts we want to work with. 

In [None]:
# Body parts to work with; other tracked points in the file are left out.
bodyparts = [
    'nose',
    'rightear',
    'leftear',
    'head',
    'sp1',
    'sp2',
    'tail',
]

In [2]:
import pandas as pd 
# Load one tracking file (HDF5) into a DataFrame.
tracking_h5 = '/Volumes/ANNA_HD/ANALYSIS/EXPERIMENTS/2024/22-09-sr/analysis_tracked_roi/dlc/AD21-087-HAB2_bwDLC_resnet50_conductaJan11shuffle1_500000_filtered.h5'
data = pd.read_hdf(tracking_h5)

In [None]:
# Work on an independent (deep) copy so `data` itself is never modified.
df2 = data.copy(deep=True)

#### Get centroid from a list of bodyparts

One of the functions available in the `data.py` file calculates the centroid of the specified bodyparts in each dataframe. We can add these centroid coordinates (x, y) to our current dataframe. Let's try that!

#### Calculate interpolation

If the tracking of some bodyparts is not perfect, we can get the interpolation of those values with likelihood under a certain threshold. 

In [4]:
# Take a deep copy of the dataframe and work from there, leaving `data` untouched.
df2 = data.copy(deep=True)

In [6]:
# Body parts whose coordinates will be interpolated.
interp_bodyparts = ['lc', 'lr', 'rc', 'rr']

# Linear interpolation with a likelihood threshold of 0.95.
interpolator = dlc.data.Interpolation(
    interpolation_method='linear',
    threshold=0.95,
)
interpolator.get_interpolation(bodyparts=interp_bodyparts, df=df2)

Interpolating 1020 points for lc.
NaNs after interpolation for lc x: 0
NaNs after interpolation for lc y: 0
Interpolating 504 points for lr.
NaNs after interpolation for lr x: 0
NaNs after interpolation for lr y: 0
Interpolating 8641 points for rc.
NaNs after interpolation for rc x: 23
NaNs after interpolation for rc y: 23
Interpolating 7688 points for rr.
NaNs after interpolation for rr x: 23
NaNs after interpolation for rr y: 23


scorer,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000,DLC_resnet50_conductaJan11shuffle1_500000
bodyparts,nose,nose,nose,rightear,rightear,rightear,leftear,leftear,leftear,head,...,lc,lr,lr,lr,rc,rc,rc,rr,rr,rr
coords,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,...,likelihood,x,y,likelihood,x,y,likelihood,x,y,likelihood
0,192.576813,90.042381,0.989976,201.826263,77.582146,0.997141,209.012009,86.990158,0.998314,206.462845,...,0.999050,202.102158,67.029243,0.996905,,,,,,
1,192.472772,89.957989,0.995514,201.802455,77.554642,0.997967,208.990838,86.961843,0.998370,206.449585,...,0.998461,202.094839,66.990569,0.996441,,,,,,
2,191.727154,88.780883,0.998694,201.684684,77.226272,0.998939,208.665101,87.170066,0.999130,206.196418,...,0.998872,202.285588,66.670506,0.996053,,,,,,
3,189.864682,86.294754,0.998703,201.453250,75.604009,0.998922,206.270972,85.950256,0.999149,206.003571,...,0.998904,202.530027,66.403379,0.996617,,,,,,
4,189.715138,85.881709,0.999165,201.488727,75.449877,0.999673,206.168562,85.855557,0.999113,206.070615,...,0.998601,202.553277,66.400060,0.992115,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15871,185.534588,119.485684,0.999939,177.114119,130.294038,0.999337,167.374024,119.293206,0.999396,167.856163,...,0.999980,203.501753,79.485422,0.999978,275.948905,146.006132,0.992301,301.329633,148.732191,0.998251
15872,185.587940,119.425258,0.999938,177.143742,130.185756,0.999347,167.398549,119.306177,0.999397,167.888614,...,0.999980,203.481596,79.464383,0.999977,275.948905,146.006132,0.992301,301.329633,148.732191,0.998251
15873,185.551383,119.445118,0.999939,177.133494,130.186139,0.999363,167.378805,119.310071,0.999404,167.874367,...,0.999980,203.472673,79.484302,0.999976,275.948905,146.006132,0.992301,301.329633,148.732191,0.998251
15874,185.516088,119.435682,0.999937,177.132835,130.184303,0.999337,167.305087,119.283849,0.999393,167.822864,...,0.999980,203.507526,79.462233,0.999977,275.948905,146.006132,0.992301,301.329633,148.732191,0.998251


In [None]:
import numpy as np
# Manual version of the interpolation for a single body part ('nose'):
# coordinates whose likelihood is below `threshold` are blanked and then
# linearly interpolated.
DLCscorer = df2.columns[0][0]  # first level of the first column label (the scorer)
threshold = 0.95

nose_x = (DLCscorer, 'nose', 'x')
nose_y = (DLCscorer, 'nose', 'y')
likelihood_col = (DLCscorer, 'nose', 'likelihood')

# Snapshot of the nose columns BEFORE any change. The comparison plots further
# down read `nose`, which was previously never assigned in the notebook.
nose = df2.loc[:, (DLCscorer, 'nose', slice(None))].copy()

mask = df2[likelihood_col] < threshold
if mask.any():
    # Blank out the low-confidence points...
    df2.loc[mask, nose_x] = np.nan
    df2.loc[mask, nose_y] = np.nan

    # ...and fill them from their neighbours. Linear interpolation does not fill
    # leading NaNs, so a low-confidence start of the recording stays NaN.
    df2[nose_x] = df2[nose_x].interpolate(method='linear')
    df2[nose_y] = df2[nose_y].interpolate(method='linear')

# View of the nose columns AFTER interpolation.
nose2 = df2.loc[:, (DLCscorer, 'nose', slice(None))]
nose2

In [22]:
# NOTE: despite its name, `lc` holds the columns of the 'rr' body part, taken from
# `data` (not from the interpolated copy `df2`).
idx = pd.IndexSlice
lc = data.loc[:, idx[:, 'rr', :]]
lc.loc[:, idx[:, :, 'x']]

scorer,DLC_resnet50_conductaJan11shuffle1_500000
bodyparts,rr
coords,x
0,
1,
2,
3,
4,
...,...
15871,301.329633
15872,301.329633
15873,301.329633
15874,301.329633


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

# Trajectory of the body part stored in `lc`, coloured by tracking likelihood.
fig, ax = plt.subplots(figsize=(10, 10))

x = lc.loc[:, (slice(None), slice(None), 'x')].values
y = lc.loc[:, (slice(None), slice(None), 'y')].values
likelihood = lc.loc[:, (slice(None), slice(None), 'likelihood')].values.flatten()  # Flatten the array

# Fill gaps forwards, then backwards, so the plot has no NaNs. The filled arrays
# were previously computed but never used, which left NaNs in the axis limits
# (matplotlib rejects NaN limits). `.ffill()/.bfill()` replace the deprecated
# `fillna(method=...)`.
x_filled = pd.DataFrame(x).ffill().bfill().values
y_filled = pd.DataFrame(y).ffill().bfill().values
likelihood_filled = pd.Series(likelihood).ffill().bfill().values

# Create segments for LineCollection: one segment per pair of consecutive points
points = np.array([x_filled, y_filled]).T.reshape(-1, 1, 2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)

# Create a LineCollection object with segments and colormap.
# It is stored under its own name so the DataFrame `lc` is not overwritten and
# this cell can be re-run.
colormap = plt.cm.RdYlGn
line_collection = LineCollection(
    segments,
    cmap=colormap,
    norm=plt.Normalize(likelihood_filled.min(), likelihood_filled.max()),
    linewidth=0.5,
    alpha=0.5,
)
# One colour value per segment (there is one segment fewer than points).
line_collection.set_array(likelihood_filled[:-1])

ax.add_collection(line_collection)

# Add a colorbar to the plot to show what each color represents
cbar = fig.colorbar(line_collection, ax=ax)
cbar.set_label('Likelihood Value')

# add_collection does not autoscale the axes, so set the limits explicitly.
ax.set_xlim(x_filled.min(), x_filled.max())
ax.set_ylim(y_filled.min(), y_filled.max())

plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

def plot_data(ax, data, title):
    """
    Draw a trajectory on `ax`, coloured by likelihood.

    :param ax: Matplotlib axes to draw on.
    :param data: DataFrame with three-level columns whose last level contains
        'x', 'y' and 'likelihood' (selected with `.loc`).
    :param title: Title given to the axes.
    :return: The LineCollection added to `ax` (usable for a colorbar).
    """
    x = data.loc[:, (slice(None), slice(None), 'x')].values
    y = data.loc[:, (slice(None), slice(None), 'y')].values
    likelihood = data.loc[:, (slice(None), slice(None), 'likelihood')].values.flatten()

    # Create segments for LineCollection: one segment per pair of consecutive points
    points = np.array([x, y]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)

    # Create a LineCollection object with segments and colormap.
    # nanmin/nanmax ignore any NaN left in the data.
    colormap = plt.cm.RdYlGn
    norm = plt.Normalize(np.nanmin(likelihood), np.nanmax(likelihood))
    lc = LineCollection(segments, cmap=colormap, norm=norm, linewidth=0.5, alpha=0.5)
    # One colour value per segment (there is one segment fewer than points).
    lc.set_array(likelihood[:-1])

    ax.add_collection(lc)
    # NaN-aware limits: plain min()/max() return NaN when coordinates are missing,
    # and matplotlib refuses NaN axis limits.
    ax.set_xlim(np.nanmin(x), np.nanmax(x))
    ax.set_ylim(np.nanmin(y), np.nanmax(y))
    ax.set_title(title)

    return lc

# Side-by-side comparison of the trajectory before and after interpolation.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

lc1 = plot_data(ax1, nose, "Before Interpolation")
lc2 = plot_data(ax2, nose2, "After Interpolation")

# Reserve the right margin for the colorbar axes added by hand below.
# `plt.tight_layout()` was removed: it does not support axes created with
# `fig.add_axes` and would stretch the two panels under the colorbar.
fig.subplots_adjust(right=0.9)
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
# NOTE: the colorbar uses the colour scale of the right-hand panel (`lc2`).
cbar = fig.colorbar(lc2, cax=cbar_ax)
cbar.set_label('Likelihood Value')

plt.show()


In [None]:
# Same before/after comparison, without a colorbar.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
ax1, ax2 = axes
lc1 = plot_data(ax1, nose, "Before Interpolation")
lc2 = plot_data(ax2, nose2, "After Interpolation")

## ROI Drawer

Testing ROI Drawer

In [27]:
from dlc.analysis.time_roi import ROIDrawer
import pandas as pd

In [None]:
# Draw 4 ROIs on the given video; outputs go to `save_dir`.
roi_drawer = ROIDrawer(
    video_path='/Users/annateruel/Desktop/videos/7_WIN_20231010_094516_1.mp4',
    save_dir='/Users/annateruel/Desktop/videos/',
    num_rois=4,
)
r = roi_drawer.draw_rois()

## Testing time in ROI

1. Define ROIs

In [1]:
import dlc.analysis.time_roi as time_roi
import pandas as pd
import os

In [None]:
# Read one saved ROI file and turn its shapes into polygon objects.
roi_h5 = '/Users/annateruel/Desktop/videos/11_WIN_20231010_111425_1_roi.h5'
r = pd.read_hdf(roi_h5)

# One group per (index, shape-type) pair.
roi_groups = r.groupby(by=['index', 'shape-type'])

# extract_polygons returns a pair; only the first element is used here.
polygons, _ = time_roi.PolygonROI.extract_polygons(roi_groups)

2. Get time (s) in each ROI for a given dataframe

In [16]:
# Time-in-ROI calculator for 15 fps video, with every extracted polygon registered.
time_in_roi = time_roi.TimeinRoi(fps=15)
for poly in polygons:
    time_in_roi.add_roi(poly)

In [17]:
# Folder with the videos, and the tracking file of one video inside it.
directory = '/Users/annateruel/Desktop/videos/'
file_path = (
    '/Users/annateruel/Desktop/videos/'
    '11_WIN_20231010_111425_1DLC_resnet50_capsaicin_malesOct20shuffle1_200000_filtered.h5'
)

In [18]:
# Scorer name and body part that identify the columns to read from the tracking file.
scorer, bodypart = 'DLC_resnet50_capsaicin_malesOct20shuffle1_200000', 'Neck'

tracking_data = time_in_roi.extract_tracking_data(file_path, scorer, bodypart)

In [None]:
# Time spent in each registered ROI for the single tracking file loaded above.
time_spent = time_in_roi.time_in_rois(tracking_data)
time_spent

In [5]:
# NOTE(review): this call passes (directory, scorer, bodypart) positionally, while a
# later cell calls the same method with `rois=` and `body_part=` keywords. Confirm
# that these positions still match the current signature of `time_in_rois_dir`.
time_spent = time_in_roi.time_in_rois_dir(directory, scorer, bodypart)

In [3]:
# Folder scanned below for `*roi.h5` and `*filtered.h5` files.
video_dir = '/Users/annateruel/Desktop/videos/'

In [None]:
# Build {video key: polygons} from every ROI file in `video_dir`.
ROI_SUFFIX = '_roi.h5'

roi_polygons = {}
# Sorted so the processing order (and the printed log) is the same on every run.
for file in sorted(os.listdir(video_dir)):
    # Match on the same suffix that is stripped to build the key. The filter used
    # to be 'roi.h5' while the key removed '_roi.h5', so a file such as 'xroi.h5'
    # was stored under its full file name.
    if not file.endswith(ROI_SUFFIX):
        continue

    roi_file_path = os.path.join(video_dir, file)
    print(f"Processing ROI file: {file}")

    try:
        roi_data = pd.read_hdf(roi_file_path)
        roi_groups = roi_data.groupby(['index', 'shape-type'])
        polygons, _ = time_roi.PolygonROI.extract_polygons(roi_groups)
    except Exception as e:
        # Best effort: report the broken file and carry on with the others.
        print(f"Error processing {file}: {e}")
        continue

    # Create the key by removing the trailing '_roi.h5' only (str.replace would
    # also remove the same text from the middle of a name).
    key = file[:-len(ROI_SUFFIX)]
    roi_polygons[key] = polygons  # Store polygons with the modified key
    print(f"Extracted polygons from {file}: {len(polygons)}")

In [5]:
# Scorer name and body part used to pick columns from the tracking files.
scorer, bodypart = 'DLC_resnet50_capsaicin_malesOct20shuffle1_200000', 'Neck'

In [None]:
# Fresh calculator (15 fps), run over the directory with the per-video ROI polygons.
time_in_roi = time_roi.TimeinRoi(fps=15)
results_df = time_in_roi.time_in_rois_dir(
    directory=video_dir,
    scorer=scorer,
    body_part=bodypart,
    rois=roi_polygons,
)
results_df

In [14]:
# Write the results table next to the videos, without the index column.
results_filename = "time_in_rois_results.csv"
output_path = os.path.join(video_dir, results_filename)
results_df.to_csv(output_path, index=False)

In [None]:
# Check the keys in the roi_polygons dictionary
print("Keys in roi_polygons dictionary:")
for key in roi_polygons.keys():
    print(key)

# Check if the keys match the expected format
print("\nExpected keys based on tracking data files:")
for filename in os.listdir(video_dir):
    if filename.endswith('filtered.h5'):
        expected_key = filename.replace('_filtered.h5', '')
        print(expected_key)


## TRIALS

Testing some functions here: 

In [None]:
import dlc

In [None]:
# Inputs for the tracking plot below.
path = '/Users/annateruel/sr-ca2+img-anna-2023-11-17/videos/'
file_path = (
    '/Users/annateruel/sr-ca2+img-anna-2023-11-17/videos/'
    'AD22-56-neutro2-1DLC_dlcrnetms5_sr-ca2+imgNov17shuffle1_500000_filtered.h5'
)
bodyparts = [
    'nose',
    'rightear',
    'leftear',
    'head',
]
title = 'Demo Plot'

In [None]:
# Plot helper from the dlc package, created with the 'light' style.
density_plot = dlc.plotting.TrackingPlot(style='light')


In [None]:
# Run the plot over the `path` directory for the chosen body parts.
density_plot.plot_directory(path, bodyparts, title)


## Video clustering for labelling

I want to create a function that can analyze a set of videos, perform some form of dimensionality reduction and clustering on them, and then identify which videos are most distinct from each other. The videos are compared by the similarity of their (grayscale) pixel values, so that similar videos end up in the same cluster. From each cluster, one representative video can then be selected for labelling in the DeepLabCut model.

This is a test code to do that. 

In [None]:
import cv2
import numpy as np
import os
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

In [None]:
def load_and_preprocess_video(video_path, frame_count=100, frame_size=(100, 100)):
    """
    Read frames from the start of a video and turn each one into a flat grayscale vector.

    Frames are read sequentially from the beginning of the file; each is converted
    to grayscale, resized to `frame_size` and flattened.

    :param video_path: Path to the video file.
    :param frame_count: Stop after this many frames. Fewer are returned if the video
        is shorter. If this is not a positive integer the stop condition never
        triggers and every frame of the video is read.
    :param frame_size: (width, height) every frame is resized to.
    :return: Array of shape (n_frames, width * height). If no frame could be read
        (missing or unreadable file) the array has shape (0, width * height).
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of the video, or the file could not be opened/decoded.
                break
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            gray_frame = cv2.resize(gray_frame, frame_size)  # Resize for consistency
            frames.append(gray_frame.flatten())
            if len(frames) == frame_count:
                break
    finally:
        # Always free the capture handle, even if a frame fails to convert.
        cap.release()

    if not frames:
        # Keep the result 2-D even when empty, so callers can rely on the shape
        # (np.array([]) would be 1-D).
        return np.empty((0, frame_size[0] * frame_size[1]), dtype=np.uint8)
    return np.array(frames)

The function `load_and_preprocess_video` processes a single video. To create the `video_features` array needed for the PCA and clustering steps, we call it on each video in the directory and average its frames, which gives a single feature vector per video.

Here's how you can create a function to load and preprocess features for all videos in a directory, using your existing function:

In [None]:
def load_video_features_from_directory(video_directory, frame_count=100):
    """
    Loads and preprocesses video features for all videos in a directory.

    Each `.mp4` file is reduced to one feature vector: the mean over its
    preprocessed frames. Videos from which no frame could be read are skipped
    (with a printed message) so they do not add NaN rows to the feature matrix.

    :param video_directory: The path to the directory containing the videos.
    :param frame_count: The number of frames to extract from each video.
    :return: A tuple of an array of video features and a list of video paths.
        Row i of the array belongs to entry i of the list; skipped videos appear
        in neither.
    """
    # Sorted so the row order is the same on every run (os.listdir order is arbitrary).
    candidate_paths = [
        os.path.join(video_directory, f)
        for f in sorted(os.listdir(video_directory))
        if f.endswith('.mp4')
    ]
    video_features = []
    video_paths = []

    for video_path in candidate_paths:
        # Use the provided function to load and preprocess the video
        preprocessed_frames = load_and_preprocess_video(video_path, frame_count)
        if len(preprocessed_frames) == 0:
            # The mean of zero frames is NaN, which would break PCA/KMeans later.
            print(f"Skipping {video_path}: no frames could be read")
            continue
        # Calculate the mean across all frames to get a single feature vector per video
        video_features.append(np.mean(preprocessed_frames, axis=0))
        video_paths.append(video_path)

    return np.array(video_features), video_paths


In [None]:
# One feature vector per video found in the directory.
video_directory = "/Users/annateruel/sdt_videos/"
video_features, video_paths = load_video_features_from_directory(
    video_directory=video_directory,
)

The default number of clusters used in `main` below (15) is arbitrary and only meant as a starting point for the example. In practice, the optimal number of clusters depends on the specific characteristics and distribution of your data.

To determine the most appropriate number of clusters, you can use several methods:

1. *Elbow Method*: Plot the sum of squared distances of samples to their closest cluster center for a range of number of clusters. Look for the “elbow” where the rate of decrease sharply changes, which can be considered an indicator of the optimal number of clusters.
2. *Silhouette Score*: Calculate the mean silhouette coefficient over all samples. This gives a perspective into the density and separation of the formed clusters. The silhouette score ranges from -1 to 1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.
3. *Gap Statistic*: Compares the total intra-cluster variation for different values of k with its expected value under a null reference distribution of the data. The optimal k is the one that maximizes the gap statistic.
4. *Domain Knowledge*: Sometimes the optimal number of clusters is suggested by the context of the problem or domain expertise.

Determining the best number of clusters for a dataset without domain knowledge can be challenging because it often depends on the context of the data and the goal of the clustering. However, there are several statistical methods that can help you decide. I will outline two of the most commonly used methods:

1. **The elbow method**: This method involves plotting the sum of squared distances of the samples to their closest cluster center (inertia) as a function of the number of clusters, and picking the elbow of the curve as the number of clusters to use. The idea is to choose a small value of k that still has a low inertia.

2. **The silhouette method**: The silhouette value measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from -1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If many points have a high value, the clustering configuration is appropriate. If many points have a low or negative value, the clustering configuration may have too many or too few clusters.

In [None]:
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Function to calculate the Sum of Squared Distances (SSD) for different values of k
def calculate_ssd_for_k(reduced_features, k_range):
    """
    Fit one KMeans model per value in `k_range` and collect its inertia.

    :param reduced_features: Feature matrix passed to `KMeans.fit`.
    :param k_range: Iterable of cluster counts to try.
    :return: List with the inertia (sum of squared distances) for each k, in the
        order of `k_range`.
    """
    return [
        KMeans(n_clusters=n_clusters, random_state=42).fit(reduced_features).inertia_
        for n_clusters in k_range
    ]

# Function to calculate silhouette scores for different values of k
def calculate_silhouette_for_k(reduced_features, k_range):
    """
    Fit one KMeans model per value in `k_range` and score it with the silhouette score.

    :param reduced_features: Feature matrix passed to `KMeans.fit` and `silhouette_score`.
    :param k_range: Iterable of cluster counts to try.
    :return: List with the silhouette score for each k, in the order of `k_range`.
    """
    def _score(n_clusters):
        # Same seed for every k so the results are repeatable.
        model = KMeans(n_clusters=n_clusters, random_state=42).fit(reduced_features)
        return silhouette_score(reduced_features, model.labels_)

    return [_score(n_clusters) for n_clusters in k_range]

# `reduced_features` was previously only created inside `main`, so this cell failed
# with a NameError on a fresh kernel. Build it here from `video_features`, using up
# to 3 PCA components (never more than the data allows).
n_components = min(3, *video_features.shape)
reduced_features = PCA(n_components=n_components).fit_transform(video_features)

# Define the range of k you want to test.
# The silhouette score needs 2 <= k <= n_samples - 1, so cap the range for small
# video sets instead of letting KMeans / silhouette_score raise.
k_range = range(2, min(11, len(reduced_features)))

# Calculate SSD and silhouette scores
ssd = calculate_ssd_for_k(reduced_features, k_range)
silhouette_scores = calculate_silhouette_for_k(reduced_features, k_range)

# Plotting the Elbow Method
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(k_range, ssd, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')

# Plotting the Silhouette Method
plt.subplot(1, 2, 2)
plt.plot(k_range, silhouette_scores, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method For Optimal k')
plt.tight_layout()
plt.show()


In [None]:
import plotly.express as px 

def main(video_directory, n_clusters=15):
    """
    Cluster the videos of a directory and print one representative video per cluster.

    Steps: build one feature vector per `.mp4` video, reduce to 3 PCA components,
    cluster with KMeans, show an interactive 3D scatter plot, and print the first
    video of every cluster.

    :param video_directory: Directory containing the `.mp4` videos.
    :param n_clusters: Requested number of clusters; capped at the number of videos.
    :raises ValueError: If fewer than 3 videos are available (3 PCA components are
        needed for the 3D plot).
    """
    # Reuse the shared loader instead of repeating the per-video loop here.
    video_features, video_paths = load_video_features_from_directory(video_directory)

    if len(video_paths) < 3:
        raise ValueError(
            f"Need at least 3 videos for a 3-component PCA, found {len(video_paths)} in {video_directory}"
        )
    # KMeans cannot build more clusters than there are samples.
    n_clusters = min(n_clusters, len(video_paths))

    # Use 3 PCA components for 3D visualization
    pca = PCA(n_components=3)
    reduced_features = pca.fit_transform(video_features)

    # KMeans clustering (seeded, like the elbow/silhouette helpers, so the
    # representative videos are the same on every run)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(reduced_features)
    labels = kmeans.labels_

    # Create a 3D scatter plot using Plotly.
    # - `hover_name` puts the video file name in the hover box. Plotly Express sets
    #   its own hover template, so the `hoverinfo`/`text` trace properties used
    #   before did not show up.
    # - Labels are passed as strings so every cluster gets its own discrete colour.
    hover_texts = [os.path.basename(video_path) for video_path in video_paths]
    fig = px.scatter_3d(
        reduced_features, x=0, y=1, z=2,
        color=labels.astype(str),
        hover_name=hover_texts,
        labels={'0': 'PCA Component 1', '1': 'PCA Component 2', '2': 'PCA Component 3', 'color': 'Cluster'},
        title='PCA Clustering of Videos'
    )
    fig.update_traces(marker_size=8)

    fig.show()

    # Select one video from each cluster
    representative_videos = []
    for i in range(n_clusters):
        cluster_indices = np.where(labels == i)[0]
        if len(cluster_indices) == 0:
            # Defensive: never index into an empty cluster.
            continue
        representative_videos.append(video_paths[cluster_indices[0]])  # Select the first video of each cluster for simplicity

    print("Representative Videos:", representative_videos)

if __name__ == "__main__":
    main("/Users/annateruel/sdt_videos/")


## trying lr and loss plotting

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Training log exported as CSV with ';' as separator and ',' as decimal mark.
data_path = '/Users/annateruel/Desktop/learning_stats2.csv'
df = pd.read_csv(
    data_path,
    sep=';',
    decimal=',',
    engine='python',
)
df

In [None]:
plt.style.use("dark_background")


def _plot_series(ax, column, colour, label):
    """Plot df[column] against iterations on `ax`, with axis label and ticks in `colour`."""
    ax.plot(df['iterations'], df[column], color=colour, label=label)
    ax.set_ylabel(label, color=colour)
    ax.tick_params(axis='y', labelcolor=colour)


# Loss on the primary (left) y-axis
fig, ax1 = plt.subplots()
_plot_series(ax1, 'loss', 'orange', 'Loss')
ax1.set_xlabel('Iterations')

# Learning rate on a second (right) y-axis sharing the same x-axis
ax2 = ax1.twinx()
_plot_series(ax2, 'lr', 'purple', 'Learning Rate')

# One legend per axis, placed in opposite corners
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.title('Loss and Learning Rate Through Number of Iterations')
plt.show()

In [None]:
import pandas as pd
# Display the contents of an HDF5 file as a quick check.
collected_data_h5 = '/Users/annateruel/Downloads/CollectedData_Paola.h5'
pd.read_hdf(collected_data_h5)