# Figure 7 Behavioral Data Demonstration

This notebook demonstrates how to access and analyze the behavioral data from Figure 7 of the Zhai et al. 2025 paper after running the conversion script.

The behavioral data includes AIM (Abnormal Involuntary Movement) scoring for dyskinesia assessment in CDGI knockout mice.

In [None]:
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pynwb import NWBHDF5IO

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-versioned 'seaborn' name. Pick whichever one
# this installation provides so the cell does not fail with an OSError.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

# Path to the NWB files (relative to the notebook's working directory)
nwb_dir = Path('../nwb_files/figure_7_behavioral')
print(f"Looking for NWB files in: {nwb_dir}")
print(f"Directory exists: {nwb_dir.exists()}")

## 1. Load and Examine Behavioral Data

First, let's load all the behavioral NWB files and examine their structure.

In [None]:
# Find all NWB files.
# Path.glob yields entries in a filesystem-dependent order, so sort the result:
# this keeps the "sample" file (nwb_files[0]) and the order in which files are
# processed identical from run to run and from machine to machine.
nwb_files = sorted(nwb_dir.glob('*.nwb'))
print(f"Found {len(nwb_files)} NWB files")

# Display first few files
max_listed = 5
for position, nwb_path in enumerate(nwb_files[:max_listed], start=1):
    print(f"  {position}. {nwb_path.name}")

if len(nwb_files) > max_listed:
    print(f"  ... and {len(nwb_files) - max_listed} more files")

In [None]:
# Load and examine a sample NWB file.
# Fail early with an actionable message instead of a bare IndexError when the
# directory is missing or empty.
if not nwb_files:
    raise FileNotFoundError(
        f"No NWB files found in {nwb_dir}; run the conversion script first."
    )

sample_file = nwb_files[0]
print(f"Examining sample file: {sample_file.name}")

with NWBHDF5IO(str(sample_file), 'r') as io:
    nwbfile = io.read()

    # --- Session / subject metadata ---
    print("\nNWB File Information:")
    print(f"  Session description: {nwbfile.session_description}")
    print(f"  Subject: {nwbfile.subject.subject_id} ({nwbfile.subject.genotype})")
    print(f"  Session start time: {nwbfile.session_start_time}")

    # --- Processing modules and the time series they contain ---
    print("\nProcessing modules:")
    for module_name, module in nwbfile.processing.items():
        print(f"  {module_name}: {module.description}")

        for data_interface_name, data_interface in module.data_interfaces.items():
            print(f"    {data_interface_name}:")

            # Only container-style interfaces expose a `time_series` mapping.
            if hasattr(data_interface, 'time_series'):
                for ts_name, ts in data_interface.time_series.items():
                    print(f"      {ts_name}: {ts.description[:100]}...")
                    print(f"        Data shape: {ts.data.shape}")
                    # A TimeSeries stores either explicit timestamps or a
                    # starting_time + rate pair; slicing None would raise.
                    if ts.timestamps is not None:
                        print(f"        Timestamps: {ts.timestamps[:3]}... (first 3)")
                    else:
                        print(f"        Timestamps: none stored "
                              f"(starting_time={ts.starting_time}, rate={ts.rate})")

    # --- Interval tables ---
    print("\nTime intervals:")
    for interval_name, interval in nwbfile.intervals.items():
        print(f"  {interval_name}: {interval.description}")
        print(f"    Number of intervals: {len(interval)}")

## 2. Extract AIM Scoring Data

Let's extract the AIM scoring data from all files and organize it by genotype.

In [None]:
# Collect AIM scores from every NWB file, pooled per genotype.
# aim_data[genotype][field] is a flat list with one entry per scored time point;
# animal_info maps subject_id -> metadata of the file it was read from.
aim_data = defaultdict(lambda: defaultdict(list))
animal_info = {}

# Column index of each score inside one row of the 'AIM_scores' data array
# (column layout as stated by the original comment: [axial, limb, orolingual, total]).
score_columns = {'axial': 0, 'limb': 1, 'orolingual': 2, 'total': 3}

for nwb_path in nwb_files:
    with NWBHDF5IO(str(nwb_path), 'r') as io:
        nwbfile = io.read()

        # Subject / session metadata
        subject = nwbfile.subject
        subject_id = subject.subject_id
        genotype = subject.genotype
        session_date = nwbfile.session_start_time.strftime('%Y-%m-%d')

        # NOTE(review): keyed by subject_id, so if a subject has more than one
        # NWB file only the last file's metadata is kept — confirm whether the
        # conversion writes one file per animal or one per session.
        animal_info[subject_id] = dict(
            genotype=genotype,
            session_date=session_date,
            file_name=nwb_path.name,
        )

        # Skip files that carry no AIM scoring
        if 'behavior' not in nwbfile.processing:
            continue
        interfaces = nwbfile.processing['behavior'].data_interfaces
        if 'AIM_scoring' not in interfaces:
            continue

        aim_scores_ts = interfaces['AIM_scoring'].time_series['AIM_scores']
        scores_data = aim_scores_ts.data[:]
        # Timestamps are divided by 60 to express them in minutes
        minutes = aim_scores_ts.timestamps[:] / 60.0

        # zip stops at the shorter sequence, i.e. time points without a
        # matching score row are ignored.
        for minute, score_row in zip(minutes, scores_data):
            pooled = aim_data[genotype]
            pooled['time_points'].append(minute)
            for score_name, column in score_columns.items():
                pooled[score_name].append(score_row[column])
            pooled['animal_id'].append(subject_id)
            pooled['session_date'].append(session_date)

# Freeze the outer defaultdict and report what was collected
aim_data = dict(aim_data)
print(f"Extracted AIM data for {len(aim_data)} genotypes:")
for genotype, data in aim_data.items():
    n_points = len(data['total'])
    n_animals = len(set(data['animal_id']))
    print(f"  {genotype}: {n_points} data points from {n_animals} animals")

## 3. Reproduce Figure 7J: AIM Scoring Analysis

Now let's reproduce the analysis from Figure 7J, showing AIM scores across different categories and genotypes.

In [None]:
# Calculate statistics for each genotype and category.
# One row per (genotype, category) with mean / SD / SEM / n and the raw scores.
categories = ['total', 'axial', 'limb', 'orolingual']
stats_data = []

for genotype, data in aim_data.items():
    for category in categories:
        scores = np.array(data[category])
        # Remove missing values (-1); NaN entries also fail the comparison
        valid_scores = scores[scores >= 0]
        n_valid = len(valid_scores)
        if n_valid == 0:
            continue

        # Use the sample standard deviation (ddof=1) so that the SEM is the
        # conventional s / sqrt(n) and agrees with the time-course cell, which
        # relies on the pandas default (ddof=1). A single observation has no
        # defined sample SD, hence NaN.
        sample_std = np.std(valid_scores, ddof=1) if n_valid > 1 else np.nan

        # NOTE(review): n counts pooled observations (every scored time point
        # of every file), not animals — confirm this is the intended unit
        # before interpreting the error bars.
        stats_data.append({
            'genotype': genotype,
            'category': category,
            'mean': np.mean(valid_scores),
            'std': sample_std,
            'sem': sample_std / np.sqrt(n_valid),
            'n': n_valid,
            'scores': valid_scores
        })

# Convert to DataFrame for easier manipulation
stats_df = pd.DataFrame(stats_data)
print("AIM Scoring Statistics:")
if stats_df.empty:
    # pivot() would raise a KeyError on a frame without columns
    print("  (no valid AIM scores found)")
else:
    print(stats_df.pivot(index='category', columns='genotype', values='mean').round(2))

In [None]:
# Create Figure 7J reproduction: one bar panel per AIM category, with the
# individual observations overlaid on each genotype's bar.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
fig.suptitle('Figure 7J: AIM Scoring in CDGI Knockout Mice', fontsize=16, fontweight='bold')

# Define colors for genotypes; genotypes not listed here fall back to grey.
colors = {'CDGI_KO': '#2E86AB', 'WT': '#A23B72', 'unknown': '#F18F01'}

# Seeded generator so the jitter (and therefore the figure) is reproducible
jitter_rng = np.random.default_rng(0)

for i, category in enumerate(categories):
    ax = axes[i]

    # Customize subplot
    ax.set_title(category.capitalize(), fontsize=14, fontweight='bold')
    ax.set_ylabel('AIM Score' if i == 0 else '', fontsize=12)
    ax.set_xlabel('Genotype', fontsize=12)
    ax.grid(True, alpha=0.3)

    # Get data for this category
    category_data = stats_df[stats_df['category'] == category]
    if category_data.empty:
        # Nothing to draw; max() on an empty sequence would raise below
        ax.text(0.5, 0.5, 'No data', transform=ax.transAxes,
                ha='center', va='center', fontsize=12)
        continue

    # Plot bars (string x-values are placed at positions 0, 1, 2, ...)
    genotypes = category_data['genotype'].values
    means = category_data['mean'].values
    sems = category_data['sem'].values

    ax.bar(genotypes, means, yerr=sems, capsize=5,
           color=[colors.get(g, '#808080') for g in genotypes],
           alpha=0.7, edgecolor='black', linewidth=1)

    # Add individual data points. iterrows() yields (index label, row); the
    # label is not needed, the bar position comes from enumerate().
    for j, (_, row) in enumerate(category_data.iterrows()):
        scores = row['scores']
        # Add some jitter to x-coordinates for better visualization
        x_jitter = jitter_rng.normal(j, 0.05, len(scores))
        ax.scatter(x_jitter, scores, color='black', alpha=0.4, s=20)

    # Scale the y-axis to the tallest drawn element (error-bar tip or single
    # observation) so that no data point is clipped; fall back to 1 when all
    # values are zero to avoid a degenerate (0, 0) axis.
    bar_tops = means + np.nan_to_num(sems)
    highest_point = max(float(np.max(s)) for s in category_data['scores'])
    y_top = max(float(np.max(bar_tops)), highest_point)
    if not y_top > 0:
        y_top = 1.0
    ax.set_ylim(0, y_top * 1.2)

    # Add sample size annotations above the data
    for j, (_, row) in enumerate(category_data.iterrows()):
        ax.text(j, y_top * 1.1, f'n={row["n"]}',
                ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

## 4. Time Course Analysis

Let's analyze how AIM scores change over time following L-DOPA administration.

In [None]:
# Build a long-format table for the time course analysis: one record per
# scored time point whose total score is valid (>= 0).
time_course_data = [
    {
        'genotype': genotype,
        'time_point': time_point,
        'total_score': total,
        'axial_score': axial,
        'limb_score': limb,
        'orolingual_score': orolingual,
        'animal_id': animal_id
    }
    for genotype, data in aim_data.items()
    for time_point, total, axial, limb, orolingual, animal_id in zip(
        data['time_points'],
        data['total'],
        data['axial'],
        data['limb'],
        data['orolingual'],
        data['animal_id'],
    )
    if total >= 0  # keep valid scores only
]

time_course_df = pd.DataFrame(time_course_data)

n_observations = len(time_course_df)
unique_time_points = sorted(time_course_df['time_point'].unique())
unique_genotypes = time_course_df['genotype'].unique()
print(f"Time course data: {n_observations} observations")
print(f"Time points: {unique_time_points}")
print(f"Genotypes: {unique_genotypes}")

In [None]:
# Plot time course: mean +/- SEM of each AIM score per time point and genotype,
# one panel per score.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('AIM Score Time Course Following L-DOPA Administration', fontsize=16, fontweight='bold')

# Deliberately NOT named `categories`: that name holds the AIM category list
# used by the statistics and Figure 7J cells, and rebinding it here would break
# those cells when they are re-run after this one.
score_columns = ['total_score', 'axial_score', 'limb_score', 'orolingual_score']
titles = ['Total', 'Axial', 'Limb', 'Orolingual']

# axes.flat walks the 2x2 grid row by row
for ax, score_column, title in zip(axes.flat, score_columns, titles):

    # Calculate means and SEMs for each time point and genotype
    for genotype in time_course_df['genotype'].unique():
        genotype_data = time_course_df[time_course_df['genotype'] == genotype]

        # Group by time point and calculate statistics
        # (pandas std uses ddof=1; a single observation yields NaN)
        time_stats = (
            genotype_data
            .groupby('time_point')[score_column]
            .agg(['mean', 'std', 'count'])
            .reset_index()
        )
        time_stats['sem'] = time_stats['std'] / np.sqrt(time_stats['count'])

        # Plot line with error bars; unlisted genotypes are drawn in grey
        ax.errorbar(time_stats['time_point'], time_stats['mean'],
                    yerr=time_stats['sem'],
                    marker='o', markersize=6, linewidth=2, capsize=5,
                    color=colors.get(genotype, '#808080'),
                    label=genotype)

    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Time post-L-DOPA (min)', fontsize=12)
    ax.set_ylabel('AIM Score', fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xlim(15, 125)
    ax.set_ylim(0, None)

plt.tight_layout()
plt.show()

## 5. Individual Animal Analysis

Let's examine individual animal responses to understand variability within genotypes.

In [None]:
# Create individual animal profiles: one panel per genotype, one line per animal
# showing its total AIM score over time.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Individual Animal AIM Score Profiles', fontsize=16, fontweight='bold')

# Computed once instead of on every loop iteration
available_genotypes = set(time_course_df['genotype'].unique())

for ax, genotype in zip(axes, ['CDGI_KO', 'WT']):
    ax.set_title(f'{genotype} Animals', fontsize=14, fontweight='bold')

    if genotype not in available_genotypes:
        # Label the panel rather than leaving a blank, untitled axes, so that a
        # genotype-name mismatch with the NWB files is visible to the reader.
        ax.text(0.5, 0.5, f'No data for genotype {genotype!r}',
                transform=ax.transAxes, ha='center', va='center', fontsize=12)
        continue

    genotype_data = time_course_df[time_course_df['genotype'] == genotype]

    # Plot each animal's time course, ordered by time
    for animal_id in genotype_data['animal_id'].unique():
        animal_data = (
            genotype_data[genotype_data['animal_id'] == animal_id]
            .sort_values('time_point')
        )

        ax.plot(animal_data['time_point'], animal_data['total_score'],
                marker='o', alpha=0.6, linewidth=1, markersize=4,
                label=animal_id)

    ax.set_xlabel('Time post-L-DOPA (min)', fontsize=12)
    ax.set_ylabel('Total AIM Score', fontsize=12)
    # Legend is placed outside the axes, to the right of the panel
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    ax.set_xlim(15, 125)
    ax.set_ylim(0, None)

plt.tight_layout()
plt.show()

## 6. Summary Statistics

Let's generate summary statistics for the behavioral data.

In [None]:
# Summary statistics table: one row per (genotype, score category) computed
# over all valid observations in time_course_df.
summary_stats = []

for genotype in time_course_df['genotype'].unique():
    genotype_data = time_course_df[time_course_df['genotype'] == genotype]

    # Number of distinct animals contributing to this genotype
    n_animals = genotype_data['animal_id'].nunique()

    # Score statistics
    for category in ['total_score', 'axial_score', 'limb_score', 'orolingual_score']:
        scores = genotype_data[category]
        summary_stats.append({
            'genotype': genotype,
            'category': category.replace('_score', ''),
            'n_animals': n_animals,
            'n_observations': len(scores),
            'mean': np.mean(scores),
            # Population SD (ddof=0), which is what np.std computes by default;
            # spelled out because pandas' own .std() defaults to ddof=1.
            'std': np.std(scores, ddof=0),
            'median': np.median(scores),
            'min': np.min(scores),
            'max': np.max(scores)
        })

summary_df = pd.DataFrame(summary_stats)
print("Summary Statistics:")
print(summary_df.round(2))

## 7. Export Results

Finally, let's export the processed data for further analysis.

In [None]:
# Export processed data as CSV files
output_dir = Path('../analysis_outputs')
output_dir.mkdir(exist_ok=True)

# Animal metadata: one row per key of animal_info, key stored as the index
animal_info_df = pd.DataFrame.from_dict(animal_info, orient='index')

# (file name, table, write index?) for every export of this run
exports = [
    ('figure_7_aim_time_course.csv', time_course_df, False),     # time course data
    ('figure_7_aim_summary_stats.csv', summary_df, False),       # summary statistics
    ('figure_7_animal_info.csv', animal_info_df, True),          # animal information
]

written_files = []
for file_name, table, keep_index in exports:
    target = output_dir / file_name
    table.to_csv(target, index=keep_index)
    written_files.append(target)

# Report exactly what this run wrote, in write order, rather than globbing the
# directory (which would also list stale files left over from earlier runs).
print(f"Analysis results exported to: {output_dir}")
print("Files created:")
for target in written_files:
    print(f"  - {target.name}")

## Conclusion

This notebook demonstrates how to:

1. **Load behavioral NWB files** generated by the conversion script
2. **Extract AIM scoring data** and organize it by genotype
3. **Reproduce Figure 7J** showing dyskinesia scores across categories
4. **Analyze time course data** to understand L-DOPA response dynamics
5. **Examine individual animal variability** within genotypes
6. **Generate summary statistics** for the behavioral data
7. **Export results** for further analysis

The NWB format provides a standardized way to store and access behavioral data, making it easy to reproduce the original analyses and perform new investigations on the CDGI knockout behavioral phenotype.