In [10]:
# ----------------------------------------------------
# COLAB CELL 1: SETUP AND INITIALIZATION (FIXED)
# ----------------------------------------------------

# 1. Force reinstall a compatible version of Pillow (e.g., 9.5.0)
# This resolves the conflict that caused the '_Ink' ImportError.
!pip install -q Pillow==9.5.0

# 2. Reinstall other packages to ensure dependencies are fully resolved
# Use '-U --no-deps' for the second pass to avoid breaking Pillow again,
# but ensure the core libraries are present.
!pip install -q -U google-genai ultralytics opencv-python

# Import the necessary libraries
import os
import json
import numpy as np
import cv2
import time
import datetime
from ultralytics import YOLO
from PIL import Image # Should now import correctly
from google import genai
from google.genai import types

print("Pillow version fixed and models initialized.")

# --- USER CONFIGURATION and GLOBAL SETUP (Paste your original configuration here) ---

# 1. Video File Path (MUST MATCH the filename in your Colab files list)
VIDEO_FILE_PATH = "/content/02.mp4"

# 2. PASTE YOUR VALID GEMINI API KEY HERE
GEMINI_API_KEY = "<your api key>"

# --- HEURISTIC TUNING (Copy-paste your variables here) ---
FALL_THRESHOLD_Y = 0.75
MIN_HEIGHT_RATIO = 1.0
CRITICAL_WINDOW_SIZE = 8
ROUTINE_INTERVAL = 5

# --- GLOBAL SETUP ---
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    pose_model = YOLO('yolov8n-pose.pt')

    # Directory setup
    OUTPUT_DIR = "vlm_output_log"
    VLM_FRAMES_DIR = os.path.join(OUTPUT_DIR, "frames")
    os.makedirs(VLM_FRAMES_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("✅ All Models and Client Initialized. Ready for Cell 2.")
except Exception as e:
    print(f"🚨 Initialization Error: {e}")
    raise

Pillow version fixed and models initialized.
✅ All Models and Client Initialized. Ready for Cell 2.


In [11]:
# ----------------------------------------------------
# COLAB CELL 2: DETECTION, FILTERING, AND EXTRACTION FUNCTIONS
# ----------------------------------------------------

def analyze_video_for_events(video_path):
    """Scans video using YOLOv8-Pose and heuristic logic to find critical and routine timestamps."""

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # This will only happen if the video file is missing or corrupted
        raise FileNotFoundError(f"Cannot open video file at {video_path}")

    critical_events_log = []
    routine_timestamps = set()
    last_routine_time = -1

    print("\n--- 1. Starting YOLOv8 Event Detection (GPU Scan) ---")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        current_time_s = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

        # --- CRITICAL EVENT (FALL) DETECTION LOGIC ---
        results = pose_model(frame, verbose=False, classes=0) # Only detect people

        if results and len(results[0].boxes) > 0:
            keypoints = results[0].keypoints.xyn

            if len(keypoints[0]) >= 13: # Ensure sufficient keypoints for hip calculation
                hip_y = np.mean(keypoints[0][[11, 12], 1].cpu().numpy())
                box = results[0].boxes[0].xyxy[0].cpu().numpy()
                h_w_ratio = (box[3] - box[1]) / (box[2] - box[0])

                # Heuristic Check: Low Hip AND Horizontal Posture
                if hip_y > FALL_THRESHOLD_Y and h_w_ratio < MIN_HEIGHT_RATIO:
                    critical_events_log.append(current_time_s)

        # --- ROUTINE ACTIVITY SAMPLING ---
        if current_time_s - last_routine_time >= ROUTINE_INTERVAL:
            routine_timestamps.add(current_time_s)
            last_routine_time = current_time_s

        # Optimization: Process every 0.2 seconds to speed up scan
        cap.set(cv2.CAP_PROP_POS_MSEC, (current_time_s + 0.2) * 1000)

    cap.release()

    # --- Final Window Calculation ---
    critical_timestamps = set()
    if critical_events_log:
        first_detection = min(critical_events_log)
        last_detection = max(critical_events_log)

        start_time = max(0, first_detection - CRITICAL_WINDOW_SIZE / 2)
        end_time = last_detection + CRITICAL_WINDOW_SIZE / 2

        # Convert window to 1-second timestamps
        for t in range(int(start_time), int(end_time) + 1, 1):
            critical_timestamps.add(t)

        print(f"✅ Fall Incident Window Identified: [{start_time:.2f}s - {end_time:.2f}s]")
    else:
        print("⚠️ No Critical Incidents Detected. Routine log only.")

    all_vlm_timestamps = sorted(list(routine_timestamps.union(critical_timestamps)))
    print(f"Total Frames Filtered for VLM Analysis: {len(all_vlm_timestamps)}")

    return all_vlm_timestamps, critical_timestamps


def extract_keyframes(video_path, timestamps):
    """Extracts frames only at specified timestamps and saves them to disk."""
    cap = cv2.VideoCapture(video_path)
    extracted_files = []

    for t_sec in timestamps:
        cap.set(cv2.CAP_PROP_POS_MSEC, t_sec * 1000)
        ret, frame = cap.read()

        if ret:
            # Use fixed naming convention for later VLM processing
            filename = os.path.join(VLM_FRAMES_DIR, f"frame_{t_sec:.2f}s.jpg")
            cv2.imwrite(filename, frame)
            extracted_files.append((t_sec, filename))

    cap.release()
    print(f"✅ Extracted {len(extracted_files)} keyframes to '{VLM_FRAMES_DIR}'.")
    return extracted_files

In [12]:
# ----------------------------------------------------
# COLAB CELL 3: VLM CAPTIONING AND FINAL FUSION FUNCTIONS
# ----------------------------------------------------

def generate_vlm_captions(extracted_frames, critical_timestamps):
    """Sends filtered keyframes to Gemini VLM for description and stores the log."""
    MODEL_NAME = "gemini-2.5-flash"
    VISUAL_LOG_DATA = []

    print("\n--- 2. Starting VLM Caption Generation (API Calls) ---")

    for t_sec, frame_path in extracted_frames:
        try:
            img = Image.open(frame_path)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            # ----------------------------------------------------

            is_critical = int(t_sec) in critical_timestamps
            log_type = 'CRITICAL' if is_critical else 'ROUTINE'

            if is_critical:
                prompt_text = (
                    f"TIME {t_sec:.2f}s (CRITICAL INCIDENT): You are a forensic analyst. "
                    "Detail the patient's exact posture, direction, and position relative to the bed/floor. "
                    "Strictly report on activity and objects only."
                )
            else:
                prompt_text = (
                    f"TIME {t_sec:.2f}s (ROUTINE CHECK): Describe the patient's primary status (sleeping/sitting/resting) and "
                    "the overall room environment in one concise, objective sentence."
                )

            response = client.models.generate_content(
                model=MODEL_NAME,
                contents=[prompt_text, img],
                config=types.GenerateContentConfig(temperature=0.2)
            )

            VISUAL_LOG_DATA.append({
                'timestamp': t_sec,
                'type': log_type,
                'description': response.text.strip()
            })

        except Exception as e:
            print(f"🚨 Error processing VLM frame at {t_sec:.2f}s: {e}")

    return VISUAL_LOG_DATA


def generate_final_report(visual_log):
    """Formats the visual log into the final archival report using the LLM."""

    visual_text = "\n".join([f"[{item['timestamp']:.2f}s, {item['type']}]: {item['description']}" for item in visual_log])

    full_prompt = (
        "You are a Geriatric Incident Analyst. Your task is to generate a formal, structured daily archival log "
        "based *only* on the visual event log provided below. The goal is to provide a comprehensive summary "
        "for medical auditing. \n\n"
        "**Strict Output Format (Use only these two markdown headers):**\n"
        "1. ## CRITICAL INCIDENT REPORT ##: Generate a detailed narrative for all events marked 'CRITICAL'. Stitch sequential visual observations into a continuous, minute-by-minute account.\n"
        "2. ## ROUTINE ACTIVITY LOG ##: Generate a chronological log of all routine activity. Group continuous, long periods of activity (like sleeping) into single, time-block paragraphs.\n\n"
        "--- VISUAL EVENT LOG ---\n"
        f"{visual_text}\n"
        "--- END DATA ---"
    )

    print("\n--- 3. Starting Final Report Generation (LLM Reasoning) ---")
    report_response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=full_prompt,
        config=types.GenerateContentConfig(temperature=0.1)
    )

    return report_response.text.strip()

In [13]:
# ----------------------------------------------------
# COLAB CELL 4: MAIN EXECUTION BLOCK
# ----------------------------------------------------

def main():
    """Runs the entire vision pipeline."""
    start_time = time.time()

    # 1. Detection and Sampling
    all_vlm_timestamps, critical_timestamps = analyze_video_for_events(VIDEO_FILE_PATH)

    if not all_vlm_timestamps:
        print("🛑 PROCESS HALTED: No frames were selected for analysis.")
        return

    # 2. Keyframe Extraction
    extracted_frames = extract_keyframes(VIDEO_FILE_PATH, all_vlm_timestamps)

    # 3. VLM Captioning
    visual_log_data = generate_vlm_captions(extracted_frames, critical_timestamps)

    # 4. Save intermediate JSON log (Optional but useful for debugging)
    LOG_FILENAME_FINAL = os.path.join(OUTPUT_DIR, "visual_log_intermediate.json")
    with open(LOG_FILENAME_FINAL, 'w') as f:
        json.dump(visual_log_data, f, indent=4)

    # 5. Final Report Generation (Fusion)
    final_report_text = generate_final_report(visual_log_data)

    # 6. Display and Save Final Output
    print("\n" + "=" * 80)
    print("             FINAL VISION-ONLY DAILY ARCHIVAL LOG")
    print("=" * 80)
    # Print the synthesized report text
    print(final_report_text)
    print("=" * 80)

    archive_filename = "Vision_Only_Archival_Log.txt"
    with open(archive_filename, "w", encoding="utf-8") as f:
        f.write(final_report_text)

    # Download the file
    from google.colab import files
    files.download(archive_filename)

    end_time = time.time()
    print(f"\n✅ PROJECT SUCCESS. Total Runtime: {(end_time - start_time):.2f} seconds.")


# --- ACTIVATE THE PIPELINE ---
if __name__ == "__main__":
    main()


--- 1. Starting YOLOv8 Event Detection (GPU Scan) ---
✅ Fall Incident Window Identified: [0.00s - 12.24s]
Total Frames Filtered for VLM Analysis: 14
✅ Extracted 10 keyframes to 'vlm_output_log/frames'.

--- 2. Starting VLM Caption Generation (API Calls) ---

--- 3. Starting Final Report Generation (LLM Reasoning) ---

             FINAL VISION-ONLY DAILY ARCHIVAL LOG
## CRITICAL INCIDENT REPORT ##

At **0.00s**, the patient was observed standing upright on a red and white checkered rug. Their head and upper torso were slightly inclined forward, with both arms positioned downwards, the left arm exhibiting a slight bend at the elbow. The patient was facing towards the right side of the image, generally oriented towards the bed and the window beyond it, appearing to be in a standing or paused walking stance. They were positioned to the left of the bed, not in direct physical contact with it. The patient was wearing a light brown/khaki short-sleeved t-shirt and dark blue/black trousers.



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ PROJECT SUCCESS. Total Runtime: 189.56 seconds.
