In [1]:
pip install numpy scipy opencv-python tqdm

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import numpy as np
import cv2
from scipy.io import wavfile
from scipy.signal import stft
from tqdm import tqdm

# === CONFIGURATION ===
AUDIO_FILE = '/Users/yashavula/Downloads/More More More.wav'     # Replace with your audio file path
OUTPUT_VIDEO = '/Users/yashavula/Desktop/Helios/Code/Helios V2/test_1_audio_visualizer.mp4'    # Output video file name
FRAME_DIR = '/Users/yashavula/Desktop/Helios/Code/Helios V2/frames'                     # Temp folder for saving frames
FRAME_HEIGHT, FRAME_WIDTH = 256, 256     # Size of each frame
N_FFT = 1024                             # FFT window size
HOP_LENGTH = 512                         # Hop length for STFT
FRAME_DURATION = 50                      # Number of STFT time slices per frame (~controls fps)

# === LOAD AUDIO ===
print("[INFO] Loading audio...")
sr, y = wavfile.read(AUDIO_FILE)
if y.ndim > 1:
    y = y.mean(axis=1)  # Convert to mono if stereo

# === COMPUTE STFT ===
print("[INFO] Computing spectrogram...")
_, _, Zxx = stft(y, fs=sr, nperseg=N_FFT, noverlap=N_FFT - HOP_LENGTH)
S_db = 20 * np.log10(np.abs(Zxx) + 1e-6)

# === NORMALIZE FOR VISUALIZATION ===
S_db_norm = 255 * (S_db - S_db.min()) / (S_db.max() - S_db.min())
S_db_norm = S_db_norm.astype(np.uint8)

# === GENERATE FRAMES ===
os.makedirs(FRAME_DIR, exist_ok=True)
frames = []
print("[INFO] Generating frames...")
for i in tqdm(range(0, S_db_norm.shape[1] - FRAME_DURATION, FRAME_DURATION)):
    slice_ = S_db_norm[:, i:i + FRAME_DURATION]
    resized = cv2.resize(slice_, (FRAME_WIDTH, FRAME_HEIGHT))
    colored = cv2.applyColorMap(resized, cv2.COLORMAP_MAGMA)
    frame_path = os.path.join(FRAME_DIR, f"frame_{i:05d}.png")
    cv2.imwrite(frame_path, colored)
    frames.append(colored)

# === CREATE VIDEO ===
fps = sr / HOP_LENGTH / FRAME_DURATION
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (FRAME_WIDTH, FRAME_HEIGHT))
print(f"[INFO] Writing video ({len(frames)} frames @ {fps:.2f} fps)...")
for f in frames:
    video.write(f)
video.release()

print(f"[DONE] Saved video to: {OUTPUT_VIDEO}")

[INFO] Loading audio...
[INFO] Computing spectrogram...
[INFO] Generating frames...


100%|██████████| 339/339 [00:00<00:00, 649.08it/s]


[INFO] Writing video (339 frames @ 1.72 fps)...
[DONE] Saved video to: /Users/yashavula/Desktop/Helios/Code/Helios V2/test_1_audio_visualizer.mp4


In [8]:
import os
import numpy as np
import cv2
from scipy.io import wavfile
from scipy.signal import stft
from tqdm import tqdm

AUDIO_FILE = '/Users/yashavula/Downloads/More More More.wav'
FRAME_DIR = '/Users/yashavula/Desktop/Helios/Code/Helios V2/frames'
MAG_VIDEO = "stft_magnitude.mp4"
PHASE_VIDEO = "stft_phase.mp4"

N_FFT = 1024
HOP = 512
FRAME_HEIGHT, FRAME_WIDTH = 256, 256
FRAME_DURATION = 50  # STFT slices per video frame

# Load audio
sr, y = wavfile.read(AUDIO_FILE)
if y.ndim > 1:
    y = y.mean(axis=1)

# Compute STFT
_, _, Zxx = stft(y, fs=sr, nperseg=N_FFT, noverlap=N_FFT - HOP)
magnitude = np.abs(Zxx)
phase = np.angle(Zxx)

# Normalize
mag_norm = 255 * (magnitude - magnitude.min()) / (magnitude.max() - magnitude.min())
phase_norm = 255 * (phase + np.pi) / (2 * np.pi)

mag_norm, phase_norm = mag_norm.astype(np.uint8), phase_norm.astype(np.uint8)

# Write frames
os.makedirs(FRAME_DIR, exist_ok=True)
mag_frames, phase_frames = [], []

for i in tqdm(range(0, mag_norm.shape[1] - FRAME_DURATION, FRAME_DURATION)):
    mag_chunk = mag_norm[:, i:i + FRAME_DURATION]
    phase_chunk = phase_norm[:, i:i + FRAME_DURATION]

    mag_resized = cv2.resize(mag_chunk, (FRAME_WIDTH, FRAME_HEIGHT))
    phase_resized = cv2.resize(phase_chunk, (FRAME_WIDTH, FRAME_HEIGHT))

    mag_frames.append(mag_resized)
    phase_frames.append(phase_resized)

# Write videos
fps = sr / HOP / FRAME_DURATION
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
mag_writer = cv2.VideoWriter(MAG_VIDEO, fourcc, fps, (FRAME_WIDTH, FRAME_HEIGHT), isColor=False)
phase_writer = cv2.VideoWriter(PHASE_VIDEO, fourcc, fps, (FRAME_WIDTH, FRAME_HEIGHT), isColor=False)

for m, p in zip(mag_frames, phase_frames):
    mag_writer.write(m)
    phase_writer.write(p)

mag_writer.release()
phase_writer.release()
print(f"[DONE] Encoded magnitude to {MAG_VIDEO}, phase to {PHASE_VIDEO}")

100%|██████████| 339/339 [00:00<00:00, 23403.33it/s]

[DONE] Encoded magnitude to stft_magnitude.mp4, phase to stft_phase.mp4





In [9]:
import cv2
import numpy as np
from scipy.signal import istft
from scipy.io.wavfile import write

MAG_VIDEO = "stft_magnitude.mp4"
PHASE_VIDEO = "stft_phase.mp4"
RECON_AUDIO = "reconstructed_audio.wav"

N_FFT = 1024
HOP = 512
FRAME_DURATION = 50
FRAME_HEIGHT, FRAME_WIDTH = 256, 256
SR = 44100  # Make sure this matches original

def load_video_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (FRAME_DURATION, N_FFT // 2 + 1))
        frames.append(resized)
    cap.release()
    return np.concatenate(frames, axis=1)

# Load videos
mag_stack = load_video_frames(MAG_VIDEO)
phase_stack = load_video_frames(PHASE_VIDEO)

# Denormalize
mag = (mag_stack / 255.0) * 1.0  # use max=1 since STFT was log-scaled
phase = (phase_stack / 255.0) * (2 * np.pi) - np.pi

# Reconstruct STFT
Zxx = mag * np.exp(1j * phase)
_, y = istft(Zxx, fs=SR, nperseg=N_FFT, noverlap=N_FFT - HOP)

# Save
write(RECON_AUDIO, SR, y.astype(np.float32))
print(f"[DONE] Reconstructed audio saved to {RECON_AUDIO}")

OpenCV: Couldn't read video stream from file "stft_magnitude.mp4"


ValueError: need at least one array to concatenate