In [1]:
!pip install librosa soundfile ipywidgets --quiet

import librosa
import numpy as np
import soundfile as sf
from google.colab import files
from ipywidgets import interact, FloatSlider
from IPython.display import Audio

# --- Upload files ---
# files.upload() opens the Colab file picker; the keys of the returned mapping
# are used below as the file names to load. Only the first file of each upload
# is used.
print("Upload first audio file (voice or piano):")
uploaded1 = files.upload()
if not uploaded1:
    # Cancelling the picker yields an empty mapping; fail with a clear message
    # instead of an opaque IndexError.
    raise ValueError("No first audio file was uploaded.")
fname1 = next(iter(uploaded1))

print("Upload second audio file (voice or piano):")
uploaded2 = files.upload()
if not uploaded2:
    raise ValueError("No second audio file was uploaded.")
fname2 = next(iter(uploaded2))

# --- Load audio ---
# sr=None asks librosa to keep each file's own sample rate; mono=True requests
# a single-channel signal.
audio1, sr1 = librosa.load(fname1, sr=None, mono=True)
audio2, sr2 = librosa.load(fname2, sr=None, mono=True)

# Bring the second clip to the first clip's sample rate when they differ.
# The rates are passed by keyword because librosa >= 0.10 no longer accepts
# them positionally (the keyword form also works on older releases).
if sr1 != sr2:
    audio2 = librosa.resample(audio2, orig_sr=sr2, target_sr=sr1)
sr = sr1

# Align lengths: truncate both clips to the shorter one so they can be
# combined sample-for-sample.
min_len = min(len(audio1), len(audio2))
audio1 = audio1[:min_len]
audio2 = audio2[:min_len]

# --- Fusion function with detuning and alpha modulation ---
def spectral_fusion(alpha=0.5, detune_cents=2.0, modulation_speed=0.2):
    """Blend the two loaded clips in the time-frequency domain and play the result.

    Reads the module-level ``audio1``, ``audio2`` (equal-length mono signals)
    and ``sr`` (their shared sample rate), and displays an audio player widget
    for the fused signal. Nothing is returned.

    Args:
        alpha: Base blend weight in [0, 1]; 0 is all ``audio1``, 1 is all
            ``audio2``.
        detune_cents: Pitch offset applied to ``audio2`` before blending, in
            cents (1/100 of a semitone). 0 disables detuning.
        modulation_speed: Number of sine cycles the blend weight completes
            over the whole clip. The weight swings +/-0.1 around ``alpha``
            and is clipped to [0, 1].

    Raises:
        ValueError: If the loaded audio is empty.
    """
    # Notebook shells inject `display` as a builtin; importing it explicitly
    # keeps the function usable when that injection is absent.
    from IPython.display import display

    if len(audio1) == 0:
        raise ValueError("Cannot fuse empty audio.")

    # Detune the second clip with a real pitch shift. Scaling FFT phase
    # angles by the cents ratio does not move any frequency content.
    if detune_cents:
        detuned2 = librosa.effects.pitch_shift(
            audio2, sr=sr, n_steps=detune_cents / 100.0
        )
    else:
        detuned2 = audio2

    # Short-time spectra, shape (n_bins, n_frames). Working per frame lets the
    # blend weight vary over time; a single whole-file FFT has no time axis.
    stft1 = librosa.stft(audio1)
    stft2 = librosa.stft(detuned2)
    mag1, mag2 = np.abs(stft1), np.abs(stft2)

    # Dynamic alpha (slow modulation): one weight per frame, broadcast across
    # all frequency bins of that frame.
    t = np.linspace(0, 1, stft1.shape[1])
    alpha_mod = np.clip(alpha + 0.1 * np.sin(2 * np.pi * modulation_speed * t), 0, 1)

    # Blend magnitudes linearly. Blend phases by interpolating unit phasors
    # rather than raw angles, so values near +pi and -pi average correctly
    # instead of jumping across the wrap-around.
    blended_mag = (1 - alpha_mod) * mag1 + alpha_mod * mag2
    phasor = (
        (1 - alpha_mod) * np.exp(1j * np.angle(stft1))
        + alpha_mod * np.exp(1j * np.angle(stft2))
    )
    blended_phase = np.angle(phasor)

    # Reconstruct audio at the original length.
    blended_stft = blended_mag * np.exp(1j * blended_phase)
    blended_audio = librosa.istft(blended_stft, length=len(audio1))

    # Normalize to unit peak; the epsilon avoids dividing by zero on silence.
    blended_audio = blended_audio / (np.max(np.abs(blended_audio)) + 1e-9)
    display(Audio(blended_audio, rate=sr))

# --- Interactive controls ---
# One slider per spectral_fusion parameter; the dict keys must match the
# function's keyword names, and insertion order fixes the on-screen order.
fusion_controls = {
    "alpha": FloatSlider(
        value=0.5, min=0, max=1, step=0.01, description='Fusion',
    ),
    "detune_cents": FloatSlider(
        value=2.0, min=0, max=10, step=0.1, description='Detune (cents)',
    ),
    "modulation_speed": FloatSlider(
        value=0.2, min=0, max=2.0, step=0.05, description='Alpha mod speed',
    ),
}
interact(spectral_fusion, **fusion_controls)

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 18.5 MB/s eta 0:00:00
Upload first audio file (voice or piano):


KeyboardInterrupt: 