In [None]:
!pip install torchaudio librosa
from google.colab import files
import torch
import torch.nn as nn
import torchaudio
import librosa
import numpy as np
from groq import Groq
from google.colab import userdata
import os  # required below; the cell's import list never brings `os` into scope

# Copy the Groq API key from Colab's secret store into the process
# environment, then read it back to build the API client.
os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)

print(" Upload your audio file (.wav, .mp3, etc.)")
uploaded = files.upload()

# `files.upload()` yields a mapping keyed by file name; it is empty when the
# dialog is dismissed, so fail with a clear message instead of an IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run the cell and choose an audio file.")

# Only the first uploaded file is analysed.
audio_path = next(iter(uploaded))

class AudioCNNRNN(nn.Module):
    """CNN + LSTM classifier over a sequence of single-channel 2-D slices.

    Each slice is passed through two Conv2d/ReLU/MaxPool2d stages
    (1 -> 32 -> 64 channels), averaged over its two spatial dimensions to a
    64-dimensional vector, and the resulting sequence of vectors is fed to an
    LSTM. The LSTM output at the last sequence position is projected to
    ``num_classes`` logits.

    Args:
        lstm_hidden_size: Hidden size of the LSTM layer.
        num_classes: Number of output logits.
    """

    def __init__(self, lstm_hidden_size=128, num_classes=2):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # input_size=64 matches the channel count produced by the last conv.
        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, channels, height, width)``;
                ``channels`` must be 1 to match the first convolution.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                "expected input of shape (batch, seq_len, channels, height, width), "
                f"got {tuple(x.shape)}"
            )
        batch_size, seq_len, c, h, w = x.shape
        # Fold the sequence axis into the batch axis so the CNN sees every
        # slice independently. `reshape` (unlike `view`) also accepts
        # non-contiguous inputs such as transposed or permuted tensors.
        c_in = x.reshape(batch_size * seq_len, c, h, w)
        features = self.cnn(c_in)
        # Global average over the two spatial dims -> one vector per slice.
        features = features.mean(dim=[2, 3])
        features = features.reshape(batch_size, seq_len, -1)
        lstm_out, _ = self.lstm(features)
        # Classify from the LSTM output at the final sequence position.
        return self.fc(lstm_out[:, -1, :])

def extract_mel_spectrogram(audio_path, sr=16000, n_mels=64):
    """Load an audio file and return its mel spectrogram in decibels.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        sr: Sample rate passed to both the loader and the mel transform.
        n_mels: Number of mel bands.

    Returns:
        The output of ``librosa.power_to_db`` applied to the mel power
        spectrogram, with the spectrogram's maximum as the reference
        (``ref=np.max``).
    """
    # The sample rate returned by the loader is not needed: `sr` is used.
    samples, _ = librosa.load(audio_path, sr=sr)
    power_spec = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(power_spec, ref=np.max)

def slice_spectrogram(mel_spec, slice_size=128, step=64):
    """Cut a 2-D spectrogram into fixed-width windows along axis 1.

    Args:
        mel_spec: 2-D array; windows are taken over its second axis.
        slice_size: Width of every window, in columns.
        step: Offset between the starts of consecutive windows.

    Returns:
        A list of ``mel_spec[:, start:start + slice_size]`` windows, each
        exactly ``slice_size`` columns wide. The list is empty when the
        input has fewer than ``slice_size`` columns.
    """
    # `+ 1` makes the last valid start offset inclusive; without it the final
    # full window is lost whenever (columns - slice_size) is a multiple of
    # `step`, and an input exactly `slice_size` wide produces no window.
    last_start = mel_spec.shape[1] - slice_size
    return [
        mel_spec[:, start:start + slice_size]
        for start in range(0, last_start + 1, step)
    ]

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): no trained weights are loaded anywhere in the visible code, so
# the model runs with its random initialisation -- confirm this is intended.
model = AudioCNNRNN().to(device)
model.eval()  # inference mode

# Turn the uploaded audio into overlapping spectrogram windows.
mel_spec = extract_mel_spectrogram(audio_path)
mel_slices = slice_spectrogram(mel_spec, slice_size=128, step=64)

if not mel_slices:
    raise RuntimeError("No mel slices generated. Check audio length.")

# Give every window a leading channel axis, stack the windows into one
# sequence, then add a batch axis of size 1: the result has the 5-D layout
# (batch, seq_len, channels, height, width) that the model's forward unpacks.
tensor_slices = [torch.tensor(s)[None, :, :] for s in mel_slices]
data = torch.stack(tensor_slices, dim=0)[None].to(device)

# Forward pass without gradient tracking; both names refer to the same tensor.
with torch.no_grad():
    outputs = logits = model(data)

# Dividing the logits by a temperature > 1 flattens the softmax distribution.
temperature = 3.0
probabilities = torch.softmax(logits / temperature, dim=-1)

# The script treats class index 0 as "AI" and index 1 as "human".
# NOTE(review): confirm this matches the label order used in training.
ai_probability = probabilities[0, 0].item()
human_probability = probabilities[0, 1].item()

# Bucket the gap between the two class probabilities into a coarse label.
diff = abs(ai_probability - human_probability)
confidence = "High" if diff >= 0.7 else ("Medium" if diff >= 0.3 else "Low")

# Prompt asking the LLM to phrase a short rationale for the detector verdict.
# NOTE(review): the LLM only receives the two probabilities and the
# confidence label, never the audio itself.
prompt = f"""
You are an AI audio analysis expert.
The detector outputs:
- AI-generated probability: {ai_probability:.4f}
- Human-generated probability: {human_probability:.4f}
- Confidence level: {confidence}

Give a short, human-readable explanation (1-2 sentences) of why the audio was likely classified as {'AI-generated' if ai_probability > human_probability else 'human-generated'}.
Base it on audio cues such as tone, pitch patterns, unnatural pauses, synthesis artifacts, or other hints you might infer.
Avoid repeating probabilities; focus on the reasoning.
"""

# Single-turn chat completion; the first choice's text is the explanation.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="llama-3.3-70b-versatile",
    temperature=0.6,
)
explanation = response.choices[0].message.content.strip()

# Report the result, one item per line.
print(
    f" AI-generated Probability: {ai_probability:.4f}",
    f" Confidence Level: {confidence}",
    f" Explanation: {explanation}",
    sep="\n",
)
