# Testing Jhana AI Pipeline

# Test Pipeline

In [None]:
!pip install numpy==1.26.4
!pip install ollama==0.1.7
!pip install openai_whisper==20231117
!pip install scipy==1.12.0
!pip install sounddevice==0.4.6
!pip install torch==2.2.0
!pip install torchaudio==2.2.0
!pip install TTS==0.22.0


In [1]:
import os
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write
import whisper
from IPython.display import Audio
import ollama
import torch
from TTS.api import TTS

# Function to record audio
def record_audio(duration=5, fs=44100):
    """Record a two-channel clip with sounddevice and return it as int16 PCM.

    The capture is taken as float64, peak-normalised so the loudest sample
    maps to +/-32767, and then converted to int16.

    Args:
        duration: Length of the recording in seconds.
        fs: Sample rate in Hz.

    Returns:
        A tuple ``(recording, fs)``: the int16 sample array and the sample
        rate that was used for the capture.
    """
    print("Recording for {} seconds...".format(duration))
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=2, dtype='float64')
    sd.wait()  # Block until the capture has finished

    # A completely silent (or empty) capture has no usable peak. Dividing by it
    # would produce NaNs (or raise on an empty array), so only rescale when
    # there is a finite, non-zero peak; otherwise the samples are kept as-is.
    peak = np.max(np.abs(recording)) if recording.size else 0.0
    if np.isfinite(peak) and peak > 0:
        recording = recording / peak * 32767
    return recording.astype(np.int16), fs

# Record a five-second clip and store it as a WAV file for transcription
output_directory = "../data/input/audio/speech_to_transcribe"
audio_file_path = os.path.join(output_directory, "my_voice_recording.wav")
os.makedirs(output_directory, exist_ok=True)
audio, fs = record_audio(duration=5)
write(filename=audio_file_path, rate=fs, data=audio)
print(f"Recording saved to {audio_file_path}")

# Transcribe the saved recording with the Whisper "small" model, forcing English
model = whisper.load_model(name="small")
result = model.transcribe(audio=audio_file_path, language="en")
transcribed_text = result["text"]
print("Transcribed text:", transcribed_text)

# Send the transcript to the Mixtral model through Ollama as a single user turn
ollama_response = ollama.chat(
    model='mixtral:8x7b-instruct-v0.1-q4_0',
    messages=[
        {'role': 'user', 'content': transcribed_text},
    ],
)
ollama_text = ollama_response['message']['content']
print("Ollama response:", ollama_text)

# Save Ollama's response as text
output_text_directory = "../data/output/text/"
os.makedirs(output_text_directory, exist_ok=True)
text_file_path = os.path.join(output_text_directory, "ollama_response.txt")
# Write as UTF-8 explicitly: model replies can contain non-ASCII characters,
# and the platform default encoding (e.g. cp1252 on Windows) may fail on them.
with open(text_file_path, "w", encoding="utf-8") as text_file:
    text_file.write(ollama_text)
print(f"Ollama's response saved to {text_file_path}")

# Turn the reply into speech, running the TTS model on CUDA when it is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)  # Swap the model name here if needed
output_audio_directory = "../data/output/audio/"
output_file_path = os.path.join(output_audio_directory, "ollama_response.wav")
os.makedirs(output_audio_directory, exist_ok=True)
tts.tts_to_file(
    text=ollama_text,
    file_path=output_file_path,
)
print(f"Text-to-speech audio saved to {output_file_path}")

# Show an inline audio player for the synthesized file, or report that it is missing
if not os.path.exists(output_file_path):
    print("Audio file not found.")
else:
    display(Audio(output_file_path))


Recording for 5 seconds...
Recording saved to ../data/input/audio/speech_to_transcribe/my_voice_recording.wav
Transcribed text:  Please guide me in a Jhana meditation.
Ollama response:  Sure, I'd be happy to help you with that! Jhana meditation, also known as deep concentration or absorption meditation, is a practice that involves focusing your mind on a single object or sensation to the exclusion of all else. Here are some steps you can follow to practice Jhana meditation:

1. Find a quiet and comfortable place to sit where you won't be disturbed.
2. Set a timer for your meditation session. It's recommended to start with shorter periods of time, such as 10-15 minutes, and gradually increase the length of your sessions as you become more experienced.
3. Sit in a comfortable position with your back straight. You can sit on a cushion or chair, whichever is most comfortable for you.
4. Close your eyes and take a few deep breaths to help yourself relax.
5. Begin to focus your attention on 

# Test Pipeline with Voice Cloning

In [2]:
import os
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write
import whisper
from IPython.display import Audio
import ollama
import torch
from TTS.api import TTS

# Function to record audio
def record_audio(duration=5, fs=44100):
    """Record a two-channel clip with sounddevice and return it as int16 PCM.

    The capture is taken as float64, peak-normalised so the loudest sample
    maps to +/-32767, and then converted to int16.

    Args:
        duration: Length of the recording in seconds.
        fs: Sample rate in Hz.

    Returns:
        A tuple ``(recording, fs)``: the int16 sample array and the sample
        rate that was used for the capture.
    """
    print("Recording for {} seconds...".format(duration))
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=2, dtype='float64')
    sd.wait()  # Block until the capture has finished

    # A completely silent (or empty) capture has no usable peak. Dividing by it
    # would produce NaNs (or raise on an empty array), so only rescale when
    # there is a finite, non-zero peak; otherwise the samples are kept as-is.
    peak = np.max(np.abs(recording)) if recording.size else 0.0
    if np.isfinite(peak) and peak > 0:
        recording = recording / peak * 32767
    return recording.astype(np.int16), fs

# Record a five-second clip and store it as a WAV file for transcription
output_directory = "../data/input/audio/speech_to_transcribe"
audio_file_path = os.path.join(output_directory, "my_voice_recording.wav")
os.makedirs(output_directory, exist_ok=True)
audio, fs = record_audio(duration=5)
write(filename=audio_file_path, rate=fs, data=audio)
print(f"Recording saved to {audio_file_path}")

# Transcribe the saved recording with the Whisper "small" model, forcing English
model = whisper.load_model(name="small")
result = model.transcribe(audio=audio_file_path, language="en")
transcribed_text = result["text"]
print("Transcribed text:", transcribed_text)

# Send the transcript to the Mixtral model through Ollama as a single user turn
ollama_response = ollama.chat(
    model='mixtral:8x7b-instruct-v0.1-q4_0',
    messages=[
        {'role': 'user', 'content': transcribed_text},
    ],
)
ollama_text = ollama_response['message']['content']
print("Ollama response:", ollama_text)

# Save Ollama's response as text
output_text_directory = "../data/output/text/"
os.makedirs(output_text_directory, exist_ok=True)
text_file_path = os.path.join(output_text_directory, "ollama_response.txt")
# Write as UTF-8 explicitly: model replies can contain non-ASCII characters,
# and the platform default encoding (e.g. cp1252 on Windows) may fail on them.
with open(text_file_path, "w", encoding="utf-8") as text_file:
    text_file.write(ollama_text)
print(f"Ollama's response saved to {text_file_path}")

# Turn the reply into speech with the XTTS v2 model, conditioning on a
# reference recording of the voice to clone; runs on CUDA when available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)  # Swap the model name here if needed
output_audio_directory = "../data/output/audio/"
output_file_path = os.path.join(output_audio_directory, "ollama_response.wav")
os.makedirs(output_audio_directory, exist_ok=True)
tts.tts_to_file(
    text=ollama_text,
    file_path=output_file_path,
    language="en",
    speaker_wav="../data/input/audio/voices_to_clone/audio_cf_10_seconds.wav",
)
print(f"Text-to-speech audio saved to {output_file_path}")

# Show an inline audio player for the synthesized file, or report that it is missing
if not os.path.exists(output_file_path):
    print("Audio file not found.")
else:
    display(Audio(output_file_path))


Recording for 5 seconds...
Recording saved to ../data/input/audio/speech_to_transcribe/my_voice_recording.wav
Transcribed text:  Hi, Jana. Can you tell me a poo joke, please?
Ollama response:  Sure, here's a poo joke for you:

Why did the poo roll down the hill?

To become a poo-tato!
Ollama's response saved to ../data/output/text/ollama_response.txt
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
 > Text splitted to sentences.
["Sure, here's a poo joke for you:", 'Why did the poo roll down the hill?', 'To become a poo-tato!']
 > Processing time: 3.994328022003174
 > Real-time factor: 0.5315966494759173
Text-to-speech audio saved to ../data/output/audio/ollama_response.wav
