In [None]:
# Where the original script is read from, and where the reduced copy is written
input_file_path, output_file_path = (
    '/content/Disease_script.txt',
    '/content/Trimmed_Filtered_Disease_script.txt',
)

# Function to process the file and extract 50% of sentences for the first 500 diseases
def process_file(input_file, output_file, num_diseases=500,
                 sentences_per_disease=10, keep_per_disease=5):
    """Copy the leading sentences of each disease from one text file to another.

    The input is treated as consecutive groups of ``sentences_per_disease``
    lines, one group per disease. The file carries no group markers, so this
    layout is an assumption about the data. Only the first ``num_diseases``
    groups are considered, and the first ``keep_per_disease`` lines of each
    group are written to ``output_file`` in their original order. A trailing
    partial group is treated like any other group.

    Args:
        input_file: Path of the text file to read (decoded as UTF-8).
        output_file: Path of the text file to create or overwrite (UTF-8).
        num_diseases: How many leading groups to consider.
        sentences_per_disease: Number of lines that make up one group.
        keep_per_disease: Number of leading lines kept from each group.

    Returns:
        The number of lines written to ``output_file``.

    Raises:
        ValueError: If ``sentences_per_disease`` is smaller than 1, or if
            ``num_diseases`` or ``keep_per_disease`` is negative.
    """
    if sentences_per_disease < 1:
        raise ValueError("sentences_per_disease must be at least 1")
    if num_diseases < 0 or keep_per_disease < 0:
        raise ValueError("num_diseases and keep_per_disease must not be negative")

    max_lines = num_diseases * sentences_per_disease

    # Stream the file and stop at the cut-off instead of loading every line:
    # only the first `max_lines` lines can ever be selected.
    kept_lines = []
    with open(input_file, 'r', encoding='utf-8') as source:
        for index, line in enumerate(source):
            if index >= max_lines:
                break
            # Position inside the current group decides whether the line is kept.
            if index % sentences_per_disease < keep_per_disease:
                kept_lines.append(line)

    # The input is fully read and closed before the output is opened, so the
    # same path may safely be used for both.
    with open(output_file, 'w', encoding='utf-8') as target:
        target.writelines(kept_lines)

    return len(kept_lines)

# Run the trimming step on the configured input and output paths
process_file(input_file=input_file_path, output_file=output_file_path)

print(
    "Processing completed. The trimmed and filtered sentences are written to",
    output_file_path,
)


In [None]:
# Install required libraries
!pip install requests

import requests
import json
import os
import shutil
import base64
from google.colab import drive
import zipfile

# Function to synthesize text and save to respective folders
def synthesize_text_with_api_key(text, voice_name, language_code, gender, output_file, api_key, timeout=30):
    """Synthesize ``text`` through the Text-to-Speech REST API and save the audio.

    Sends one ``text:synthesize`` request asking for LINEAR16 audio, decodes
    the base64 ``audioContent`` field of the response and writes the bytes to
    ``output_file``. Every outcome is reported with ``print``; failures never
    raise, so a long batch is not aborted by a single bad request.

    Args:
        text: The text to speak.
        voice_name: Voice identifier sent as ``voice.name``.
        language_code: Language tag sent as ``voice.languageCode``.
        gender: Value sent as ``voice.ssmlGender``.
        output_file: Path of the audio file to write.
        api_key: API key, sent as the ``key`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the run indefinitely.

    Returns:
        ``True`` if the audio file was written, ``False`` otherwise.
    """
    url = "https://texttospeech.googleapis.com/v1/text:synthesize"
    headers = {
        "Content-Type": "application/json; charset=utf-8"
    }
    data = {
        "input": {"text": text},
        "voice": {
            "languageCode": language_code,
            "name": voice_name,
            "ssmlGender": gender
        },
        "audioConfig": {
            "audioEncoding": "LINEAR16"
        }
    }

    try:
        response = requests.post(
            url,
            headers=headers,
            params={"key": api_key},
            json=data,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Only the exception type is shown: the full message can include the
        # request URL, and the URL carries the API key as a query parameter.
        print(f"Error in generating audio file for {output_file}: request failed ({type(exc).__name__})")
        return False

    if response.status_code != 200:
        print(f"Error in generating audio file for {output_file}: {response.text}")
        return False

    try:
        audio_content = response.json()['audioContent']
    except (ValueError, KeyError):
        # A 200 response without a decodable JSON body or without the
        # expected field cannot be turned into an audio file.
        print(f"Error in generating audio file for {output_file}: response has no audioContent")
        return False

    with open(output_file, "wb") as out:
        out.write(base64.b64decode(audio_content))
    print(f"Generated audio file: {output_file}")
    return True

# Make Google Drive reachable under /content/drive
drive.mount('/content/drive')

# One output directory per voice gender; directories that already exist are kept
for _voice_dir in ('male_voices', 'female_voices'):
    os.makedirs(_voice_dir, exist_ok=True)

# API key for the Text-to-Speech service.
# It is read from the environment instead of being written into the notebook,
# so the credential is not stored or shared together with the code.
# NOTE(review): a literal key used to be assigned here; treat that key as
# exposed and revoke/rotate it.
api_key = os.environ.get("GOOGLE_TTS_API_KEY", "").strip()
if not api_key:
    # Fail before any request is sent rather than producing one API error per sentence.
    raise RuntimeError(
        "GOOGLE_TTS_API_KEY is not set. Define it in the environment "
        "(for example via os.environ in a private cell) before running "
        "the text-to-speech synthesis."
    )

# Voices and accents to synthesize. Each entry names the voice, its language
# code, the gender sent to the API, and the folder that receives its audio.
# The en-AU voices (en-AU-Wavenet-A male, en-AU-Wavenet-C female) are currently disabled.
voices = [
    dict(voice_name="en-GB-Wavenet-A", language_code="en-GB", gender="MALE", folder="male_voices"),
    dict(voice_name="en-GB-Wavenet-F", language_code="en-GB", gender="FEMALE", folder="female_voices"),
    dict(voice_name="en-US-Wavenet-A", language_code="en-US", gender="MALE", folder="male_voices"),
    dict(voice_name="en-US-Wavenet-F", language_code="en-US", gender="FEMALE", folder="female_voices"),
    # Add more voices and accents as needed
]


# Accumulators for the two manifests produced by the synthesis step
audio_paths, text_data = [], []

# Load the trimmed script, one sentence per line (line endings are kept).
# Alternative input kept for reference: /content/Filtered_Disease_script.txt
with open('/content/Trimmed_Filtered_Disease_script.txt', 'r') as script_file:
    sentences = list(script_file)

print("Starting text-to-speech synthesis...")

# Loop over sentences and synthesize speech with every configured voice
for i, sentence in enumerate(sentences):
    text = sentence.strip()
    if not text:
        # Nothing to speak on a blank line. `i` still advances, so the IDs
        # keep matching the line numbers of the input file.
        continue
    for voice in voices:
        unique_id = f"{i}_{voice['voice_name']}"
        output_file = f'{voice["folder"]}/output_{unique_id}.wav'
        result = synthesize_text_with_api_key(text, voice["voice_name"], voice["language_code"], voice["gender"], output_file, api_key)
        # Only list recordings that really exist: a failed request must not
        # leave a manifest entry pointing at a missing file. The explicit
        # False check honours a failure reported by the function; the file
        # check covers a function that reports nothing.
        # NOTE(review): a file left over from an earlier run also passes the
        # file check. Clear the output folders first if that matters.
        if result is False or not os.path.isfile(output_file):
            continue
        # The recorded path assumes the working directory is /content. TODO confirm
        audio_paths.append(f"{unique_id} /content/{output_file}")
        text_data.append(f"{unique_id} {text}")

print("Text-to-speech synthesis completed. Writing audio paths and text data to files...")

# Persist both manifests, one entry per line: first the audio paths, then the texts
for manifest_name, manifest_entries in (('audio_paths.txt', audio_paths), ('text.txt', text_data)):
    with open(manifest_name, 'w') as manifest:
        manifest.writelines(f"{entry}\n" for entry in manifest_entries)

print("Files written. Zipping folders...")



In [None]:

# Create a zip file holding every file found under the two voice folders.
# The context manager closes the archive even if adding a file raises, so the
# file handle is never leaked.
with zipfile.ZipFile('voices_final_trimmed_2.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    for folder in ('male_voices', 'female_voices'):
        for root, dirs, files in os.walk(folder):
            for file in files:
                # Entries keep their path relative to the working directory
                zipf.write(os.path.join(root, file))

print("Zipping complete. Uploading to Google Drive...")

# Google Drive folder that receives the archive
drive_path = '/content/drive/My Drive/Projects/AudioFiles_final/'  # Modify this path

# Full path of the copy inside that folder
destination_path = os.path.join(drive_path, 'voices_final_trimmed_2.zip')

# Create the folder if it is missing, then copy the archive into it
os.makedirs(drive_path, exist_ok=True)
shutil.copy(src='voices_final_trimmed_2.zip', dst=destination_path)

print(f"Upload complete. File saved to Google Drive at: {destination_path}")
