In [None]:
import argparse
import json
import os
import requests
import re
import pandas as pd
from dotenv import load_dotenv
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from pydub import AudioSegment
from pytube import YouTube
from tqdm import tqdm
from yt_dlp import YoutubeDL
import torch
from transformers import pipeline
# from googletrans import Translator
# from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
# load_dotenv()
# api_key = os.getenv("YOUTUBE_API_KEY")
# youtube = build("youtube", "v3", developerKey=api_key)

### Function to Fetch and Display Video IDs from CSV

The function `fetch_video_ids_from_csv` reads a CSV file containing YouTube video URLs and extracts each video's ID and title; the helper `print_video_urls` then displays the results as a DataFrame. Together they perform the following steps:

1. **Read CSV File**: Reads the specified CSV file into a DataFrame.
2. **Extract Video URLs**: Iterates through each cell in the DataFrame to find YouTube video URLs.
3. **Avoid Duplicates**: Uses a set to keep track of seen URLs to avoid duplicates.
4. **Fetch Video Details**: For each unique video URL, extracts the video ID and title using `yt-dlp`.
5. **Collect Results**: Compiles the video IDs, URLs, and titles into a list of dictionaries with the keys `ID`, `URL`, and `Title`.
6. **Return and Display**: Returns that list for further use; `print_video_urls` converts it into a DataFrame and displays it.

In [4]:
def _fetch_video_title(video_url, ydl_opts):
    """Look up the title of ``video_url`` with yt-dlp.

    Returns the title with ``/`` and ``\\`` replaced by ``_``. Falls back to
    ``'Unknown Title'`` when the lookup fails or yields no title, so that one
    unavailable video cannot abort the scan of the whole CSV.
    """
    try:
        with YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
    except Exception as e:
        print(f"Could not fetch the title for {video_url}: {e}")
        return 'Unknown Title'

    # Guard against a missing result or a missing/empty title.
    title = (info_dict or {}).get('title') or 'Unknown Title'
    # Path separators are stripped, presumably so the title is usable in file names.
    return title.replace('/', '_').replace('\\', '_')


def fetch_video_ids_from_csv(csv_file_path):
    """Collect the unique YouTube videos referenced anywhere in a CSV file.

    Every string cell of the CSV is scanned for URLs of the form
    ``http(s)://www.youtube.com/watch?v=<11-character id>``. Each distinct URL
    is reported once, in the order it is first encountered (row by row, left
    to right).

    Args:
        csv_file_path: Path of the CSV file to scan.

    Returns:
        A list of dicts with the keys ``"ID"``, ``"URL"`` and ``"Title"``.
        The list is empty when the CSV cannot be read or contains no matching
        URL. A video whose title cannot be fetched is still listed, with the
        title ``'Unknown Title'``.
    """
    # Only a failure to read the CSV produces the "reading the CSV" message;
    # per-video lookup failures are handled in _fetch_video_title.
    try:
        df = pd.read_csv(csv_file_path)
    except Exception as e:
        print(f"An error occurred while reading the CSV file: {e}")
        return []

    # Group 1 is the full URL, group 2 the video ID.
    url_pattern = re.compile(
        r"(https?://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11}))")
    ydl_opts = {
        'quiet': True,
        'skip_download': True,
    }

    video_urls = []
    seen_urls = set()  # To avoid duplicates

    for row in df.itertuples(index=False, name=None):
        for cell_value in row:
            # Empty cells are read as NaN (float) and numbers are not URLs.
            if not isinstance(cell_value, str):
                continue
            for match in url_pattern.finditer(cell_value):
                video_url, video_id = match.group(1), match.group(2)
                if video_url in seen_urls:
                    continue
                seen_urls.add(video_url)
                video_urls.append({
                    "ID": video_id,
                    "URL": video_url,
                    "Title": _fetch_video_title(video_url, ydl_opts),
                })

    return video_urls



def print_video_urls(video_urls):
    """Show the collected video records as a table in the notebook output.

    Args:
        video_urls: Records to tabulate; passed straight to ``pd.DataFrame``.
    """
    # `display` is not imported in this file; it is expected to be supplied
    # by the notebook environment.
    display(pd.DataFrame(video_urls))


# CSV file listing the YouTube videos to process.
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
csv_file_path = "/home/azureuser/cloudfiles/code/Users/Akilesh_Jayakumar/youtube-api/csv-files/New 10hrs From YT.csv"

# Resolve every unique video referenced in the CSV, then show them as a table.
video_urls = fetch_video_ids_from_csv(csv_file_path)
print_video_urls(video_urls)

         n = h8nk7FdZbmJEwMu-5l3k ; player = https://www.youtube.com/s/player/f8071a08/player_ias.vflset/en_US/base.js
         n = ERzGwEbeMQPTROHkEtUU ; player = https://www.youtube.com/s/player/f8071a08/player_ias.vflset/en_US/base.js
         n = cUEkKltHNEg6oaZVr8WA ; player = https://www.youtube.com/s/player/f8071a08/player_ias.vflset/en_US/base.js
         n = WGNIvIuKddKQZdcPQFpn ; player = https://www.youtube.com/s/player/f8071a08/player_ias.vflset/en_US/base.js


### Function to Download and Convert Audio from YouTube Videos

This function, `fetch_audio_for_all_videos`, downloads the audio from a list of YouTube video URLs and converts it to WAV format. The function performs the following steps:

1. **Ensure Directory Exists**: Ensures that the specified result directory exists. If it doesn't, it creates the directory.
2. **Iterate Through Video URLs**: Iterates through the provided list of video URLs.
3. **Download and Convert Audio**: For each video URL, uses `yt-dlp` to download the audio in the best available format and converts it to WAV format.
4. **Handle Errors**: Catches and prints any errors that occur during the download process.

In [None]:
def fetch_audio_for_all_videos(video_urls, result_dir):
    """Download the audio of each video and convert it to WAV with yt-dlp.

    Args:
        video_urls: Iterable of dicts that each provide the video address
            under the key ``"URL"``.
        result_dir: Directory that receives the WAV files; created if it does
            not exist yet.

    Returns:
        None. A failed download is reported on stdout and the remaining
        videos are still processed.
    """
    # Ensure the result directory exists
    os.makedirs(result_dir, exist_ok=True)

    # The options do not depend on the individual video, so build them once.
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(result_dir, '%(title)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }

    for video in video_urls:
        video_url = video["URL"]

        try:
            with YoutubeDL(ydl_opts) as ydl:
                info_dict = ydl.extract_info(video_url, download=True)
                # yt-dlp sanitises the title itself when expanding `outtmpl`,
                # so ask it for the file name it used instead of rebuilding
                # one from the raw title. The audio extractor then replaces
                # the downloaded extension with `.wav`.
                downloaded_path = ydl.prepare_filename(info_dict)
                wav_path = os.path.splitext(downloaded_path)[0] + ".wav"
                print(
                    f"Audio successfully downloaded and converted to WAV: {wav_path}")
        except Exception as e:
            print(f"Error downloading audio for video {video_url}: {e}")


# Folder that receives the converted WAV files.
result_dir = "/home/azureuser/cloudfiles/code/Users/Akilesh_Jayakumar/youtube-api/audio"

# Download and convert the audio of every video found in the CSV.
fetch_audio_for_all_videos(video_urls=video_urls, result_dir=result_dir)

## Function to Fetch and Save Transcripts

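The function `fetch_and_save_transcripts` fetches the transcript of each video in a list and saves it as an `.srt` subtitle file named after the video title. It supports English, Chinese, Malay, and Tamil. The implementation in the cell below is currently commented out.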

In [None]:
# # def fetch_and_save_transcript(video_id, file_name, language):
# #     def picker_trans(language):
# #         if language == "english":
# #             return "en"
# #         elif language == "chinese":
# #             return "zh"
# #         elif language == "malay":
# #             return "ms"
# #     try:
# #         transcript = YouTubeTranscriptApi.get_transcript(
# #             video_id, languages=[picker_trans(language)])
# #     except Exception as e:
# #         print(f"An error occurred: {e}")
# #         return False
# #     with open(file_name, "w", encoding="utf-8") as file:
# #         for line in transcript:
# #             file.write(f"{line['text']}\n")
# #     return True

# # transscript = fetch_and_save_transcript(video_urls[1]["ID"], "transcript.txt", "english")
# # print(transscript)

# def fetch_and_save_transcripts(video_urls, output_dir, language):
#     def pick_lang(language):
#         if language == "english":
#             return "en"
#         elif language == "chinese":
#             return "zh"
#         elif language == "malay":
#             return "ms"
#         elif language == "tamil":
#             return "ta"

#     def format_timestamp(seconds):
#         ms = int((seconds - int(seconds)) * 1000)
#         hours, remainder = divmod(int(seconds), 3600)
#         minutes, seconds = divmod(remainder, 60)
#         return f"{hours:02}:{minutes:02}:{seconds:02},{ms:03}"

#     os.makedirs(output_dir, exist_ok=True)

#     for video in video_urls:
#         video_url = video["URL"]

#         # Extract video ID from URL
#         video_id_match = re.search(r"v=([a-zA-Z0-9_-]{11})", video_url)
#         if not video_id_match:
#             print(f"Invalid video URL: {video_url}")
#             continue

#         video_id = video_id_match.group(1)

#         try:
#             # Extract video info using yt-dlp to get the title
#             ydl_opts = {
#                 'quiet': True,
#                 'skip_download': True,
#             }

#             with YoutubeDL(ydl_opts) as ydl:
#                 info_dict = ydl.extract_info(video_url, download=False)
#                 title = info_dict.get('title', 'transcript').replace(
#                     '/', '_').replace('\\', '_')

#             transcript = YouTubeTranscriptApi.get_transcript(
#                 video_id, languages=[pick_lang(language)])
#         except Exception as e:
#             print(f"An error occurred for {video_url}: {e}")
#             continue

#         # Save the transcript in .srt format
#         file_path = os.path.join(output_dir, f"{title}.srt")
#         with open(file_path, "w", encoding="utf-8") as file:
#             for i, line in enumerate(transcript):
#                 start = format_timestamp(line['start'])
#                 duration = line['start'] + line['duration']
#                 end = format_timestamp(duration)
#                 text = line['text']
#                 file.write(f"{i + 1}\n{start} --> {end}\n{text}\n\n")

#         print(f"Successfully saved transcript for {video_url}")

# output_dir = "transcripts"
# fetch_and_save_transcripts(video_urls, output_dir, "english")

In [None]:
import torch
from transformers import pipeline
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Audio file to transcribe (absolute, machine-specific path).
audio = "/home/azureuser/cloudfiles/code/Users/Akilesh_Jayakumar/youtube-api/audio/Asking Chennai Youngsters How Much They Earn   ｜ Street Interview ｜ Tamil ｜ Suman Mpm.wav"

# Speech-recognition pipeline; the audio is processed in 30-second chunks.
transcribe = pipeline(
    task="automatic-speech-recognition",
    model="vasista22/whisper-tamil-medium",
    chunk_length_s=30,
    device=device,
)
# Pin the decoder prompt to language "ta" and the "transcribe" task.
transcribe.model.config.forced_decoder_ids = (
    transcribe.tokenizer.get_decoder_prompt_ids(language="ta", task="transcribe")
)

# Run the transcription and keep only the recognised text.
tamil_text = transcribe(audio)["text"]
print('Transcription: ', tamil_text)

# Transliterate the Tamil-script text into the ITRANS romanisation scheme.
romanized_tamil = transliterate(tamil_text, sanscript.TAMIL, sanscript.ITRANS)
print('Romanized Transcription: ', romanized_tamil)
