In [1]:
import os
import glob
import pandas as pd
import torch
import time
import librosa
from tqdm import tqdm_notebook
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import asyncio
import requests
from requests.exceptions import Timeout, RequestException


from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from requests.exceptions import ChunkedEncodingError



import sys
sys.path.append("../")
from src.yappy_search.audio_models import SongRecognition

In [2]:
# Считываем данные из таблицы (CSV или Excel, в зависимости от формата)
file_path = '../data/yappy_hackaton_2024_40k_small.csv'  # укажите правильный путь к файлу
df = pd.read_csv(file_path)  # если у вас Excel, используйте pd.read_excel(file_path)
df = df.rename(columns={'Unnamed: 0': 'index orig'})

dir_w_video = f'../output/shazam/data_video/'
os.makedirs(dir_w_video, exist_ok=True)
df_part = df.iloc[15000:20001]

song_recognition = SongRecognition()

batch_size_records = 250
num_batches = (len(df_part) + batch_size_records - 1) // batch_size_records

# Обработка батчами по 1000 записей

In [3]:
# Функция для скачивания видео с тайм-аутом
def download_video(url: str, output_path: str, timeout: int = 60) -> requests.models.Response:
    """Функция для скачивания видео с тайм-аутом"""
    try:
        response = requests.get(url, stream=True, timeout=timeout)
        if response.status_code == 200:
            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return response
        else:
            print(f"Failed to download {url}")
    except (Timeout, RequestException) as e:
        print(f"Error downloading {url}: {e}")


# Функция для повторных попыток скачивания видео
def robust_download_video(video_url, video_path, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            response = download_video(video_url, video_path)
            return response
        except ChunkedEncodingError as e:
            print(f"Error downloading {video_url}: {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying... ({retries}/{max_retries})")
            else:
                print(f"Failed to download {video_url} after {max_retries} attempts")
                return None

In [4]:
# Функция для извлечения аудио с проверкой
def extract_audio_with_check(video_path, output_dir):
    try:
        video_clip = VideoFileClip(video_path)
        audio_file_path = os.path.join(
            output_dir, os.path.basename(video_path).replace(".mp4", ".mp3")
        )
        if video_clip.audio is None:
            print(f"No audio found in {video_path}. Creating empty audio file.")
            # Создаем пустой аудиофайл
            silent_audio = AudioSegment.silent(duration=1000)  # 1 секунда тишины
            silent_audio.export(audio_file_path, format="mp3")
        else:
            video_clip.audio.write_audiofile(audio_file_path, verbose=False, logger=None)
        return audio_file_path
    except Exception as e:
        # Создаем пустой аудиофайл
        audio_file_path = os.path.join(
            output_dir, os.path.basename(video_path).replace(".mp4", ".mp3")
        )
        silent_audio = AudioSegment.silent(duration=1000)  # 1 секунда тишины
        silent_audio.export(audio_file_path, format="mp3")
        print(f"Failed to extract audio from {video_path}: {e}. Creating empty audio file.")
        return audio_file_path

# Ограничение запросов к Shazam в минуту
async def recognize_audio_with_rate_limiting(audio_path, rate_limit=60):
    await asyncio.sleep(60 / rate_limit)  # Задержка для ограничения запросов
    try:
        return await song_recognition.recognize_audio(audio_path)
    except ClientError as e:
        print(f"Request failed for {audio_path}: {e}")
        return 'ERROR!'

In [5]:
for batch_index in tqdm_notebook(range(0, num_batches)):
    print(f'--num_batches: {batch_index}')
    df_batch = df_part.iloc[batch_index*batch_size_records:(batch_index+1)*batch_size_records]

    # Скачиваем все видео из списка
    for index, row in tqdm_notebook(df_batch.iterrows(), total=len(df_batch)):
        index_name = row['index orig']
        video_url = row['link']
        description = row['description']
        video_name = f"{index_name}{'_'.join(video_url.split('media')[1].split('/'))}"  # можно использовать описание для имени файла
        video_path = os.path.join(dir_w_video, video_name)
        if os.path.isfile(video_path):
            continue
        response = robust_download_video(video_url, video_path)
        # print(f"Downloaded {video_name}.")
    print("All videos downloaded successfully.")

    # Получение списка всех файлов с расширением .mp4 в директории
    video_files = glob.glob(os.path.join(dir_w_video, '*.mp4'))
    # Сортировка списка файлов по первому значению при split('_')
    sorted_video_files = sorted(video_files, key=lambda x: int(os.path.basename(x).split('_')[0]))

    # переводим из видео в аудио
    audio_output_dir = os.path.join(dir_w_video.rsplit('/', 2)[0], f'data_audio')
    os.makedirs(audio_output_dir, exist_ok=True)
    audio_paths = []
    for video_path in tqdm_notebook(sorted_video_files):
        audio_file_path = os.path.join(
            audio_output_dir, os.path.basename(video_path).replace(".mp4", ".mp3")
        )
        if os.path.isfile(audio_file_path):
            pass
        else:
            audio_file_path = extract_audio_with_check(video_path, audio_output_dir)
        audio_paths.append(audio_file_path)
    print("All audios downloaded successfully.")

    assert  len(audio_paths) == len(sorted_video_files), 'audio video error'
    # Транскрибация аудио
    transcriptions = []
    transcriptions_all = []

    recognitions_all = []
    title_shazam_all = []
    subtitle_shazam_all = []
    url_shazam_all = []

    start_time = time.time()
    for audio_path in tqdm_notebook(audio_paths):
        try:
            recognition = await recognize_audio_with_rate_limiting(audio_path)
        except Exception as e:
            raise e
            recognition = 'ERROR!'
            print(f"{audio_path}| recognition ERROR")
    # shazam
        if type(recognition) == str:
            title_shazam = ''
            subtitle_shazam = ''
            url_shazam = ''
        else:
            title_shazam = recognition['title']
            subtitle_shazam = recognition['subtitle']
            url_shazam = recognition['url']

        title_shazam_all.append(title_shazam)
        subtitle_shazam_all.append(subtitle_shazam)
        url_shazam_all.append(url_shazam)
    df_batch['title shazam'] = title_shazam_all
    df_batch['subtitle shazam'] = subtitle_shazam_all
    df_batch['url shazam'] = url_shazam_all

    index_1 = df_batch['index orig'].iloc[0]
    index_2 = df_batch['index orig'].iloc[-1]
    save_sub_path = f'../output/shazam/parquets/yappy_hackaton_2024_40k_sub_with_shazam__{index_1}_{index_2}.pq'
    df_batch.to_parquet(save_sub_path,index=False)

    # Очищение файлов после обработки
    for video_path in sorted_video_files:
        os.remove(video_path)
    for audio_path in audio_paths:
        if audio_path:
            os.remove(audio_path)

  0%|          | 0/21 [00:00<?, ?it/s]

--num_batches: 0


  0%|          | 0/250 [00:00<?, ?it/s]

All videos downloaded successfully.


  0%|          | 0/250 [00:00<?, ?it/s]

All audios downloaded successfully.


  0%|          | 0/250 [00:00<?, ?it/s]

NameError: name 'ClientError' is not defined