In [None]:
import os
# Put the `salmonn` conda environment first on PYTHONPATH and PATH.
# Changing os.environ affects processes started from this kernel (the
# `!python3 ...` cells below); it does not modify the running kernel's
# own sys.path.
_SALMONN_ENV = "/pfs/data5/home/kit/stud/u____/.conda/envs/salmonn"


def _prepend_env_path(var, entry):
    """Make ``entry`` the first element of the path-list variable ``var``.

    Works when ``var`` is unset or empty (no dangling separator is left
    behind) and is idempotent: an existing occurrence of ``entry`` is moved
    to the front instead of being duplicated when the cell is re-run.
    """
    current = os.environ.get(var, "")
    others = [p for p in current.split(os.pathsep) if p != entry] if current else []
    os.environ[var] = os.pathsep.join([entry] + others)


_prepend_env_path("PYTHONPATH", _SALMONN_ENV + "/lib/python3.9/site-packages")
_prepend_env_path("PATH", _SALMONN_ENV + "/bin")

# 1. Getting Started

## 1.1 Install SALMONN

Before we begin, load miniconda from the left sidebar.

In [None]:
# !git clone https://github.com/bytedance/SALMONN.git

# !conda create -n salmonn python=3.9.17
# !conda activate salmonn

# !conda install pytorch torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia

# !pip install peft==0.3.0
# !pip install soundfile
# !pip install librosa
# !pip install transformers==4.28.0
# !pip install sentencepiece==0.1.97
# !pip install accelerate==0.20.3
# !pip install bitsandbytes==0.35.0
# !pip install gradio==3.23.0

# !pip install omegaconf

## 1.2 Download models

They are stored here: /pfs/data5/home/kit/stud/u____/.cache/huggingface/hub

In [None]:
# !huggingface-cli download openai/whisper-large-v2
# !huggingface-cli download lmsys/vicuna-7b-v1.5
# !huggingface-cli download tsinghua-ee/SALMONN-7B

# The BEATs model can be downloaded from: https://1drv.ms/u/s!AqeByhGUtINrgcpj8ujXH1YUtxooEg?e=E9Ncea

# 2. Data Preparation

## 2.1 Download dataset

In [None]:
# !pip install datasets==2.7.1
# !pip install huggingface_hub

# from datasets import load_dataset
# from huggingface_hub import login

# login(token="your token")
# dataset = load_dataset("mozilla-foundation/common_voice_4_0", "de")

Download CoVoST2 translations.

In [None]:
# Download CoVoST2 translations from:
# https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz

Get data splits. Download this script from [CoVoST2](https://github.com/facebookresearch/covost).

In [None]:
# !python3 get_covost_splits.py \
#   --version 2 --src-lang de --tgt-lang en \
#   --root tsv_for_S3 \
#   --cv-tsv /pfs/data5/home/kit/stud/u____/.cache/huggingface/datasets/downloads/extracted/____/validated.tsv

## 2.2 Data preprocess

We should format our data such that each sample consists of:
- Path to the audio  
- Target text (translation)  
- Task name

In [None]:
import csv
import json

def convert_tsv_to_json(
    tsv_path,
    json_path,
    clips_dir="/pfs/data5/home/kit/stud/u____/.cache/huggingface/datasets/downloads/extracted/____/clips/",
    task="translation_de2en",
):
    """Convert a tab-separated split file into a JSON annotation file.

    Every TSV row becomes ``{"path": ..., "text": ..., "task": ...}`` and the
    collected rows are written to ``json_path`` as ``{"annotation": [...]}``.

    Args:
        tsv_path: Tab-separated input file with a header row containing at
            least the ``path`` and ``translation`` columns.
        json_path: Output JSON file; overwritten if it already exists.
        clips_dir: Directory prefix put in front of each row's ``path``
            value. A trailing ``/`` is added when missing.
        task: Value stored under ``"task"`` for every sample.

    Raises:
        KeyError: If the header has no ``path`` or ``translation`` column.
    """
    if clips_dir and not clips_dir.endswith("/"):
        clips_dir += "/"

    # newline='' is the csv module's documented way to open files.
    # QUOTE_NONE keeps '"' as an ordinary character: with the default
    # quoting, a sentence that starts with '"' is treated as a quoted field
    # and swallows the following tabs/newlines, silently merging rows.
    with open(tsv_path, 'r', encoding='utf-8', newline='') as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        annotations = [
            {
                "path": clips_dir + row["path"],
                "text": row["translation"],
                "task": task,
            }
            for row in reader
        ]

    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump({"annotation": annotations}, json_file, ensure_ascii=False, indent=4)

# Convert each CoVoST2 split (train / dev / test) into its JSON manifest.
# The mapping is source TSV -> destination JSON.
file_map = dict(
    [
        ("tsv_for_S3/covost_v2.de_en.train.tsv", "tsv_for_S3/train.json"),
        ("tsv_for_S3/covost_v2.de_en.dev.tsv", "tsv_for_S3/dev.json"),
        ("tsv_for_S3/covost_v2.de_en.test.tsv", "tsv_for_S3/test.json"),
    ]
)

for tsv in file_map:
    json_file = file_map[tsv]
    convert_tsv_to_json(tsv, json_file)
    print(f"converted to {json_file}")

Convert mp3 to wav if needed:

In [None]:
import os
import json
from pydub import AudioSegment
from tqdm import tqdm

# Manifests produced by the TSV -> JSON conversion step. That step writes
# them under tsv_for_S3/, so they are read from there rather than from the
# working directory.
json_files = [
    "tsv_for_S3/train.json",
    "tsv_for_S3/dev.json",
    "tsv_for_S3/test.json",
]


def _convert_manifest_clips(json_file):
    """Create a WAV file next to every MP3 listed in one manifest.

    Entries whose MP3 is missing, or whose WAV already exists, are skipped.
    A failed conversion is reported and does not stop the loop.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    annotations = data.get("annotation", [])

    for item in tqdm(annotations, desc=f"处理 {json_file}"):
        mp3_path = item["path"]
        wav_path = os.path.splitext(mp3_path)[0] + ".wav"

        # Skip entries whose source MP3 is missing.
        if not os.path.exists(mp3_path):
            print(f"MP3 文件不存在，跳过: {mp3_path}")
            continue

        # Skip entries that were already converted.
        if os.path.exists(wav_path):
            print(f"WAV 文件已存在，跳过: {wav_path}")
            continue

        # dirname is "" for a bare file name; os.makedirs("") would raise.
        wav_dir = os.path.dirname(wav_path)
        if wav_dir:
            os.makedirs(wav_dir, exist_ok=True)

        # Decode the MP3 and export it as WAV, passing "-ar 16000" to the
        # exporter to request a 16 kHz sample rate.
        try:
            audio = AudioSegment.from_mp3(mp3_path)
            audio.export(wav_path, format="wav", parameters=["-ar", "16000"])
            print(f"已生成 WAV 文件: {wav_path}")
        except Exception as e:
            print(f"转换失败: {mp3_path}, 错误: {e}")
            # Remove whatever the failed export left behind; otherwise the
            # next run would treat the partial file as a finished conversion.
            if os.path.exists(wav_path):
                os.remove(wav_path)


def _point_manifest_to_wav(json_file):
    """Rewrite every ``.mp3`` path in one manifest to the matching ``.wav``.

    NOTE(review): paths are rewritten regardless of whether the WAV file
    exists, so entries whose conversion failed end up pointing at a missing
    file -- confirm that this is what the downstream loader expects.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    if "annotation" in data and isinstance(data["annotation"], list):
        for item in tqdm(data["annotation"], desc=f"处理 {json_file}"):
            if "path" in item and item["path"].endswith(".mp3"):
                item["path"] = os.path.splitext(item["path"])[0] + ".wav"

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


# Pass 1: create the WAV files.
for json_file in json_files:
    print(f"正在处理 {json_file}...")
    _convert_manifest_clips(json_file)
    print(f"{json_file} 处理完成！")

print("所有 JSON 文件已处理完成！")

#####################################################################

# Pass 2: update the manifests so that they reference the WAV files.
for json_file in json_files:
    print(f"正在处理 {json_file}...")
    _point_manifest_to_wav(json_file)
    print(f"{json_file} 修改完成！")

print("所有 JSON 文件的路径已更新为 .wav！")

# 3. Translation with raw SALMONN

## 3.1 Generate

In [None]:
# modify cli_inference.py & decode_config.yaml
# resample wav file in utils.py if sr!=16k
# Tesla V100 doesn't support Int8: pip install bitsandbytes==0.37.2
!python3 SALMONN/cli_inference.py --cfg-path SALMONN/configs/decode_config.yaml

## 3.2 Evaluation

In [None]:
# BLEU

# 4. Fine-tuning

## 4.1 Train

In [None]:
# modify config.yaml and runner.py so that it can continue training from the position of last epoch.
# modify 'save_checkpoint' function to save all parameters.
# set num_workers = 0
!python3 SALMONN/train.py --cfg-path SALMONN/configs/config.yaml

## 4.2 Generate

In [None]:
# Use the fine-tuned model
!python3 SALMONN/cli_inference.py --cfg-path SALMONN/configs/decode_config.yaml

## 4.3 Evaluation

In [10]:
# BLEU
import sacrebleu

# Load the hypothesis and reference files, one segment per line.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale (assumes the files are UTF-8, like every other
# text file this notebook opens -- confirm against the inference script).
# Line terminators are stripped so only the sentence text is scored.
with open('tsv_for_S3/hyp_output_after5.txt', 'r', encoding='utf-8') as f:
    hypothesis = [line.rstrip('\n') for line in f]

with open('tsv_for_S3/ref_output_after5.txt', 'r', encoding='utf-8') as f:
    reference = [line.rstrip('\n') for line in f]

# Corpus-level BLEU with a single reference stream.
bleu = sacrebleu.corpus_bleu(hypothesis, [reference])

# Print the BLEU score
print(f"BLEU score: {bleu.score}")

BLEU score: 29.46351347844518
