<a href="https://colab.research.google.com/github/Yuvahre/Yuvahre/blob/main/kanada_asr_model_finetuning_and_q_a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import librosa
import noisereduce as nr
import soundfile as sf
import numpy as np
import os
from pydub import AudioSegment, effects
import scipy.signal as signal

input_dir = r"D:\Documents\sem5-docs\sandlewood\data"
output_dir = r"D:\Documents\sem5-docs\sandlewood\output"
os.makedirs(output_dir, exist_ok=True)

def high_pass_filter(data, cutoff=80, fs=16000, order=5):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = signal.butter(order, normal_cutoff, btype='high', analog=False)
    y = signal.lfilter(b, a, data)
    return y

def change_pitch(audio, semitones):
    return audio._spawn(audio.raw_data, overrides={"frame_rate": int(audio.frame_rate * (2.0 ** (semitones / 12.0)))})

# Process each audio file
for filename in os.listdir(input_dir):
    if filename.endswith(".mp3") or filename.endswith(".wav"):
        file_path = os.path.join(input_dir, filename)
        y, sr = librosa.load(file_path, sr=None)

        # Step 1: Noise reduction
        reduced_noise = nr.reduce_noise(y=y, sr=sr)

        intermediate_path = os.path.join(output_dir, f"{filename}_denoised.wav")
        sf.write(intermediate_path, reduced_noise, sr)

        # Step 2: Normalize volume using pydub
        audio = AudioSegment.from_file(intermediate_path)
        normalized_audio = effects.normalize(audio)

        normalized_audio_path = os.path.join(output_dir, f"{filename}_normalized.wav")
        normalized_audio.export(normalized_audio_path, format="wav")

        # Step 3: Apply high-pass filter
        y, sr = librosa.load(normalized_audio_path, sr=None)
        filtered_audio = high_pass_filter(y, cutoff=80, fs=sr)

        eq_audio_path = os.path.join(output_dir, f"{filename}_eq.wav")
        sf.write(eq_audio_path, filtered_audio, sr)

        # Step 4: De-reverb
        S = librosa.stft(filtered_audio)
        S_db = librosa.amplitude_to_db(abs(S))
        S_smooth = librosa.decompose.nn_filter(S_db, aggregate=np.median, metric='cosine')

        if np.any(np.isnan(S_smooth)) or np.any(np.isinf(S_smooth)):
            print(f"Warning: NaNs or Infinities detected in the smoothed spectrogram for {filename}")

        S_smooth_amplitude = librosa.db_to_amplitude(S_smooth)
        dereverb_audio = librosa.istft(S_smooth_amplitude)
        dereverb_audio = np.clip(dereverb_audio, -1, 1)

        dereverb_audio_path = os.path.join(output_dir, f"{filename}_dereverb.wav")
        sf.write(dereverb_audio_path, dereverb_audio, sr)

        adjusted_pitch_audio = change_pitch(normalized_audio, -2)
        pitch_adjusted_path = os.path.join(output_dir, f"{filename}_pitch_adjusted.wav")
        adjusted_pitch_audio.export(pitch_adjusted_path, format="wav")

        y, sr = librosa.load(pitch_adjusted_path, sr=None)
        y_resampled = librosa.resample(y, orig_sr=sr, target_sr=16000)

        final_output_path = os.path.join(output_dir, f"{filename}_processed.wav")
        sf.write(final_output_path, y_resampled, 16000)

In [None]:
import os
import torch
import soundfile as sf
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import pandas as pd
from pydub import AudioSegment
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import gc
import librosa

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

forced_decoder_ids = processor.get_decoder_prompt_ids(language="kn", task="transcribe")
model.config.forced_decoder_ids = forced_decoder_ids

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

def transcribe_audio(file_path):
    try:
        audio, _ = sf.read(file_path)
        audio_resampled = np.mean(audio, axis=1) if len(audio.shape) > 1 else audio
        result = pipe(audio_resampled, return_timestamps=True)
        print(file_path)
        print(result["text"])
        return {"file": os.path.basename(file_path), "transcription": result["text"]}
    except Exception as e:
        print(f"Error transcribing {file_path}: {e}")
        return None


def transcribe_dataset(dataset_directory):
    file_paths = [os.path.join(dataset_directory, f) for f in os.listdir(dataset_directory) if f.endswith(".wav")]
    transcriptions = []
    for file_path in file_paths:
        result = transcribe_audio(file_path)
        if result:
            transcriptions.append(result)
        gc.collect()

    df = pd.DataFrame(transcriptions)
    df.to_csv('transcriptions.csv', index=False)
    print("Transcriptions saved to 'transcriptions.csv'")


def split_audio(file_path, chunk_length_ms=10000):
    audio = AudioSegment.from_file(file_path)
    chunk_paths = []
    for i, chunk in enumerate(audio[::chunk_length_ms]):
        chunk_name = f"{file_path}_chunk{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_paths.append(chunk_name)
    return chunk_paths

def extract_mfcc(file_path, n_mfcc=13):
    try:
        audio, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0).tolist()
    except Exception as e:
        print(f"Error extracting MFCCs from {file_path}: {e}")
        return []

def save_features(dataset_directory):
    features = []
    for filename in os.listdir(dataset_directory):
        if filename.endswith(".wav"):
            file_path = os.path.join(dataset_directory, filename)
            mfccs = extract_mfcc(file_path)
            features.append({"file": filename, "mfccs": mfccs})
    pd.DataFrame(features).to_csv('features.csv', index=False)
    print("MFCCs saved to 'features.csv'")

def speed_augmentation(audio, rate=1.2):
    return np.interp(np.arange(0, len(audio), rate), np.arange(0, len(audio)), audio)

def noise_augmentation(audio, noise_factor=0.005):
    noise = np.random.randn(len(audio))
    return audio + noise_factor * noise

def augment_dataset(dataset_directory):
    for filename in os.listdir(dataset_directory):
        if filename.endswith(".wav"):
            file_path = os.path.join(dataset_directory, filename)
            audio, sr = sf.read(file_path)

            sf.write(file_path.replace(".wav", "_speed.wav"), speed_augmentation(audio, 1.2), sr)
            sf.write(file_path.replace(".wav", "_noisy.wav"), noise_augmentation(audio, 0.005), sr)

def structure_dataset_for_asr(dataset_directory, transcriptions_csv):
    transcriptions = pd.read_csv(transcriptions_csv).set_index("file").to_dict()["transcription"]
    structured_data = []

    for filename in os.listdir(dataset_directory):
        if filename.endswith(".wav") and filename in transcriptions:
            structured_data.append({"audio_path": os.path.join(dataset_directory, filename), "transcription": transcriptions[filename]})

    pd.DataFrame(structured_data).to_csv('structured_dataset.csv', index=False)
    print("Structured dataset saved to 'structured_dataset.csv'")

def prepare_asr_dataset(dataset_directory):
    transcribe_dataset(dataset_directory)

    for filename in os.listdir(dataset_directory):
        if filename.endswith(".wav"):
            split_audio(os.path.join(dataset_directory, filename))

    save_features(dataset_directory)

    augment_dataset(dataset_directory)

    structure_dataset_for_asr(dataset_directory, 'transcriptions.csv')

prepare_asr_dataset(r"D:\Documents\sem5-docs\sandlewood\output")

In [None]:
pip install transformers datasets torch evaluate jiwer

In [None]:
from datasets import Dataset, DatasetDict, Audio
import pandas as pd

df = pd.read_csv("structured_dataset.csv")

dataset = Dataset.from_pandas(df)

dataset = dataset.cast_column("audio_path", Audio())
dataset = dataset.rename_column("audio_path", "audio")
dataset = dataset.rename_column("transcription", "text")

split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
print("Dataset prepared successfully!")


In [None]:
from transformers import AutoProcessor

# Load processor for Whisper
processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")

def preprocess(batch):
    audio = batch["audio"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")

    with processor.as_target_processor():
        labels = processor(batch["text"], return_tensors="pt").input_ids

    batch["input_features"] = inputs.input_features[0]
    batch["labels"] = labels[0]
    return batch

prepared_dataset = split_dataset.map(preprocess, remove_columns=["audio", "text"])


In [None]:
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load Whisper model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=10000,
    save_steps=1000,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
)

from evaluate import load
wer_metric = load("wer")

def compute_metrics(eval_preds):
    pred_ids, label_ids = eval_preds
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=prepared_dataset["train"],
    eval_dataset=prepared_dataset["test"],
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model("./whisper-finetuned")
processor.save_pretrained("./whisper-finetuned")


In [None]:
from transformers import pipeline
import librosa

finetuned_model = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned")
finetuned_processor = AutoProcessor.from_pretrained("./whisper-finetuned")

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=finetuned_model,
    tokenizer=finetuned_processor.tokenizer,
    feature_extractor=finetuned_processor.feature_extractor,
    device=device,
)

audio_path = "/content/drive/MyDrive/audio.wav"
audio, sr = librosa.load(audio_path, sr=16000)
result = asr_pipeline(audio)

with open("kanada_results.txt", "w", encoding="utf-8") as f:
    f.write(result["text"])

In [1]:
pip install transformers torch nltk



In [3]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
try:
    nltk.data.find('tokenizers/punkt')
    print("Punkt resource found.")
except LookupError:
    print("Punkt resource not found.")

Punkt resource found.


In [10]:
file_path = '/content/kanada results.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    kannada_text = file.read()

print("Kannada text loaded successfully!")


Kannada text loaded successfully!


In [11]:
from transformers import AutoTokenizer

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def split_into_chunks(text, tokenizer, max_length=512):
    """
    Splits text into manageable chunks of tokens within max_length.
    """
    tokens = tokenizer.tokenize(text)
    chunks = []

    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i+max_length]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)

    return chunks

chunks = split_into_chunks(kannada_text, tokenizer)
print(f"Number of chunks: {len(chunks)}")


Token indices sequence length is longer than the specified maximum sequence length for this model (19338 > 512). Running this sequence through the model will result in indexing errors


Number of chunks: 38


In [12]:
from transformers import AutoModelForQuestionAnswering, pipeline

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

print("Model and pipeline loaded successfully!")

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Model and pipeline loaded successfully!


In [13]:
questions = [
    "ಈ ಪಠ್ಯದ ಮುಖ್ಯ ಅಂಶಗಳು ಯಾವವು?",
]

for question in questions:
    print(f"Question: {question}")
    for i, chunk in enumerate(chunks):
        try:
            answer = qa_pipeline(question=question, context=chunk)
            print(f"Chunk {i+1}: {answer['answer']}")
        except Exception as e:
            print(f"Error in Chunk {i+1}: {str(e)}")
    print("-" * 50)


Question: ಈ ಪಠ್ಯದ ಮುಖ್ಯ ಅಂಶಗಳು ಯಾವವು?
Chunk 1: ಸರ್ವಶಕ್ತನಾಗಲು. ಮಾವಿನ ಹಣ್ಣನ್ನು
Chunk 2: ಆತಾರ 111 ಕೆಜಿ ಗುಣ. ಕತ್ತರಿಸಿ.
Chunk 3: ನೇಡಿದಾನೆ. ಮರ ಆಧಾರಿತ ಕ್ರಿಷಿಯಿಂದ ಹೇಗೆ
Chunk 4: ರೇಸೆಮಿ
Chunk 5: ರಕ್ತಚಿಂದನ ಕಳ್ಳರು
Chunk 6: ವೇರತ್ತುವು.
Chunk 7: ಊಹೆ ತಪ್ಪಾಗಿತ್ತು. ಒಂದೇ ನಿಮಿಷದಲ್ಲಿ
Chunk 8: ಹೇಳಲುಂ ಇದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದ
Chunk 9: ಜಾಗತಾವಾಗು
Chunk 10: ಸ್ಮಾರು ಹತ್ರಿಂದ
Chunk 11: ತಿಮರು ಕಳ್ಳುರು.
Chunk 12: వారా సాండేల్లుక్ కల్ట్రేషం
Chunk 13: అదికట్స్తుపార్తుపార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్ప
Chunk 14: మిస్వర్కులు స్టీన్నే
Chunk 15: సాపించాలుం కేస్టు సాపించాలుం కేస్టు సాపించాలుం
Chunk 16: ಮಾಡಿರ್ತ ಕಂದದ ಮರಗಳಿಗೆ ಎಕಾಯಿಕ
Chunk 17: ತೆಗಳಿ ಕೊಟ್ಟುವೇ
Chunk 18: ವಾದಿಸಿದರು.

In [14]:
for question in questions:
    all_answers = []
    for chunk in chunks:
        try:
            result = qa_pipeline(question=question, context=chunk)
            all_answers.append(result['answer'])
        except:
            continue
    final_answer = " ".join(all_answers)
    print(f"Final Answer for '{question}': {final_answer}")
    print("-" * 50)


Final Answer for 'ಈ ಪಠ್ಯದ ಮುಖ್ಯ ಅಂಶಗಳು ಯಾವವು?': ಸರ್ವಶಕ್ತನಾಗಲು. ಮಾವಿನ ಹಣ್ಣನ್ನು ಆತಾರ 111 ಕೆಜಿ ಗುಣ. ಕತ್ತರಿಸಿ. ನೇಡಿದಾನೆ. ಮರ ಆಧಾರಿತ ಕ್ರಿಷಿಯಿಂದ ಹೇಗೆ ರೇಸೆಮಿ ರಕ್ತಚಿಂದನ ಕಳ್ಳರು ವೇರತ್ತುವು. ಊಹೆ ತಪ್ಪಾಗಿತ್ತು. ಒಂದೇ ನಿಮಿಷದಲ್ಲಿ ಹೇಳಲುಂ ಇದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದಿದ ಜಾಗತಾವಾಗು ಸ್ಮಾರು ಹತ್ರಿಂದ ತಿಮರು ಕಳ್ಳುರು. వారా సాండేల్లుక్ కల్ట్రేషం అదికట్స్తుపార్తుపార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్పార్తుప్ప మిస్వర్కులు స్టీన్నే సాపించాలుం కేస్టు సాపించాలుం కేస్టు సాపించాలుం ಮಾಡಿರ್ತ ಕಂದದ ಮರಗಳಿಗೆ ಎಕಾಯಿಕ ತೆಗಳಿ ಕೊಟ್ಟುವೇ ವಾದಿಸಿದರು. ಕೊಡ್ತಿವಾರಿಡಿದಿದೀ ಅತ್ತುಗರಿಗೆ ಪುಡಿಯನ್ನ ಎರಡು ಲೋಟಾ ನೀರಿಕೆ ಒಂದು ಚಮಚ ಸ್ತಿಗಂದದ ಕಡಮೆಯಾಕ್ತವರ್ತುದೇ ಕಲೆಗಳು ಕಡ್ಮೆ ತೆಗೆದುಹಾಕುತ್ತೇವೆ. ಹೊಗಿದರೆ ಅಮ್ಮಿಲ್ಲಿ , ಇದು, ಇದು, ಇದು, ಇದು, 