In [None]:
!pip install torchaudio transformers python-docx word2number --quiet
!sudo apt-get install ffmpeg -y

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
import os
import re
import subprocess
from datetime import datetime

import torch
import torchaudio
from docx import Document
from google.colab import files
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from word2number import w2n

In [None]:
def convert_romanian_numbers(text):
    """Replace Romanian number words (1-20) with their digit spelling.

    The input is split on whitespace; every token that is exactly one of the
    known number words is swapped for its digits, and every other token is
    kept unchanged. Tokens are re-joined with single spaces, so runs of
    whitespace in the input collapse to one space.

    Args:
        text: The text to process.

    Returns:
        The text with recognised number words replaced by digits.
    """
    digits_by_word = {
        "unu": "1", "una": "1", "doi": "2", "două": "2", "trei": "3", "patru": "4",
        "cinci": "5", "șase": "6", "sapte": "7", "șapte": "7", "opt": "8", "nouă": "9",
        "zece": "10", "unsprezece": "11", "doisprezece": "12", "douăsprezece": "12",
        "treisprezece": "13", "paisprezece": "14", "cincisprezece": "15",
        "șaisprezece": "16", "șaptesprezece": "17", "optsprezece": "18", "nouăsprezece": "19",
        "douăzeci": "20"
    }

    # dict.get falls back to the token itself when it is not a number word.
    return " ".join(digits_by_word.get(token, token) for token in text.split())


In [None]:
def normalize_transcription(text):
    """Fix known mis-transcriptions and map spoken abbreviations to symbols.

    Each entry of the correction table is applied only where it appears as a
    whole word (or whole phrase), never as a fragment of a longer word, so a
    short key such as "pe" cannot corrupt words like "peretele". All
    corrections are applied in one pass, which means the output of one
    correction is never rewritten by another.

    Args:
        text: The transcription to normalise.

    Returns:
        The transcription with every whole-word occurrence of a known
        mistake replaced by its correction.
    """
    corecturi = {
        "a orita la": "aorta la",
        "fracție de ejecție": "fracția de ejecție",
        "peredeposterior": "peretele posterior",

        "doi sprăzece": "douăsprezece",
        "trei sprăzece": "treisprezece",
        "patru sprezece": "paisprezece",
        "cinci sprezece": "cincisprezece",

        "a orita": "aorta",
        "a se": "AS",
        "vede": "VD",
        "vese": "VS",
        "siv": "SIV",
        "pe": "PP",
    }

    # Longest keys first so that "a orita la" is preferred over "a orita".
    keys = sorted(corecturi, key=len, reverse=True)
    # The lookarounds require a non-word character (or string edge) on both
    # sides of the match, i.e. the key must be a standalone word/phrase.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(key) for key in keys) + r")(?!\w)"
    )
    return pattern.sub(lambda match: corecturi[match.group(0)], text)


In [None]:
import re

def cleanup_transcription(text):
    """Separate digits from adjacent lowercase letters with a single space.

    A space is inserted at every position where a digit is immediately
    followed by a lowercase letter (a-z or ă, î, â, ș, ț) or such a letter is
    immediately followed by a digit. Uppercase letters are not affected.

    Args:
        text: The text to clean up.

    Returns:
        The text with the extra spaces inserted.
    """
    letter = r'[a-zăîâșț]'
    # Zero-width match on the digit/letter and letter/digit boundaries.
    boundary = rf'(?<=\d)(?={letter})|(?<={letter})(?=\d)'
    return re.sub(boundary, ' ', text)


In [None]:
# Ask the user for an audio file and convert it to 16 kHz mono WAV.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No audio file was uploaded.")

# Only the first uploaded file is processed.
filename = next(iter(uploaded))
base, ext = os.path.splitext(filename)
# ffmpeg cannot read from and write to the same path, so a file that is
# already a .wav gets a distinct output name.
wav_filename = base + "_16k.wav" if ext.lower() == ".wav" else base + ".wav"

# Arguments are passed as a list (no shell), so unusual characters in the
# uploaded filename cannot break or alter the command. -y overwrites a
# previous conversion instead of prompting when the cell is re-run.
conversion = subprocess.run(
    ["ffmpeg", "-y", "-i", filename, "-ar", "16000", "-ac", "1", wav_filename],
    capture_output=True,
    text=True,
)
if conversion.returncode != 0:
    raise RuntimeError(
        f"ffmpeg failed to convert {filename!r}:\n{conversion.stderr}"
    )


Saving test3.ogg to test3.ogg
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-li

In [None]:
# Checkpoint id, defined once so the processor and the model are always
# loaded from the same checkpoint.
MODEL_ID = "gmihaila/wav2vec2-large-xlsr-53-romanian"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
# .eval() switches the module to evaluation mode and returns the module
# itself; assigning the result keeps the cell from printing the whole
# architecture as its output.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).eval()



Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [None]:
def transcribe_wav_file(file_path):
    """Transcribe an audio file and post-process the resulting text.

    The audio is loaded with torchaudio, averaged down to one channel when it
    has several, resampled to 16 kHz when needed, and passed through the
    module-level `processor` and `model`. The decoded text is lowercased and
    then run through normalize_transcription, convert_romanian_numbers and
    cleanup_transcription, in that order.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The post-processed transcription.
    """
    target_rate = 16000
    audio, rate = torchaudio.load(file_path)

    # Average the channels when the file has more than one.
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)
    if rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=target_rate)
        audio = resampler(audio)

    features = processor(audio.squeeze(), sampling_rate=target_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(features.input_values).logits

    # Keep the highest-scoring token id at each position, then decode.
    best_ids = logits.argmax(dim=-1)
    text = processor.decode(best_ids[0]).lower()

    # Text post-processing steps, applied in this fixed order.
    for step in (normalize_transcription, convert_romanian_numbers, cleanup_transcription):
        text = step(text)

    return text



def extract_medical_values(transcription):
    """Extract echocardiography measurements from a transcription.

    Every parameter has a regular expression whose first capture group is the
    numeric value that follows the parameter's name or abbreviation. Where a
    parameter can be named in two ways (for example "atriul stâng" or "AS"),
    the alternatives are wrapped in a non-capturing group so that the value
    is captured whichever spelling matched.

    Args:
        transcription: The (normalised) transcription text to search.

    Returns:
        A dict mapping each parameter name to the captured value as a string,
        or to "nedefinit" when the parameter was not found.
    """
    # NOTE(review): "TAP" and "Vmax" are matched case-sensitively; if the
    # transcription has been lowercased upstream and nothing restores these
    # spellings, those patterns cannot match - confirm intent.
    patterns = {
        "Ao_inel": r"aorta la inel[^\d]{0,5}(\d+)",
        "Ao_sinusuri": r"sinusuri[^\d]*(\d+)",
        "Ao_ascendenta": r"ascendent[^\d]*(\d+)",
        "AS": r"(?:atriul stâng|AS)[^\d]*(\d+)",
        "VD": r"(?:ventriculul drept|VD)[^\d]*(\d+[-/]?\d*)",
        "SIV": r"(?:sept.*interventricular|SIV)[^\d]*(\d+)",
        "VS": r"(?:ventriculul stâng|VS)[^\d]*(\d+/\d+)",
        "PP": r"(?:peretele posterior|PP)[^\d]*(\d+)",
        "FE": r"fracți[ae] de ejecție[^\d>]*[>]?(\d+)",
        "TAP": r"TAP[^\d]*(\d+)",
        "APdr": r"arter[ăa] pulmonar[ăa] dreapt[ăa][^\d]*(\d+)",
        "Apstg": r"arter[ăa] pulmonar[ăa] stâng[ăa][^\d]*(\d+)",
        "PCA": r"canal arterial.*?(\d+)",
        "Vmax_aorta": r"aortic[ăa].*?Vmax[^\d]*(\d+[\.,]?\d*)",
        "Vmax_pulm": r"pulmonar[ăa].*?Vmax[^\d]*(\d+[\.,]?\d*)",
        "Vmax_tricuspidian": r"tricuspidian[ăa].*?Vmax[^\d]*(\d+[\.,]?\d*)",
        "PSVD": r"presiune sistolic[ăa] VD[^\d]*(\d+[+-]?\d+)",
    }

    values = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, transcription)
        values[key] = match.group(1) if match else "nedefinit"
    return values



In [None]:
def save_values_to_word(values_dict, filename="rezultat_fisa_medicala.docx"):
    """Write the extracted values to a Word document as a two-column table.

    The document contains a title, the generation timestamp, a
    "Parametru" / "Valoare" table with one row per entry of `values_dict`
    (underscores in the parameter names are shown as spaces), and a closing
    note.

    Args:
        values_dict: Mapping of parameter name to the value text to display.
        filename: Path the document is saved to.

    Returns:
        The `filename` the document was saved under.
    """
    report = Document()
    report.add_heading("Date extrase din voce - Ecocardiografie", 0)
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M')
    report.add_paragraph(f"Data generării: {timestamp}")
    report.add_paragraph("")

    # The table starts with only the header row; data rows are appended below.
    grid = report.add_table(rows=1, cols=2)
    grid.style = 'Table Grid'
    header = grid.rows[0].cells
    for column, label in enumerate(('Parametru', 'Valoare')):
        header[column].text = label

    for key in values_dict:
        cells = grid.add_row().cells
        cells[0].text = key.replace("_", " ")
        cells[1].text = values_dict[key]

    report.add_paragraph("\n* Document generat automat din transcriere vocală.")
    report.save(filename)
    return filename


In [None]:
# Step 1: transcribe the converted recording and show the processed text.
transcript = transcribe_wav_file(wav_filename)
print("🔊 Transcriere numerică:\n", transcript)

# Step 2: extract the measurements and list them one per line.
extracted = extract_medical_values(transcript)
print("\n📋 Valori extrase:")
for name, value in extracted.items():
    print(f"{name}: {value}")

# Step 3: write the Word report and hand it to the browser for download.
docx_file = save_values_to_word(extracted)
files.download(docx_file)


🔊 Transcriere numerică:
 aorta la inel 8 a orta la si nosuri 2 sprezece a ort acendentăzece aste 3 sprezece VD 6 SIV 3 veste 20 PP 2 sprezece rede posterior 4 fracți de ijecție șaezeci la sutăteaPPtrei sprăzec ge bar 6 bară șanseî valva ortică ve max 1 virglă 2 valva pul monară ve max unul virglă 1 valva tricuspidiană ve max 2 virglă 5 prese VD douăși 4 pru cinciarcaortic ve max 1 virglă treialte detalii ila car merge băgat freceamic restrictiv cu diametru de aproximatii un milimentru

📋 Valori extrase:
Ao_inel: 8
Ao_sinusuri: nedefinit
Ao_ascendenta: nedefinit
AS: nedefinit
VD: 6
SIV: 3
VS: nedefinit
PP: 2
FE: nedefinit
TAP: nedefinit
APdr: nedefinit
Apstg: nedefinit
PCA: nedefinit
Vmax_aorta: nedefinit
Vmax_pulm: nedefinit
Vmax_tricuspidian: nedefinit
PSVD: nedefinit


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>