## Whisper implementation

In [10]:
import whisper
import pandas as pd
import os

# Cargar modelo de Whisper (elige "tiny", "base", "small", "medium" o "large")
model = whisper.load_model("small")  # Puedes cambiar el tamaño del modelo

# Ruta del archivo de audio (modifica con tu archivo)
audio_path = "audio_test.mp3"  # Puede ser .mp3, .wav, .m4a, .ogg, etc.

# Transcribir el audio
result = model.transcribe(audio_path)

# Extraer el texto transcrito
transcription = result["text"]

# Definir nombre del archivo CSV
csv_file = "transcriptions.csv"

# Verificar si el archivo existe para escribir encabezados solo si es la primera vez
file_exists = os.path.isfile(csv_file)

# Crear DataFrame con la nueva transcripción
df = pd.DataFrame({"Texto": [transcription]})

# Guardar en el CSV sin sobrescribir datos anteriores (modo 'a' de append)
df.to_csv(csv_file, mode='a', header=not file_exists, index=False, encoding='utf-8')

# Imprimir la transcripción
print("Transcripción agregada a 'transcriptions.csv':")
print(transcription)




Transcripción agregada a 'transcriptions.csv':
 Hello, my name is Romain and my favorite day of the week is Monday because from 8 to 9.50 we have PE, then we have a 15 minute break, then we have PE again from 10.00 5.00 to 11.00. It is so fun. After that from 11.00 to 11.55 we have music, then we have lunch from 11.55 to 1.10. After lunch from 1.10 to 2.5 we have French and to finish off the day we have Spanish. I also like Wednesdays because from 1.10 to 2.5 we have art that is one of my very favorite classes. I don't really like Tuesdays because there isn't any of my favorite classes and we have to start the day off with math. Thank you for listening. Bye!


## NLTK text Tokenization

In [11]:
import nltk
import pandas as pd

# Load the text from the csv file
df = pd.read_csv("transcriptions.csv")

# Tokenize the last entry in the 'Texto' column
if not df.empty and 'Texto' in df.columns:
    text = df['Texto'].iloc[-1]  # Get the last entry
    tokens = nltk.word_tokenize(str(text))  # Convert to string in case of NaN
    print(tokens)
else:
    print("The CSV file is empty or does not contain the 'Texto' column.")


['Hello', ',', 'my', 'name', 'is', 'Romain', 'and', 'my', 'favorite', 'day', 'of', 'the', 'week', 'is', 'Monday', 'because', 'from', '8', 'to', '9.50', 'we', 'have', 'PE', ',', 'then', 'we', 'have', 'a', '15', 'minute', 'break', ',', 'then', 'we', 'have', 'PE', 'again', 'from', '10.00', '5.00', 'to', '11.00', '.', 'It', 'is', 'so', 'fun', '.', 'After', 'that', 'from', '11.00', 'to', '11.55', 'we', 'have', 'music', ',', 'then', 'we', 'have', 'lunch', 'from', '11.55', 'to', '1.10', '.', 'After', 'lunch', 'from', '1.10', 'to', '2.5', 'we', 'have', 'French', 'and', 'to', 'finish', 'off', 'the', 'day', 'we', 'have', 'Spanish', '.', 'I', 'also', 'like', 'Wednesdays', 'because', 'from', '1.10', 'to', '2.5', 'we', 'have', 'art', 'that', 'is', 'one', 'of', 'my', 'very', 'favorite', 'classes', '.', 'I', 'do', "n't", 'really', 'like', 'Tuesdays', 'because', 'there', 'is', "n't", 'any', 'of', 'my', 'favorite', 'classes', 'and', 'we', 'have', 'to', 'start', 'the', 'day', 'off', 'with', 'math', '.',

## Sentiment Analysis

In [12]:
#We realize sentiment analysis from the tokenized text
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# Get the sentiment score
sentiment_score = sia.polarity_scores(text)
print(sentiment_score)


{'neg': 0.0, 'neu': 0.845, 'pos': 0.155, 'compound': 0.964}


In [None]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"  # Evita que use TensorFlow

from transformers import pipeline

# Convertimos la lista de tokens en una cadena
texto = " ".join(tokens)

# Cargamos el pipeline de NER usando PyTorch y código remoto
ner_pipeline = pipeline(
    "token-classification",
    model="AventIQ-AI/roberta-named-entity-recognition",
    aggregation_strategy="simple",
    trust_remote_code=True,
    framework="pt"
)

# Aplicamos el modelo al texto
entidades = ner_pipeline(texto)

# Mostramos las entidades detectadas
for entidad in entidades:
    print(f"- {entidad['word']} → {entidad['entity_group']} (score: {entidad['score']:.2f})")




  _torch_pytree._register_pytree_node(


RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
cannot import name 'float8_e4m3b11fnuz' from 'tensorflow.python.framework.dtypes' (/Users/carlosillanaldariz/miniconda3/lib/python3.11/site-packages/tensorflow/python/framework/dtypes.py)