In [103]:
import spacy
from spacy.matcher import Matcher
import speech_recognition as sr

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Create the Matcher object
matcher = Matcher(nlp.vocab)

# Define patterns for matching cow IDs and milk quantities
patterns = {
    "Extracted_features": [
        [{'LIKE_NUM': True}, {'IS_ALPHA': True, 'IS_STOP': False}],
        [{'IS_ALPHA': True, 'IS_STOP': False}, {'LIKE_NUM': True}],
        [{'IS_ALPHA': True, 'IS_STOP': False, 'OP': '+'}, {'IS_PUNCT': True}, {'LIKE_NUM': True}],
        [{'LIKE_NUM': True}, {'IS_SPACE': True, 'OP': '?'}, {'IS_ALPHA': True, 'IS_STOP': False}],
        [{'LIKE_NUM': True}],
        [{'POS': 'PROPN'}]  # For matching proper nouns directly
    ],
    "QUANTITY_KG": [
        [{'LIKE_NUM': True}, {'LOWER': {'IN': ['kg', 'kilograms']}}]
    ],
    "QUANTITY_L": [
        [{'LIKE_NUM': True}, {'LOWER': {'IN': ['l', 'liters', 'litres']}}]
    ]
}

# Add patterns to the matcher
for label, pattern_list in patterns.items():
    for pattern in pattern_list:
        matcher.add(label, [pattern])

# Initialize the speech recognition recognizer
r = sr.Recognizer()

def transcribe_audio(file_path):
    with sr.AudioFile(file_path) as source:
        audio_data = r.record(source)
        try:
            text = r.recognize_google(audio_data)
            print(f"Transcription: {text}")
            return text
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand audio")
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")

def analyze_text(text):
    doc = nlp(text)
    matches = matcher(doc)
    seen_tokens = set()
    filtered_matches = []
    for match_id, start, end in sorted(matches, key=lambda x: x[2] - x[1], reverse=True):
        if any(tok in seen_tokens for tok in range(start, end)):
            continue
        filtered_matches.append((match_id, start, end))
        seen_tokens.update(range(start, end))
    
    print("Extracted Information:")
    for match_id, start, end in filtered_matches:
        span = doc[start:end]  # The matched span
        label = nlp.vocab.strings[match_id]  # Get the label of the match
        if label in ["QUANTITY_KG", "QUANTITY_L"]:
            print(f"{label}: {span[0].text} {span[1].text}")  # Print the number and unit
        else:
            print(f"{label}: {span.text}")

# List of audio file paths
audio_files = ['audio1.wav', 'audio2.wav', 'audio5.wav', 'audio3.wav', 'audio6.wav']

# Transcribe and analyze each audio file
for file_path in audio_files:
    transcribed_text = transcribe_audio(file_path)
    if transcribed_text:
        analyze_text(transcribed_text)


Transcription: the yield of w 704 today is 8 kg
Extracted Information:
Extracted_features: w 704
Extracted_features: 8 kg
Transcription: babila has given 5 kg milk today
Extracted Information:
Extracted_features: given 5
Extracted_features: babila
Transcription: w 7049 Babulal both give 8 kg and 51 l of milk today
Extracted Information:
Extracted_features: w 7049
Extracted_features: 8 kg
Extracted_features: 51 l
Extracted_features: Babulal
Transcription: 6 kg milk given by Shanti
Extracted Information:
Extracted_features: 6 kg
Extracted_features: Shanti
Transcription: today I have Milkar 10 kg from Rosy and 10 kg from Shanti
Extracted Information:
Extracted_features: Milkar 10
Extracted_features: 10 kg
Extracted_features: Rosy
Extracted_features: Shanti


In [44]:
# Using Regular Expressions


import speech_recognition as sr
import re
import logging

# Setting up logging to print to console
logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')

# Function to transcribe audio
def transcribe_audio(audio_file, recognizer, source):
    try:
        audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
        print(f"Transcription for {audio_file}: {text}")  # Print transcription
        return text
    except sr.UnknownValueError:
        logging.error(f"Audio was not understood in file: {audio_file}")
        return None
    except sr.RequestError:
        logging.error(f"Failed to obtain results from file: {audio_file}")
        return None

# Function to extract data from text
def extract_data_from_text(text):
    patterns = [
        r"the yield of (?:w\s)?(\d+) today is (\d+) kg",
        r"(\w+) has given (\d+) kg milk",
        r"(\d+) kg milk given by (\w+)",
        r"(\d+)\s+(\w+)\s+both give (\d+) kg and (\d+) l of milk",
        r"Cow (\d+) gave (\d+) Kg",  # New pattern for "Cow 32 gave 8 Kg"
        r"Milked (\d+) kg from (\w+)"  # New pattern for "Milked 10 kg from Daisy"
    ]
    for pattern in patterns:
        matches = re.finditer(pattern, text)
        for match in matches:
            groups = match.groups()
            if pattern.startswith("(\d+) kg"):  # Reverse if the number comes before the name
                groups = groups[::-1]
            if len(groups) == 4:  # Handling complex pattern
                print(f"Match found: Cow IDs: {groups[0]}, {groups[1]}, Milk Yields: {groups[2]} kg, {groups[3]} l")
            else:
                cow_id, milk_yield = groups
                print(f"Match found: Cow ID: {cow_id}, Milk Yield: {milk_yield} kg")

# Main function to process files
def process_files(file_list):
    recognizer = sr.Recognizer()
    
    for file in file_list:
        with sr.AudioFile(file) as source:
            transcription = transcribe_audio(file, recognizer, source)
            if transcription:
                extract_data_from_text(transcription)
            else:
                print(f"No data extracted for {file}")

# List of audio file names
audio_files = ['audio1.wav', 'audio2.wav', 'audio3.wav', 'audio5.wav','audio6.wav']

# Execute the processing
process_files(audio_files)


  if pattern.startswith("(\d+) kg"):  # Reverse if the number comes before the name


Transcription for audio1.wav: the yield of w 704 today is 8 kg
Match found: Cow ID: 704, Milk Yield: 8 kg
Transcription for audio2.wav: babila has given 5 kg milk today
Match found: Cow ID: babila, Milk Yield: 5 kg
Transcription for audio3.wav: 6 kg milk given by Shanti
Match found: Cow ID: Shanti, Milk Yield: 6 kg
Transcription for audio5.wav: w 7049 Babulal both give 8 kg and 51 l of milk today
Match found: Cow IDs: 7049, Babulal, Milk Yields: 8 kg, 51 l
Transcription for audio6.wav: today I have Milkar 10 kg from Rosy and 10 kg from Shanti
