In [None]:
import csv

# Path to the input text file and output CSV file
input_file_path = '/content/Arya extraction.txt'
output_csv_path = '/content/output.csv'

# Line of '=' characters that separates consecutive records in the input file.
chunk_separator = "========================================"

# Header of the single CSV column. The cell that consumes output.csv reads it
# with csv.DictReader and looks rows up by this exact name, so the header row
# must be present; without it the first chunk would be taken as the header and
# every row['column_name'] lookup would raise KeyError.
chunk_column_name = 'column_name'

# Read the whole input file into memory.
with open(input_file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Split on the separator, trim surrounding whitespace, and drop pieces that
# are empty after trimming (e.g. before the first or after the last separator).
chunks = [piece.strip() for piece in content.split(chunk_separator) if piece.strip()]

# Write the header followed by one chunk per row. newline='' lets the csv
# module control line endings, which matters because chunks may span lines.
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([chunk_column_name])
    writer.writerows([chunk] for chunk in chunks)

print(f"Text chunks have been written to {output_csv_path}")


Text chunks have been written to /content/output.csv


In [None]:
import csv
import re

def extract_image_name(image_link):
    """Return the portion of ``image_link`` that follows its last ``/``.

    A link without any ``/`` is returned unchanged, and a link that ends with
    ``/`` yields an empty string.
    """
    _head, _slash, tail = image_link.rpartition('/')
    return tail

# Regex building blocks shared by every measurement pattern below.
# A number with an optional fractional part; '.' or ',' may be the separator.
_NUMBER = r'(\d+(?:[\.,]\d+)?)'
# What is allowed to follow a unit: a non-letter, the end of the text, or an
# 'x' dimension separator followed by a digit (so "10cmx20cm" still matches).
# Without this check a short unit matches the start of an unrelated word,
# e.g. "16 GB" -> "16 G", "5 miles" -> "5 M", "3 lbs" -> "3 L".
_UNIT_END = r'(?=[xX]\s*\d|[^A-Za-z]|$)'
# Width, height and length all accept the same set of length units.
_LENGTH_UNITS = r'CM|MM|M|METERS?|KM|KILOMETERS?|INCH(?:ES)?|IN|FT|FEET'

# Accepted unit spellings for each supported entity name (keys are lower-case).
_ENTITY_UNITS = {
    'weight': r'G|GRAMS?|GMS?|KG|KILOGRAMS?|OZ|OUNCES?',
    'width': _LENGTH_UNITS,
    'height': _LENGTH_UNITS,
    'length': _LENGTH_UNITS,
    'power': r'W|WATT(?:S)?',
    'voltage': r'V|VOLTS?',
    'speed': r'KM/H|KMH|MPH|MILES? PER HOUR|KILOMETERS? PER HOUR',
    'volume': r'L|LITERS?|ML|MILLILITERS?',
}

# Compiled once at definition time instead of being rebuilt on every call.
_ENTITY_PATTERNS = {
    name: re.compile(_NUMBER + r'\s*(' + units + r')' + _UNIT_END, re.IGNORECASE)
    for name, units in _ENTITY_UNITS.items()
}


def extract_entity_value(entity_name, text):
    """Find every "<number> <unit>" measurement for ``entity_name`` in ``text``.

    Args:
        entity_name: One of 'weight', 'width', 'height', 'length', 'power',
            'voltage', 'speed' or 'volume' (matched case-insensitively).
        text: The text to search.

    Returns:
        All matches joined with ", ", each formatted as "<value> <UNIT>": a
        decimal comma in the value is replaced by a dot and the unit is
        upper-cased exactly as it was spelled in the text (e.g. "500 grams"
        gives "500 GRAMS"). Returns "" when the entity name is unsupported,
        when either argument is empty/None, or when nothing matches.
    """
    # Nothing to search, or nothing to search for.
    if not entity_name or not text:
        return ""

    pattern = _ENTITY_PATTERNS.get(entity_name.lower())
    if pattern is None:
        return ""

    # findall yields (value, unit) tuples because the pattern has two groups.
    return ', '.join(
        f"{value.replace(',', '.')} {unit.upper()}"
        for value, unit in pattern.findall(text)
    )

# Step 1: Read test.csv and output.csv
# NOTE(review): these paths are relative, while the cell that produces
# output.csv writes to /content/output.csv -- presumably the notebook runs
# with /content as the working directory; confirm.
test_csv_path = 'test.csv'
output_csv_path = 'output.csv'
output_dev_csv_path = 'test_dev.csv'

# Load test.csv into a list of dictionaries (one per row, keyed by header).
# newline='' is required by the csv module for reading as well as writing;
# otherwise newlines embedded in quoted fields may be interpreted incorrectly.
with open(test_csv_path, 'r', newline='', encoding='utf-8') as test_file:
    test_data = list(csv.DictReader(test_file))

# Load the text chunks from output.csv; the file must have a header row whose
# single column is named 'column_name'. Chunks can span several lines, so
# newline='' matters here in particular.
with open(output_csv_path, 'r', newline='', encoding='utf-8') as output_file:
    output_data = [row['column_name'] for row in csv.DictReader(output_file)]

# Step 2: Process each row of test.csv and match it with output.csv
with open(output_dev_csv_path, 'w', newline='', encoding='utf-8') as dev_file:
    fieldnames = ['index', 'prediction']
    writer = csv.DictWriter(dev_file, fieldnames=fieldnames)
    writer.writeheader()  # Write the header

    for row in test_data:
        index = row['index']
        image_link = row['image_link']
        entity_name = row['entity_name']

        # The file name at the end of the link is the key used to find the
        # chunk that belongs to this image.
        image_name = extract_image_name(image_link)

        # Step 3: Take the first chunk whose text contains the image name.
        # An empty name (link missing or ending in '/') is a substring of
        # every string and would wrongly pick the first chunk, so skip the
        # search in that case.
        matching_text = None
        if image_name:
            matching_text = next((text for text in output_data if image_name in text), None)

        # Extract the value only when a non-empty chunk was found; otherwise
        # the prediction stays empty.
        prediction = extract_entity_value(entity_name, matching_text) if matching_text else ""

        # Step 4: Write the index and prediction to the test_dev.csv file
        writer.writerow({'index': index, 'prediction': prediction})
