In [None]:
import pandas as pd
import numpy as np
import re
import requests
from PIL import Image
from io import BytesIO
import easyocr
from tqdm import tqdm
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

# Initialize the EasyOCR reader once at module level, with English (['en'])
# as its only language. extract_text_from_image() below reads this global,
# so every OCR call made from this cell goes through this one Reader object.
reader = easyocr.Reader(['en'])

def download_image(image_url, timeout=10):
    """Fetch an image over HTTP and return it as an RGB PIL image.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a server that stops responding would block the calling
            worker thread indefinitely.

    Returns:
        The decoded image converted to RGB, or None when the request fails,
        the server answers with an HTTP error status, or the body cannot be
        decoded. Failures are printed rather than raised.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        # Surface 4xx/5xx responses as a download error instead of trying to
        # decode an error page as an image.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
        return image
    except Exception as e:
        print(f"Error downloading image: {e}")
        return None

import threading

# OCR inference on the shared `reader` is serialized through this lock.
# process_row() is submitted to a ThreadPoolExecutor with ten workers, and the
# recorded run of this cell hit CUDA out-of-memory errors inside readtext();
# allowing only one inference at a time bounds how much a single moment can ask
# of the GPU while downloads still run in parallel.
_OCR_LOCK = threading.Lock()

def extract_text_from_image(image):
    """Run OCR on an image and return the recognized text as one string.

    Args:
        image: Any object ``np.array`` accepts; callers in this file pass the
            image returned by download_image().

    Returns:
        The recognized text fragments joined by single spaces, or "" when OCR
        raises. Failures are printed rather than raised so one bad image does
        not abort the surrounding batch.
    """
    try:
        # Convert before taking the lock; only the reader call needs it.
        pixels = np.array(image)
        with _OCR_LOCK:
            result = reader.readtext(pixels)
        return ' '.join([text for _, text, _ in result])
    except Exception as e:
        print(f"Error extracting text from image: {e}")
        return ""

def extract_number(text):
    """Return the first number found in ``text`` as a float.

    Recognizes integers ("12"), decimals ("12.5") and leading-dot decimals
    (".5" -> 0.5). A leading dot only counts when it is not glued to a
    preceding word character or dot, so "item.500" still yields 500.0.

    Args:
        text: String to scan; may be empty or None.

    Returns:
        The first number as a float, or None when ``text`` is empty/None or
        contains no digits.
    """
    if not text:
        return None
    match = re.search(r'\d+(?:\.\d+)?|(?<![\w.])\.\d+', text)
    return float(match.group()) if match else None

def process_row(row, most_frequent_units):
    """Build the prediction record for one test row.

    Args:
        row: Mapping with the keys 'index', 'entity_name' and 'image_link'.
        most_frequent_units: Mapping from entity name to the unit string to
            append to the extracted number.

    Returns:
        A dict with the row's 'index' and a 'prediction' that is either
        "<number with two decimals> <unit>" or "" when the image could not be
        downloaded, no number was found, or the entity has no known unit.
    """
    row_id, entity, url = row['index'], row['entity_name'], row['image_link']

    prediction = ""
    picture = download_image(url)
    if picture is not None:
        value = extract_number(extract_text_from_image(picture))
        if value is not None and entity in most_frequent_units:
            prediction = f"{value:.2f} {most_frequent_units[entity]}"

    return {'index': row_id, 'prediction': prediction}

def main(start_row=65594, max_workers=10):
    """Predict "<value> <unit>" strings for a slice of the test set and save them.

    Loads the train and test CSVs, finds the most frequent unit per
    ``entity_name`` in the training data, runs process_row() on every selected
    test row with a thread pool, writes ``submission.csv`` and prints a format
    check.

    Args:
        start_row: Position of the first test row to process. Rows from this
            position to the end are processed, last row first.
        max_workers: Number of threads in the pool that runs process_row().
    """
    # Load data
    train_data = pd.read_csv('/kaggle/input/amazon-ml-cleaned/train_clean.csv')
    test_data = pd.read_csv('/kaggle/input/amazon-ml/test.csv')
    # Keep the rows from position `start_row` to the end, in reverse order.
    test_data = test_data.iloc[start_row:][::-1]

    print(f"Training data size: {len(train_data)}")
    print(f"Test data size: {len(test_data)}")

    # Analyze most frequent unit for each entity_name
    entity_unit_freq = defaultdict(lambda: defaultdict(int))
    for entity_name, entity_value in zip(train_data['entity_name'], train_data['entity_value']):
        if not isinstance(entity_value, str):
            # A non-string value (e.g. NaN from an empty CSV cell) carries no
            # unit and would make re.findall raise TypeError.
            continue
        words = re.findall(r'[a-zA-Z]+', entity_value)
        # NOTE(review): only the last alphabetic token is kept, so a
        # multi-word unit would be cut down to its final word -- confirm
        # this is intended.
        unit = words[-1] if words else ''
        entity_unit_freq[entity_name][unit] += 1

    most_frequent_units = {entity: max(units, key=units.get)
                           for entity, units in entity_unit_freq.items()}

    print("\nMost frequent units for each entity:")
    for entity, unit in most_frequent_units.items():
        print(f"{entity}: {unit}")

    # Process test data. executor.map yields results in submission order, so
    # the saved file has a deterministic row order regardless of which
    # download finishes first.
    rows = [row for _, row in test_data.iterrows()]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(lambda row: process_row(row, most_frequent_units), rows)
        predictions = list(tqdm(results, total=len(rows), desc="Processing test data"))

    # Save predictions. Naming the columns keeps them present even when no
    # rows were selected.
    output_df = pd.DataFrame(predictions, columns=['index', 'prediction'])
    output_df.to_csv('submission.csv', index=False)
    print("\nPredictions saved to submission.csv")
    # Display the first few rows of the output
    print("\nFirst few rows of the output:")
    print(output_df.head(30).to_string())

    # Verify format: every non-empty prediction must look like "12.34 unit".
    pattern = re.compile(r'^\d+\.\d{2} [a-zA-Z]+$')
    valid_format = output_df['prediction'].apply(lambda x: bool(pattern.match(x)) if x else True).all()
    print(f"\nOutput format is {'valid' if valid_format else 'invalid'}")

# Entry point: call main() when this code runs as the top-level module,
# which includes executing the cell in a notebook kernel.
if __name__ == "__main__":
    main()

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


Training data size: 257652
Test data size: 65593

Most frequent units for each entity:
item_weight: gram
item_volume: millilitre
voltage: volt
wattage: watt
maximum_weight_recommendation: kilogram
height: centimetre
depth: centimetre
width: centimetre
Error extracting text from image: CUDA out of memory. Tried to allocate 1.39 GiB. GPU 0 has a total capacity of 15.89 GiB of which 959.12 MiB is free. Process 8158 has 14.95 GiB memory in use. Of the allocated memory 14.63 GiB is allocated by PyTorch, and 18.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Error extracting text from image: CUDA out of memory. Tried to allocate 1.56 GiB. GPU 0 has a total capacity of 15.89 GiB of which 959.12 MiB is free. Process 8158 has 14.95 GiB memory in use. Of the

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


In [None]:
# Keep the rows from position 65594 to the end and reverse their order.
# NOTE(review): in the visible cells `test_data` is only assigned inside
# main(), so this cell relies on a module-level `test_data` defined somewhere
# not shown here -- confirm it exists on a fresh kernel or remove the cell.
test_data = test_data.iloc[65594:][::-1]

In [None]:
import pandas as pd
import numpy as np
import re
import requests
from PIL import Image
from io import BytesIO
import easyocr
from tqdm import tqdm
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

# Initialize the EasyOCR reader for English text. This rebinds the
# module-level `reader` that the first cell also creates with the same call;
# extract_text_from_image() in this cell uses whichever object the name
# currently points to.
# NOTE(review): presumably this loads the OCR models a second time -- confirm
# whether the reader from the first cell could be reused instead.
reader = easyocr.Reader(['en'])

def download_image(image_url, timeout=10):
    """Download an image and return it as an RGB PIL image.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            server that never answers cannot block a worker thread forever.

    Returns:
        The decoded image converted to RGB, or None if the request fails,
        the response carries an HTTP error status, or the payload cannot be
        decoded. The error is printed, not raised.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        # Treat 4xx/5xx answers as failed downloads rather than decoding
        # whatever error body the server sent.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
        return image
    except Exception as e:
        print(f"Error downloading image: {e}")
        return None

import threading

# Only one thread at a time may run inference on the shared `reader`.
# main() submits process_row() to a ten-worker ThreadPoolExecutor, and an
# earlier recorded run of the same code reported CUDA out-of-memory errors
# from readtext(); serializing the call limits concurrent GPU demand to a
# single image while image downloads stay parallel.
_OCR_LOCK = threading.Lock()

def extract_text_from_image(image):
    """Return all text EasyOCR recognizes in ``image`` as one string.

    Args:
        image: Any object ``np.array`` accepts; callers in this file pass the
            image returned by download_image().

    Returns:
        Recognized text fragments joined with single spaces, or "" if OCR
        raises. The error is printed instead of propagated so that a single
        failing image does not stop the batch.
    """
    try:
        # Array conversion does not touch the reader, so do it unlocked.
        pixels = np.array(image)
        with _OCR_LOCK:
            result = reader.readtext(pixels)
        return ' '.join([text for _, text, _ in result])
    except Exception as e:
        print(f"Error extracting text from image: {e}")
        return ""

def extract_number(text):
    """Return the first number in ``text`` as a float, or None if there is none.

    Matches integers ("12"), decimals ("12.5") and decimals written with a
    leading dot (".5" -> 0.5). The leading-dot form is accepted only when the
    dot does not directly follow a word character or another dot, so a token
    such as "item.500" still produces 500.0.

    Args:
        text: String to scan; empty strings and None are accepted.

    Returns:
        The first number found, as a float, or None when ``text`` is
        empty/None or holds no digits.
    """
    if not text:
        return None
    match = re.search(r'\d+(?:\.\d+)?|(?<![\w.])\.\d+', text)
    return float(match.group()) if match else None

def process_row(row, most_frequent_units):
    """Produce the submission record for a single test row.

    Args:
        row: Mapping providing 'index', 'entity_name' and 'image_link'.
        most_frequent_units: Mapping from entity name to the unit string that
            is appended to the number read from the image.

    Returns:
        A dict holding the row's 'index' and its 'prediction'. The prediction
        is "<number with two decimals> <unit>", or "" when the download
        fails, OCR yields no number, or the entity is missing from
        ``most_frequent_units``.
    """
    record = {'index': row['index'], 'prediction': ""}
    entity = row['entity_name']
    link = row['image_link']

    img = download_image(link)
    if img is None:
        return record

    amount = extract_number(extract_text_from_image(img))
    if amount is None or entity not in most_frequent_units:
        return record

    record['prediction'] = f"{amount:.2f} {most_frequent_units[entity]}"
    return record

def main(output_file='submission.csv', batch_size=100, save_every=1000, max_workers=10):
    """Predict "<value> <unit>" strings for the test set with resumable saves.

    Loads the train and test CSVs, finds the most frequent unit per
    ``entity_name`` in the training data, then walks the test rows from last
    to first in batches. Predictions are appended to ``output_file`` as the
    run progresses; when that file already holds predictions, only rows whose
    'index' is smaller than the smallest saved index are processed.

    Args:
        output_file: CSV file that predictions are appended to and resumed from.
        batch_size: Number of rows submitted to the thread pool at a time.
        save_every: Buffered predictions are flushed to ``output_file`` once
            at least this many have accumulated (and after the final batch).
        max_workers: Number of threads in the pool that runs process_row().
    """
    # Imported here so this function does not depend on the notebook's import cell.
    import os

    # Load data
    train_data = pd.read_csv('/kaggle/input/amazon-ml-cleaned/train_clean.csv')
    test_data = pd.read_csv('/kaggle/input/amazon-ml/test.csv')

    print(f"Training data size: {len(train_data)}")
    print(f"Test data size: {len(test_data)}")

    # Analyze most frequent unit for each entity_name
    entity_unit_freq = defaultdict(lambda: defaultdict(int))
    for entity_name, entity_value in zip(train_data['entity_name'], train_data['entity_value']):
        if not isinstance(entity_value, str):
            # A non-string value (e.g. NaN from an empty CSV cell) carries no
            # unit and would make re.findall raise TypeError.
            continue
        words = re.findall(r'[a-zA-Z]+', entity_value)
        # NOTE(review): only the last alphabetic token is kept, so a
        # multi-word unit would be cut down to its final word -- confirm
        # this is intended.
        unit = words[-1] if words else ''
        entity_unit_freq[entity_name][unit] += 1

    most_frequent_units = {entity: max(units, key=units.get)
                           for entity, units in entity_unit_freq.items()}

    print("\nMost frequent units for each entity:")
    for entity, unit in most_frequent_units.items():
        print(f"{entity}: {unit}")

    # Reverse the test_data so it processes from last to first row
    test_data = test_data.iloc[::-1]

    # Resume support: rows are handled from the highest position downward and
    # saved in whole batches, so the smallest saved index marks how far a
    # previous run got.
    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
        if existing_df.empty:
            # A header-only file has no minimum index; comparing against the
            # resulting NaN would silently drop every row.
            print(f"{output_file} has no predictions yet. Starting from the last row.")
        else:
            last_processed_index = existing_df['index'].min()  # Get the smallest processed index
            print(f"Resuming from index {last_processed_index - 1} (backward)")
            # Keep only rows below the smallest index already saved.
            test_data = test_data[test_data['index'] < last_processed_index]
    else:
        print("No previous submission file found. Starting from the last row.")

    # Process test data in batches and flush to disk every `save_every` predictions
    predictions = []
    total_rows = len(test_data)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in tqdm(range(0, total_rows, batch_size)):
            batch = test_data.iloc[i:i+batch_size]
            # Build a fresh list for every batch. Reusing one growing list
            # would make as_completed() yield the earlier batches' futures
            # again and write their predictions to the file repeatedly.
            futures = [executor.submit(process_row, row, most_frequent_units)
                       for _, row in batch.iterrows()]

            # Collect this batch's predictions as they finish
            for future in as_completed(futures):
                predictions.append(future.result())

            # Save every 'save_every' rows, and always after the final batch
            if len(predictions) >= save_every or i + batch_size >= total_rows:
                output_df = pd.DataFrame(predictions)
                # Append to CSV file; write the header only when creating it
                output_df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)
                print(f"\nSaved {len(predictions)} rows to {output_file}")

                # Clear predictions for the next batch
                predictions = []

    print(f"\nPredictions saved to {output_file}")

    # Verify format of the output (optional)
    if not os.path.exists(output_file):
        # Nothing was processed and no earlier file exists, so there is
        # nothing to check.
        print(f"\n{output_file} was not created; skipping format check")
        return

    # keep_default_na=False keeps empty predictions as "" instead of NaN;
    # NaN is truthy and would be handed to re.match, raising TypeError.
    output_df = pd.read_csv(output_file, dtype={'prediction': str}, keep_default_na=False)
    pattern = re.compile(r'^\d+\.\d{2} [a-zA-Z]+$')
    valid_format = output_df['prediction'].apply(lambda x: bool(pattern.match(x)) if x else True).all()
    print(f"\nOutput format is {'valid' if valid_format else 'invalid'}")

# Entry point: run the resumable pipeline when this code executes as the
# top-level module, which includes running the cell in a notebook kernel.
if __name__ == "__main__":
    main()
