In [3]:
import pandas as pd
import os
import re

# NER label vocabulary. The integer id of each label is its position in this
# list (starting at 0), so the order below is significant.
_ner_label_names = [
    'employerName',
    'employerAddressStreet_name',
    'employerAddressCity',
    'employerAddressState',
    'employerAddressZip',
    'einEmployerIdentificationNumber',
    'employeeName',
    'ssnOfEmployee',
    'box1WagesTipsAndOtherCompensations',
    'box2FederalIncomeTaxWithheld',
    'box3SocialSecurityWages',
    'box4SocialSecurityTaxWithheld',
    'box16StateWagesTips',
    'box17StateIncomeTax',
    'taxYear',
    'OTHER',  # Assuming OTHER is used for non-tagged elements
]

# Field name -> integer NER tag id
ner_tags_mapping = {label: tag_id for tag_id, label in enumerate(_ner_label_names)}

# Function to perform natural sorting
def natural_sort_key(s):
    """Build a sort key that orders strings "naturally" (file2 before file10).

    The string is split into alternating text and digit runs. Digit runs are
    compared as integers and text runs case-insensitively.

    Args:
        s: The string (e.g. a filename) to build a key for.

    Returns:
        A list alternating lower-cased text chunks (even positions) and ints
        (odd positions). The first element is always a text chunk, possibly
        empty, so keys from different strings stay mutually comparable.
    """
    # re.split with a capturing group always yields text, digits, text, ...
    # so every odd index is exactly a run matched by \d+.
    chunks = re.split(r'(\d+)', s)
    # Classify by position rather than str.isdigit(): isdigit() is also True
    # for characters such as superscript digits, which \d does not match and
    # int() cannot parse, so a lone chunk like that used to raise ValueError.
    return [
        int(chunk) if position % 2 else chunk.lower()
        for position, chunk in enumerate(chunks)
    ]

# Convert every annotation file in `folder_path` into one record per line of
# `output_file`. Each record is the Python repr of a dict with the keys
# 'id', 'file_name', 'tokens', 'bboxes' and 'ner_tags'.

# Set the directory path containing TSV files
folder_path = '/home/aswin/Desktop/Infrrd/dataset__/dataset/train/boxes_transcripts_labels'
output_file = '/home/aswin/Desktop/Infrrd/output.txt'  # Output file path

# Column layout of the annotation files (they carry no header row)
column_names = ['start_index', 'end_index', 'x_top_left', 'y_top_left',
                'x_bottom_right', 'y_bottom_right', 'transcript', 'field']
text_columns = ['transcript', 'field']
numeric_columns = [name for name in column_names if name not in text_columns]
bbox_columns = ['x_top_left', 'y_top_left', 'x_bottom_right', 'y_bottom_right']

# Initialize the id counter
id_counter = 0

# Get the list of TSV files and sort them naturally
file_list = sorted(
    [f for f in os.listdir(folder_path) if f.endswith('.tsv')],
    key=natural_sort_key
)

# Open the output file in write mode
with open(output_file, 'w') as out_file:
    # Iterate through each TSV file in the sorted list
    for filename in file_list:
        file_path = os.path.join(folder_path, filename)
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.tsv' that happens to occur earlier in the name.
        base_filename = os.path.splitext(filename)[0] + '.jpg'

        # NOTE(review): the files are parsed as comma-separated despite the
        # .tsv extension -- confirm this matches the dataset.
        #
        # The text columns are read as plain strings with pandas' default NA
        # markers disabled. Otherwise tokens such as 'NA', 'null' or 'None'
        # become float NaN, and a file whose transcripts are all numeric is
        # parsed as a numeric column (breaking `.str` and dropping leading
        # zeros). Numeric columns still treat a blank cell as missing.
        df = pd.read_csv(
            file_path,
            sep=',',
            header=None,
            names=column_names,
            dtype={name: str for name in text_columns},
            keep_default_na=False,
            na_values={name: [''] for name in numeric_columns},
        )

        # Remove quotes from the transcript column if they exist
        df['transcript'] = df['transcript'].str.replace('"', '', regex=False)

        # Prepare the data structure
        data = {
            'id': id_counter,  # Use the current id counter value
            'file_name': base_filename,  # Image name derived from the TSV name
            'tokens': df['transcript'].tolist(),
            'bboxes': df[bbox_columns].values.tolist(),
            # Labels missing from ner_tags_mapping are written as -1
            'ner_tags': [ner_tags_mapping.get(tag, -1) for tag in df['field'].tolist()]
        }

        # Write the dictionary as a string to the output file
        out_file.write(str(data) + '\n')  # Writes each data dictionary on a new line

        # Increment the id counter for the next file
        id_counter += 1



In [4]:
from sklearn.model_selection import train_test_split

# Split the records in output.txt into a 70/30 train/test pair of files.

# Input and output locations
input_file = '/home/aswin/Desktop/Infrrd/output.txt'
train_file = '/home/aswin/Desktop/Infrrd/train.txt'
test_file = '/home/aswin/Desktop/Infrrd/test.txt'

# Load every record, stripped of surrounding whitespace (one record per line)
with open(input_file, 'r') as source:
    data = [raw_line.strip() for raw_line in source]

# Shuffle and split with a fixed seed so the partition is reproducible
train_data, test_data = train_test_split(data, test_size=0.3, random_state=21)

# Write each partition to its own file, one record per line (train first)
for destination, subset in ((train_file, train_data), (test_file, test_data)):
    with open(destination, 'w') as sink:
        sink.writelines(f"{record}\n" for record in subset)

print(f"Data successfully split into {train_file} and {test_file}.")


Data successfully split into /home/aswin/Desktop/Infrrd/train.txt and /home/aswin/Desktop/Infrrd/test.txt.
