In [20]:
import json

In [21]:
# Step 1: Load the JSON file
def load_json_file(json_file_path):
    """Parse the JSON document stored at ``json_file_path`` and return it.

    The file is decoded as UTF-8 instead of the platform's default encoding.
    The ``utf-8-sig`` codec also discards a leading byte-order mark, which
    ``json.load`` would otherwise reject with a ``JSONDecodeError``.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(json_file_path, 'r', encoding='utf-8-sig') as file:
        return json.load(file)

# Step 2: Read the text file into a single string
def load_text_file(text_file_path):
    """Read the whole file at ``text_file_path`` and return it lowercased.

    Args:
        text_file_path: Path of the text file to read.

    Returns:
        The file content as one string, converted with ``str.lower()`` so
        that later keyword comparisons can be case-insensitive.
    """
    with open(text_file_path, 'r') as source:
        contents = source.read()
    return contents.lower()

# Step 3: Function to check if each label keyword is found in the text
def find_missing_keywords(label_keywords, text):
    """Return the keywords that do not occur in ``text``, ignoring case.

    Matching is a plain substring test, not a whole-word match. Both sides
    are lowercased here, so the result does not depend on whether the caller
    already lowercased ``text``.

    Args:
        label_keywords: Iterable of keyword strings to look for.
        text: The text to search.

    Returns:
        A list of the keywords (in their original spelling and order) that
        were not found in ``text``.
    """
    # Lowercase the text once per call rather than once per keyword.
    haystack = text.lower()
    return [keyword for keyword in label_keywords if keyword.lower() not in haystack]

# Step 4: Process the JSON and check each label's keywords in the text
def process_labels(json_data, text):
    """Collect, per label, the keywords that are missing from ``text``.

    Args:
        json_data: Mapping with a ``'labels'`` entry; each label must provide
            ``'label_id'`` and ``'label_keywords'``.
        text: Text handed unchanged to ``find_missing_keywords``.

    Returns:
        A list of ``{'label_id': ..., 'missing_keywords': [...]}`` dicts, one
        for every label that has at least one missing keyword. Labels whose
        keywords were all found are left out.

    Raises:
        KeyError: If ``'labels'``, ``'label_id'`` or ``'label_keywords'`` is
            absent.
    """
    report = []
    for entry in json_data['labels']:
        identifier = entry['label_id']
        absent = find_missing_keywords(entry['label_keywords'], text)

        # Labels with nothing missing are not reported.
        if not absent:
            continue

        report.append({'label_id': identifier, 'missing_keywords': absent})

    return report

# Step 5: Save the results to a new JSON file or display them
def save_results_to_json(results, output_file):
    """Serialize ``results`` as JSON into ``output_file``.

    Args:
        results: JSON-serializable object to store.
        output_file: Destination path; an existing file is overwritten.

    The output is indented by four spaces.
    """
    with open(output_file, mode='w') as sink:
        json.dump(results, sink, indent=4)

In [22]:
# Input: JSON file holding the label definitions (replace with your own path).
json_file_path = "label.json"
# Input: text file that is searched for the label keywords (replace with your own path).
text_file_path = "hmns_1.txt"
# Output: where the list of missing keywords is saved.
output_file_path = "output_results.json"

In [23]:
# Load the label definitions and the text to search.
# NOTE(review): a later cell in this notebook defines load_text_file again,
# without the lowercasing step, so which version runs here depends on the
# order in which the cells were executed.
json_data = load_json_file(json_file_path)
text_data = load_text_file(text_file_path)

# For every label, collect the keywords that do not occur in the text.
results = process_labels(json_data, text_data)

# Write the per-label list of missing keywords to output_file_path as JSON.
save_results_to_json(results, output_file_path)


# Clean the text

In [35]:
# Step 1: Function to load the data from a text file
def load_text_file(file_path):
    """Read the whole file at ``file_path`` and return it unchanged.

    Note: this reuses the name of the ``load_text_file`` defined earlier in
    the notebook and replaces it once this cell has run. Unlike that earlier
    version, the content is NOT lowercased here.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    with open(file_path, 'r') as source:
        return source.read()

# Step 2: Function to process the data (split by ---new item--- and clean each item)
def split_data(data):
    """Split ``data`` into items at the item marker and flatten each item.

    The marker is recognised both as ``---new item--`` and as
    ``---new item---`` (the form ``write_to_text_file`` emits). Splitting on
    the shorter form alone would leave the marker's last dash at the start of
    the following item, so any dashes directly after the split point are
    removed.

    Args:
        data: The raw text containing zero or more marker-separated items.

    Returns:
        A list of non-empty strings, one per item. Leading and trailing
        whitespace is stripped from each item, and its lines are joined with
        single spaces.
    """
    chunks = data.split("---new item--")

    # Every chunk after the first starts right behind a marker; dashes found
    # there are the tail of a longer marker, not item content.
    chunks[1:] = [chunk.lstrip("-") for chunk in chunks[1:]]

    flattened = (" ".join(chunk.strip().splitlines()) for chunk in chunks)
    return [item for item in flattened if item]

# Step 3: Function to write processed data line by line into a new text file
def write_to_text_file(processed_items, output_file_path):
    """Write every item to ``output_file_path``, each followed by a marker.

    For each item the file receives the item text, one blank line, the line
    ``---new item---`` and another blank line. The marker is also written
    after the last item. Line endings are always ``\\n``.

    Args:
        processed_items: Iterable of strings to write, in order.
        output_file_path: Destination path; an existing file is overwritten.
    """
    separator = '---new item---\n\n'
    with open(output_file_path, 'w', newline='\n') as sink:
        for entry in processed_items:
            sink.write(entry + '\n\n' + separator)

# Step 4: Main function to load, process, and write data
def main(input_file_path, output_file_path):
    """Read the input file, split it into cleaned items, and write them out.

    Args:
        input_file_path: Path of the raw text file to read.
        output_file_path: Path of the file that receives the cleaned items.
    """
    raw_text = load_text_file(input_file_path)
    write_to_text_file(split_data(raw_text), output_file_path)

# Step 5: Specify the file paths and run the script
# NOTE(review): output_file_path is also assigned in an earlier cell
# ('output_results.json') and passed to save_results_to_json there. After this
# cell has run, the name points at the text output instead, so re-running that
# earlier save step without re-running its configuration cell would write the
# JSON results to 'hmns_1_updated.txt'.
input_file_path = 'hmns_1.txt'  # Replace with the path to your input file
output_file_path = 'hmns_1_updated.txt'  # Replace with the desired output file path

# Run the script: read input_file_path, clean it, and write output_file_path
main(input_file_path, output_file_path)


# Clean the JSON

In [44]:
import json

# Load the JSON data from a file
# ('label.json' is the same file name the keyword-check cells use as json_file_path.)
with open('label.json', 'r') as f:
    data = json.load(f)

# Function to clean the labels by removing unwanted keys from every label
def clean_labels(data, keys_to_remove=('minKeywords', 'label_keywords')):
    """Delete the given keys from every entry of ``data['labels']``.

    The labels are modified in place and the same ``data`` object is
    returned, so the caller's original structure is changed as well.

    Args:
        data: Mapping that may contain a ``'labels'`` list; if the key is
            absent nothing is changed.
        keys_to_remove: Keys to delete from each label when present.
            Defaults to ``'minKeywords'`` and ``'label_keywords'``.

    Returns:
        ``data`` itself, after the keys have been removed.
    """
    for label in data.get('labels', []):
        for key in keys_to_remove:
            # Only delete keys that exist, so labels lacking one are left alone.
            if key in label:
                del label[key]
    return data

# Clean the data
# clean_labels edits the label entries in place and returns the object it was
# given, so `cleaned_data` and `data` refer to the same structure afterwards.
cleaned_data = clean_labels(data)

# Save the cleaned JSON back to a file
# (a separate file, so 'label.json' itself is not overwritten)
with open('cleaned_label.json', 'w') as f:
    json.dump(cleaned_data, f, indent=4)

print("Cleaning process completed. Cleaned data saved to 'cleaned_label.json'")


Cleaning process completed. Cleaned data saved to 'cleaned_label.json'
