In [3]:
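# This script scans a JSON Lines file and collects the 'text' field of every record whose text
# contains "(MOF)", "(MOFs)", "MOF-" or "aterial" (i.e. MOFs, Metal-Organic Frameworks, or materials in general).
# It then reports the index of the longest collected text and writes all collected texts to a new
# JSON Lines file, one {"content": ...} object per line.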

import json
# Module-level accumulator: filter_json_data() appends every matching text here.
mat_data = []

def filter_json_data(file_path):
    """Append the material-related texts of a JSON Lines file to ``mat_data``.

    Every non-blank line is parsed as JSON. A record is kept when it is an
    object whose ``'text'`` value is a string containing at least one of
    ``"(MOF)"``, ``"(MOFs)"``, ``"MOF-"`` or ``"aterial"`` (the last one
    matches both "Material" and "material").

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        None. Matching texts are appended to the module-level ``mat_data`` list.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    keywords = ("(MOF)", "(MOFs)", "MOF-", "aterial")
    # Decode explicitly as UTF-8: the platform default codec (e.g. a legacy
    # Windows code page) would mis-decode or reject non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (commonly a trailing one) carry no record.
            if not line.strip():
                continue
            data = json.loads(line)
            # Only JSON objects can carry a 'text' field.
            if not isinstance(data, dict):
                continue
            text = data.get('text')
            # A null / non-string 'text' cannot be searched for keywords.
            if isinstance(text, str) and any(keyword in text for keyword in keywords):
                mat_data.append(text)

# Example call: read the source JSON Lines file and append every matching text
# to the module-level `mat_data` list (the function itself returns nothing).
# NOTE(review): this is a machine-specific absolute Windows path; point it at
# your own copy of the data before running.
file_path = r'D:\项目记录\LLM数据\mat\train-00002-of-00020.json\s2ag.train.json'
filter_json_data(file_path)

# Locate the longest collected text. max() returns the first maximal item, so
# ties resolve to the lowest index; an empty list leaves both values at 0.
max_index = 0
max_length = 0
if mat_data:
    max_index = max(range(len(mat_data)), key=lambda position: len(mat_data[position]))
    max_length = len(mat_data[max_index])

print("The index of the longest element is:", max_index)

import pandas as pd
import spacy
# Run the longest collected text through spaCy's small English pipeline to see
# how many tokens it produces.
nlp = spacy.load("en_core_web_sm")
a = nlp(mat_data[max_index])
# A bare `len(a)` in the middle of a cell is evaluated and thrown away (only a
# cell's last expression is displayed), so print the token count explicitly.
print("Number of spaCy tokens in the longest element:", len(a))

# Turn every collected text into one {"content": ...} JSON object per line.
# The <p> / </p> tags are replaced by spaces first; json.dumps then performs
# all escaping. Hand-escaping only backslashes, quotes and '\n' leaves tabs,
# carriage returns and other control characters raw, which yields invalid JSON.
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 output.
modified_strings = [
    json.dumps({"content": text.replace('</p>', ' ').replace('<p>', ' ')}, ensure_ascii=False)
    for text in mat_data
]

# Write each serialized record as a separate line of the JSON Lines file
output_file_path = r'D:\项目记录\LLM数据\mat_jsonl\3.jsonl'
with open(output_file_path, 'w', encoding='utf-8') as file:
    for line in modified_strings:
        file.write(f"{line}\n")

print(f"Modified strings have been written in JSON format to '{output_file_path}'.")


The index of the longest element is: 40923
Modified strings have been written in JSON format to 'D:\项目记录\LLM数据\mat_jsonl\3.jsonl'.


In [None]:
# This script validates a JSON Lines file by checking each line for proper JSON syntax. 
# If any lines contain errors, they are deleted, and a summary of the process is printed.

import json
import os

def validate_jsonl(file_path):
    """Check every line of a JSON Lines file and delete the lines that fail to parse.

    Each line is passed to ``json.loads``; lines that raise
    ``json.JSONDecodeError`` (blank lines included) are reported on stdout and
    then removed by rewriting the file in place.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file. A leading
            byte-order mark is tolerated; it is dropped if the file is rewritten.

    Returns:
        False if the file does not exist, True otherwise.
    """
    # A set keeps the membership test of the rewrite pass O(1) per line.
    lines_to_delete = set()
    try:
        # 'utf-8-sig' strips an optional BOM. Read as plain 'utf-8', the BOM
        # stays glued to line 1, json.loads rejects it, and a perfectly valid
        # first record would be deleted.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line_number, line in enumerate(file, start=1):
                try:
                    json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error in line {line_number}: {e}")
                    print(f"Error line content: {line.strip()}")
                    lines_to_delete.add(line_number)

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return False

    # Delete erroneous lines
    if lines_to_delete:
        # Read everything before reopening for writing: 'w' truncates the file.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            lines = file.readlines()

        with open(file_path, 'w', encoding='utf-8') as file:
            for line_number, line in enumerate(lines, start=1):
                if line_number not in lines_to_delete:
                    file.write(line)

        print(f"Deleted {len(lines_to_delete)} lines with errors.")

    print(f"JSON Lines file '{file_path}' syntax is valid after processing!")
    return True

# Specify your JSON Lines file path (placeholder: replace it with a real file).
# The file is modified in place: lines that fail to parse are deleted.
jsonl_file_path = "path/to/your/jsonl_file.jsonl"
# As the last expression of the cell, the returned True/False is displayed as the cell output.
validate_jsonl(jsonl_file_path)

In [36]:
# This script splits a JSON Lines file into smaller chunks of a specified size. 
# Each chunk is saved as a separate JSON Lines file with a given prefix.

import json

def _write_jsonl_chunk(path, records):
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line."""
    with open(path, 'w', encoding='utf-8') as out_file:
        for item in records:
            out_file.write(json.dumps(item, ensure_ascii=False) + '\n')


def split_jsonl(input_file, output_prefix, chunk_size):
    """Split a JSON Lines file into numbered files of at most ``chunk_size`` records.

    Records are parsed with ``json.loads`` and re-serialized with
    ``json.dumps(..., ensure_ascii=False)``. They are written in input order to
    ``{output_prefix}_1.jsonl``, ``{output_prefix}_2.jsonl``, ...; the last file
    holds the remainder and may be shorter. Blank lines are skipped.

    Args:
        input_file: Path of the UTF-8 encoded JSON Lines file to split.
        output_prefix: Prefix (may include a directory) of the output files.
        chunk_size: Maximum number of records per output file; must be >= 1.

    Returns:
        None.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # With a size below 1 the "chunk is full" test never fires and the whole
    # input would silently land in one file, so reject it up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    with open(input_file, 'r', encoding='utf-8') as in_file:
        data = []
        file_number = 1
        for line in in_file:
            # A blank (often trailing) line is not a record; json.loads would reject it.
            if not line.strip():
                continue
            data.append(json.loads(line))
            if len(data) == chunk_size:
                _write_jsonl_chunk(f'{output_prefix}_{file_number}.jsonl', data)
                data = []
                file_number += 1
        # Flush the final, possibly partial, chunk.
        if data:
            _write_jsonl_chunk(f'{output_prefix}_{file_number}.jsonl', data)

# Example usage (placeholder input path: replace it with a real file).
input_file = 'path/to/your/input_file.jsonl'
# Output files are named '<output_prefix>_<n>.jsonl'; with this relative prefix
# they are created in the current working directory.
output_prefix = 'output'
# Number of records written to each output file (the last file may hold fewer).
chunk_size = 200
split_jsonl(input_file, output_prefix, chunk_size)

In [1]:
# This script validates a JSON Lines file by checking each line for valid JSON syntax. 
# It removes any lines that contain errors and reports the number of deleted lines.

import json
import os

def validate_jsonl(file_path):
    """Check every line of a JSON Lines file and remove the lines that fail to parse.

    Each line is passed to ``json.loads``; lines that raise
    ``json.JSONDecodeError`` (blank lines included) are reported on stdout and
    then removed by rewriting the file in place.

    Note:
        This notebook defines ``validate_jsonl`` in more than one cell; the
        definition from whichever cell ran last is the one in effect.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file. A leading
            byte-order mark is tolerated; it is dropped if the file is rewritten.

    Returns:
        False if the file does not exist, True otherwise.
    """
    # A set keeps the membership test of the rewrite pass O(1) per line.
    lines_to_delete = set()
    try:
        # 'utf-8-sig' strips an optional BOM. Read as plain 'utf-8', the BOM
        # stays glued to line 1, json.loads rejects it, and a perfectly valid
        # first record would be deleted.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line_number, line in enumerate(file, start=1):
                try:
                    json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error in line {line_number}: {e}")
                    print(f"Error line content: {line.strip()}")
                    lines_to_delete.add(line_number)

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return False

    # Remove erroneous lines
    if lines_to_delete:
        # Read everything before reopening for writing: 'w' truncates the file.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            lines = file.readlines()

        with open(file_path, 'w', encoding='utf-8') as file:
            for line_number, line in enumerate(lines, start=1):
                if line_number not in lines_to_delete:
                    file.write(line)

        print(f"Deleted {len(lines_to_delete)} lines with errors.")

    print(f"JSON Lines file '{file_path}' syntax is valid after processing!")
    return True

# Specify your JSON Lines file path (placeholder: replace it with a real file).
# The file is modified in place: lines that fail to parse are deleted.
# NOTE(review): the recorded output of this cell names a different file than
# this placeholder, so the path was presumably edited after the last run.
jsonl_file_path = "path/to/your/jsonl_file.jsonl"
# As the last expression of the cell, the returned True/False is displayed as the cell output.
validate_jsonl(jsonl_file_path)

JSON Lines file 'D:\项目记录\LLM数据\without_doi_2000MOFs.jsonl' syntax is valid after processing!


True