In [10]:
import json
import pandas as pd
import random
import string
import glob
import os
import re

In [None]:
# Input directory holding the raw .txt files and output directory for the CSV.
txt_dir = "raw_txt"
output_csv_dir = "parsed_csv"

# A 6-character random suffix gives each run its own output file name
# (collisions are unlikely, not impossible).
random_str = ''.join(random.choices(string.ascii_lowercase + string.digits, k=6))
output_csv_file = f'combined_dataset_{random_str}.csv'
output_csv = os.path.join(output_csv_dir, output_csv_file)

# Find all TXT files. glob returns them in arbitrary filesystem order, so sort
# to make the row order of the combined CSV reproducible between runs.
txt_files = sorted(glob.glob(os.path.join(txt_dir, '*.txt')))

# Accumulates the JSON objects extracted from every file
data_list = []

def extract_json_objects(txt_content):
    """Extract every top-level JSON object embedded in a block of text.

    If the text contains one or more ```json fences, the text before the
    first fence is treated as preamble and ignored, and every fenced section
    is scanned. Without a fence the whole text is scanned.

    Objects are decoded with ``json.JSONDecoder.raw_decode`` rather than a
    regular expression, so nested objects, arrays of objects inside an
    object, braces inside string values and a lone object that is not part
    of an array are all handled correctly.

    Args:
        txt_content: The raw text to scan (a ``str``).

    Returns:
        A list of decoded objects (``dict``) in the order they appear.
        Candidates that fail to decode are reported on stdout and skipped.
    """
    # Splitting on the fence (instead of deleting "everything up to the
    # fence") keeps the contents of earlier fenced sections when a file
    # contains more than one.
    segments = txt_content.split('```json')
    if len(segments) > 1:
        segments = segments[1:]

    decoder = json.JSONDecoder()
    objects = []
    for segment in segments:
        pos = 0
        while True:
            start = segment.find('{', pos)
            if start == -1:
                break
            try:
                obj, pos = decoder.raw_decode(segment, start)
            except json.JSONDecodeError as e:
                print(f"Skipped malformed JSON object: {e}")
                # Resume after the point where decoding failed so that
                # pieces of the broken object already consumed by the parser
                # are not picked up as if they were top-level objects.
                pos = max(e.pos, start + 1)
                continue
            objects.append(obj)
    return objects

# Load each TXT file and append the JSON objects it contains
for file_path in txt_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        txt_content = file.read()
    # Parse after the file is closed; the handle is only needed for reading.
    extracted_objects = extract_json_objects(txt_content)
    data_list.extend(extracted_objects)

# Make an empty result visible instead of silently writing an empty CSV.
if not data_list:
    print(f"Warning: no JSON objects extracted from {len(txt_files)} file(s); the CSV will be empty.")

# Create DataFrame (columns are the union of the keys of all objects)
df = pd.DataFrame(data_list)

# Replace missing values with None (with the default na_rep, missing values
# are written to the CSV as empty fields)
combined_df = df.where(pd.notnull(df), None)

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
if output_csv_dir:
    os.makedirs(output_csv_dir, exist_ok=True)

# Save combined DataFrame to CSV
combined_df.to_csv(output_csv, index=False)

print(f"Combined CSV saved to: {output_csv}")