# Reading text files

In [1]:
import requests 
from bs4 import BeautifulSoup

In [2]:
# The text file includes structured and unstructured data, including a mix of plain text, key-value pairs, 
# nested-like structures, and irregular formatting.

# Path to save the file
file_path = './documents/text_file.txt'

import json

# Correcting the function to handle JSON-like content safely without using eval
def scrape_text_file_safe(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Extracted data structure
    extracted_data = {
        "Basic Information": {},
        "Projects": [],
        "Metadata": {},
        "Raw Data": None,
        "Irregular Data": [],
        "Mixed Content": {"JSON-like": {}, "Free Text": []},
    }

    # Temporary variables for processing
    current_section = None
    json_like_started = False
    json_like_buffer = []

    for line in lines:
        line = line.strip()
        
        if line.startswith("## Section"):
            # Identify sections by headers
            if "Basic Information" in line:
                current_section = "Basic Information"
            elif "Projects" in line:
                current_section = "Projects"
            elif "Raw Data" in line:
                current_section = "Raw Data"
            elif "Irregular Data" in line:
                current_section = "Irregular Data"
            elif "Mixed Content" in line:
                current_section = "Mixed Content"
            else:
                current_section = None

        elif current_section == "Basic Information" and ": " in line:
            # Parse key-value pairs for Basic Information
            key, value = line.split(": ", 1)
            extracted_data["Basic Information"][key.strip()] = value.strip()

        elif current_section == "Projects" and line.startswith("- Project"):
            # Start a new project
            extracted_data["Projects"].append({})
        
        elif current_section == "Projects" and ": " in line:
            # Parse project details
            key, value = line.split(": ", 1)
            extracted_data["Projects"][-1][key.strip()] = value.strip()

        elif current_section == "Raw Data":
            # Extract Raw Data content
            if line.startswith("@Metadata"):
                continue
            elif line.startswith("{") and "}" in line:
                key, value = line[1:-1].split(", ", 1)
                k1, v1 = key.split(": ")
                k2, v2 = value.split(": ")
                extracted_data["Metadata"][k1.strip()] = v1.strip()
                extracted_data["Metadata"][k2.strip()] = v2.strip()
            else:
                extracted_data["Raw Data"] = line

        elif current_section == "Irregular Data" and line.startswith("- Name"):
            # Parse Irregular Data rows
            parts = [p.strip() for p in line.split(" | ")]
            data_entry = {}
            for part in parts:
                if ": " in part:
                    key, value = part.split(": ", 1)
                    data_entry[key.strip()] = value.strip()
            extracted_data["Irregular Data"].append(data_entry)

        elif current_section == "Mixed Content":
            # Handle JSON-like and Free Text
            if line.startswith("{") or json_like_started:
                # Gather JSON-like content
                json_like_started = True
                json_like_buffer.append(line)
                if line.endswith("}"):
                    # Attempt to parse the gathered JSON-like structure
                    try:
                        extracted_data["Mixed Content"]["JSON-like"] = json.loads(
                            "\n".join(json_like_buffer).replace("'", '"')
                        )
                    except json.JSONDecodeError:
                        extracted_data["Mixed Content"]["JSON-like"] = {"error": "Invalid JSON-like structure"}
                    json_like_started = False
                    json_like_buffer = []
            else:
                # Collect free text content
                extracted_data["Mixed Content"]["Free Text"].append(line)

    return extracted_data

# Scraping the file safely
scraped_data_safe = scrape_text_file_safe(file_path)
scraped_data_safe



{'Basic Information': {'Name': 'John Doe',
  'Age': '35',
  'Occupation': 'Software Developer',
  'Skills': 'Python, JavaScript, SQL'},
 'Projects': [{'Role': 'Lead Developer',
   'Duration': '1 year',
   'Description': '"Developed a scalable e-commerce platform using Python and Django."'},
  {'Role': 'Full Stack Developer',
   'Duration': '8 months',
   'Description': '"Built a chat application using Node.js and React."'}],
 'Metadata': {'ID': '12345', 'Type': 'Confidential'},
 'Raw Data': '',
 'Irregular Data': [{'- Name': 'Alice Johnson',
   'Age': '29',
   'Occupation': 'Data Scientist'},
  {'- Name': 'Bob Smith', 'Age': '42', 'Occupation': 'Systems Analyst'}],
 'Mixed Content': {'JSON-like': {'error': 'Invalid JSON-like structure'},
  'Free Text': ['JSON-like:',
   ']',
   '}',
   '',
   'Free text: This is an example of a mixed-content section, where the text is interspersed with pseudo-structured data.',
   '',
   '--- End of File ---']}}

In [3]:
# let us first read the text file 
with open(file_path, 'r') as file:
    lines = file.readlines()

lines

['\n',
 '# Sample Data\n',
 '\n',
 '## Section 1: Basic Information\n',
 'Name: John Doe\n',
 'Age: 35\n',
 'Occupation: Software Developer\n',
 'Skills: Python, JavaScript, SQL\n',
 '\n',
 '## Section 2: Projects\n',
 '- Project 1: "E-commerce platform"\n',
 '  Role: Lead Developer\n',
 '  Duration: 1 year\n',
 '  Description: "Developed a scalable e-commerce platform using Python and Django."\n',
 '- Project 2: "Real-time Chat Application"\n',
 '  Role: Full Stack Developer\n',
 '  Duration: 8 months\n',
 '  Description: "Built a chat application using Node.js and React."\n',
 '\n',
 '## Section 3: Raw Data\n',
 '@Metadata\n',
 '  {ID: 12345, Type: Confidential}\n',
 '@Data\n',
 '  Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin mollis, libero at euismod auctor, sapien lorem aliquam mi, nec consequat lorem magna sit amet nisi.\n',
 '\n',
 '## Section 4: Irregular Data\n',
 '- Name: Alice Johnson  |  Age: 29 | Occupation: Data Scientist\n',
 '- Name: Bob Smith | Age: 42

In [10]:
# I see the sections start with ##
import re 

sections = [re.sub(r'## Section [0-9]: ', '', section.strip()) for section in lines if re.search(r'^##', section)]
sections 

['Basic Information',
 'Projects',
 'Raw Data',
 'Irregular Data',
 'Mixed Content']