### Read the scraped files from Wikipedia and extract history

In [17]:
import re

def find_history(wiki_content):
   """Return the body of the ``== History ==`` section of a wikitext page.

   The body starts right after the History heading and ends just before the
   next level-2 heading (``== Title ==``). Deeper headings such as
   ``=== Early years ===`` do not end the section. When History is the last
   section of the page, the body runs to the end of the text.

   Args:
      wiki_content: Raw wikitext of a page, as a string.

   Returns:
      The section body with leading/trailing whitespace stripped, or None
      when no History heading is present.
   """
   # The `|\Z` alternative lets the section end at end-of-text; without it a
   # page whose History section is the final one would yield no match at all.
   match = re.search(r'==\s*History\s*==([\s\S]*?)(?=\n==[^=]+==|\Z)', wiki_content)
   if match is None:
      return None
   return match.group(1).strip()
   

In [98]:
def remove_nested_braces(text):
    """Delete every ``{{ ... }}`` span from ``text``, honouring nesting.

    The text is scanned left to right while tracking how many ``{{`` are
    currently open. Characters are kept only while nothing is open.

    Edge behaviour:
      * a ``}}`` met while nothing is open is dropped from the output;
      * an ``{{`` that is never closed swallows the rest of the text.

    Args:
        text: Input string, possibly containing ``{{...}}`` spans.

    Returns:
        The text with all such spans removed.
    """
    kept = []
    depth = 0  # number of '{{' currently open
    pos = 0
    length = len(text)

    while pos < length:
        pair = text[pos:pos + 2]
        if pair == "{{":
            depth += 1
            pos += 2
        elif pair == "}}":
            # Never go below zero: an unmatched '}}' is simply skipped.
            if depth > 0:
                depth -= 1
            pos += 2
        else:
            if depth == 0:
                kept.append(text[pos])
            pos += 1

    return ''.join(kept)

In [109]:
def clean_history(history):
   """Strip wiki markup from a History section, leaving readable prose.

   Steps, in order:
     1. drop ``[[File:...]]`` embeds (the caption may contain one level of
        ``[[...]]`` links);
     2. unwrap ``[[...]]`` links, keeping only the label of a piped link
        (``[[Target|label]]`` becomes ``label``);
     3. drop ``{{...}}`` templates, nested ones included;
     4. drop ``<ref>`` footnotes, both self-closing and paired;
     5. drop ``<gallery>...</gallery>`` blocks.

   Args:
      history: Wikitext of the section, as a string.

   Returns:
      The cleaned text with leading/trailing whitespace stripped.
   """
   text = history

   # File embeds go first so their captions are not unwrapped as links.
   text = re.sub(r'\[\[File:[^\[\]]*(?:\[\[[^\[\]]*\]\][^\[\]]*)*\]\]', '', text)

   # The optional prefix consumes "Target|" so only the displayed label is
   # kept; an unpiped link still yields its whole content.
   text = re.sub(r'\[\[(?:[^\[\]|\n]*\|)?(.*?)\]\]', r'\1', text)

   text = remove_nested_braces(text)
   # Second, regex-based pass retained as a safety net after the balanced
   # removal above.
   text = re.sub(r'{{(.*?)}}', '', text, flags=re.DOTALL)

   # Self-closing refs (<ref name="x" />) must be removed before paired ones:
   # otherwise they are taken for an opening tag and everything up to the next
   # </ref> -- ordinary prose included -- is deleted with them.
   text = re.sub(r'<ref\b[^>]*/\s*>', '', text)
   # \b keeps this from matching other tags that merely start with "ref",
   # such as <references />.
   text = re.sub(r'<ref\b[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
   text = re.sub(r'<gallery.*?>.*?</gallery>', '', text, flags=re.DOTALL)

   return text.strip()

# Spot check: run the cleaner on a single extracted section and dump the
# result to a scratch file for manual inspection.
with open("history/Miami.txt", 'r', encoding='utf-8') as file:
   file_content = file.read()

cleaned = clean_history(file_content)

with open("ana.txt", "w", encoding="utf-8") as file_out:
   file_out.write(cleaned)

In [None]:
import os

# Input folder with the scraped pages; output folder receiving one
# History-section file per page (same file name).
source_dir = 'new_cities/'
destination_dir = 'history/'

# Make sure the output folder is present before anything is written to it.
os.makedirs(destination_dir, exist_ok=True)

# Number of pages for which a History section was found.
count = 0
for filename in os.listdir(source_dir):
   source_path = os.path.join(source_dir, filename)
   destination_path = os.path.join(destination_dir, filename)

   # Anything that is not a regular file is reported and skipped.
   if not os.path.isfile(source_path):
      print(f"Skipped (not a file): {source_path}")
      continue

   with open(source_path, 'r', encoding='utf-8') as file:
      file_content = file.read()

   history = find_history(file_content)
   if history is None:
      # No History section found: nothing is written for this page.
      continue

   count += 1
   with open(destination_path, "w", encoding="utf-8") as file:
      file.write(history)

print("File transfer complete!")
print("Successfully extracted history:", count)


File transfer complete!
Successfully extracted history: 1957


### Clean the history text

In [113]:
history_dir = 'history/'

# Create the directory if it doesn't exist so the listing below cannot fail
os.makedirs(history_dir, exist_ok=True)

# Clean every extracted History file in place: read it, strip the wiki
# markup, and write the result back to the same path.
count = 0
for filename in os.listdir(history_dir):
   file_path = os.path.join(history_dir, filename)

   # Check if it's a file (not a directory)
   if os.path.isfile(file_path):
      with open(file_path, 'r', encoding='utf-8') as file:
         file_content = file.read()
      history = clean_history(file_content)
      with open(file_path, "w", encoding="utf-8") as file:
         file.write(history)
         count += 1

   else:
      # Report the path examined by this loop. `file_path` is the only path
      # variable this cell defines, so the message no longer depends on
      # state left behind by another cell.
      print(f"Skipped (not a file): {file_path}")

print("File transfer complete!")
print("Successfully extracted history:", count)


File transfer complete!
Successfully extracted history: 1957
