# Search for Keywords

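The following cell runs a recursive wildcard search over every `.oxt`, `.txt`, and `.csv` file under `base_path` and displays the results in a DataFrame: one row per file and keyword, with a preview of the first match in which the hit is highlighted.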


In [None]:
import os
import re
import pandas as pd
from pathlib import Path
from IPython.display import display, HTML

# Folder that is searched recursively. An empty string resolves to the
# current working directory.
base_path = r''

# Keywords to search for; '*' and '?' are treated as wildcards.
# Further candidates that can be added to the list:
# "alien*", "deep state", "fake", "ufo*", "conspir*", "fib", "iaa", "reptil*"
wildcard_keywords = ["terror*"]

def wildcard_to_regex(pattern):
    """Compile a wildcard keyword into a case-insensitive regular expression.

    ``*`` matches any run of word characters (possibly empty) and ``?``
    matches exactly one word character, so a wildcard extends the keyword to
    the end of the current word instead of swallowing the rest of the line.
    Every other character of the keyword is matched literally.

    Args:
        pattern: Keyword that may contain ``*`` and ``?`` wildcards.

    Returns:
        A compiled pattern whose group 1 spans the complete match.
    """
    wildcard_regex = {'*': r'\w*', '?': r'\w'}

    # Splitting on a capturing group keeps each wildcard as its own token, so
    # literal text is escaped separately and never confused with a wildcard.
    tokens = re.split(r'([*?])', pattern)
    regex_pattern = ''.join(
        wildcard_regex[token] if token in wildcard_regex else re.escape(token)
        for token in tokens
    )
    return re.compile(f"({regex_pattern})", re.IGNORECASE)

def search_with_highlights(root_dir, patterns):
    """Search text files below *root_dir* for wildcard keywords.

    Walks ``root_dir`` recursively and reads every ``.oxt``, ``.txt`` and
    ``.csv`` file as UTF-8 (undecodable bytes are dropped). For each file the
    first match of every pattern is recorded together with a context snippet
    (up to 50 characters before and 100 after the match) in which the match
    is wrapped in ``<b>`` tags.

    Args:
        root_dir: Directory that is searched recursively.
        patterns: Iterable of wildcard keywords; each one is compiled with
            ``wildcard_to_regex``.

    Returns:
        A DataFrame with the columns ``File``, ``Path`` (relative to
        ``root_dir``), ``Pattern``, ``Match`` and ``Context``, holding one
        row per (file, pattern) pair that matched. The columns are present
        even when nothing matched.
    """
    columns = ['File', 'Path', 'Pattern', 'Match', 'Context']
    results = []
    root_path = Path(root_dir)
    extensions = {'.oxt', '.txt', '.csv'}
    compiled_regexes = {p: wildcard_to_regex(p) for p in patterns}

    for file_path in root_path.rglob('*'):
        if file_path.suffix.lower() not in extensions:
            continue
        # A directory can carry one of the searched extensions; skip it
        # instead of failing on open().
        if not file_path.is_file():
            continue

        # Only the read can fail for reasons outside our control; an
        # unreadable file is reported and skipped so the search continues.
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as exc:
            print(f"Skipped unreadable file {file_path}: {exc}")
            continue

        for original_pattern, regex in compiled_regexes.items():
            match = regex.search(content)
            if not match:
                continue

            start, end = match.span()
            matched_text = match.group(1)

            # Clamp the context window to the bounds of the file content.
            c_start = max(0, start - 50)
            c_end = min(len(content), end + 100)

            # Flatten line breaks so the snippet stays on one table row.
            before = content[c_start:start].replace('\n', ' ')
            after = content[end:c_end].replace('\n', ' ')

            # NOTE: file text is inserted verbatim (not HTML-escaped); only
            # the <b> tags around the match are added here.
            highlighted_context = f"[...] {before}<b>{matched_text}</b>{after} [...]"

            results.append({
                'File': file_path.name,
                'Path': file_path.relative_to(root_path),
                'Pattern': original_pattern,
                'Match': matched_text,
                'Context': highlighted_context
            })

    # Fixing the columns keeps the frame's shape stable when nothing matched.
    return pd.DataFrame(results, columns=columns)

# Run the search over the configured folder with the configured keywords.
df_results = search_with_highlights(base_path, wildcard_keywords)

if df_results.empty:
    print("No matches.")
else:
    print(f"Matches: {len(df_results)}")
    # escape=False lets the <b> tags in the Context column render as bold.
    results_html = df_results.to_html(escape=False)
    display(HTML(results_html))


# Group Matches by Folder

The following cell analyzes the distribution of matches by grouping them by the top-level folder of each file (relative to the search root), which helps identify content hotspots.


In [None]:
def _top_level_folder(rel_path):
    """Return the first directory of *rel_path*, or '.' for a file in the root."""
    parts = str(rel_path).split(os.sep)
    # A single component is just the file name: the file sits directly in the
    # search root and must not be counted as a folder of its own.
    return parts[0] if len(parts) > 1 else '.'


if df_results.empty:
    # An empty result may lack the 'Path' column, so there is nothing to group.
    print("No matches.")
else:
    df_results['Folder'] = df_results['Path'].apply(_top_level_folder)
    stats = df_results['Folder'].value_counts().to_frame('Matches')

    print("Matches by folder:")
    display(stats)


# Export Matches

The following cell strips the HTML highlighting tags from the `Context` column and exports the results to a UTF-8 CSV file (`gta_conspiracy_matches.csv`) in the current working directory.


In [None]:
def clean_html(raw_html):
    """Remove everything that looks like an HTML tag from a string.

    Args:
        raw_html: Value to clean. Non-string values are returned unchanged.

    Returns:
        The input with every ``<...>`` span removed, or the input itself when
        it is not a string.
    """
    if isinstance(raw_html, str):
        # Non-greedy so each tag is removed on its own, keeping the text
        # between an opening and a closing tag.
        return re.sub('<.*?>', '', raw_html)
    return raw_html

# Export only when there is something to write.
if not df_results.empty:
    csv_name = "gta_conspiracy_matches.csv"
    separator = "-" * 50

    # Work on a copy so the highlighted frame shown earlier stays untouched.
    df_export = df_results.copy()
    df_export['Context'] = df_export['Context'].apply(clean_html)

    # utf-8-sig writes a byte-order mark at the start of the file.
    df_export.to_csv(csv_name, index=False, encoding='utf-8-sig')

    # The file is written relative to the current working directory.
    abs_path = os.path.join(os.getcwd(), csv_name)
    print(separator)
    print(f"File saved as: {abs_path}")
    print(separator)
