In [1]:
import pandas as pd
import json
import os
import re
from pathlib import Path
from urllib.parse import urlparse, parse_qs

In [2]:
def organize_har_by_terms(har_file, localidades, details_types, base_output_dir="har_results"):
    """
    Organize the JSON responses of a HAR file into folders and files.

    One folder per term in ``localidades`` is created under
    ``base_output_dir``. An entry is a candidate for a folder when its
    request method is ``POST`` and its URL contains the folder term
    (case-insensitive). A candidate is saved only if its response MIME type
    mentions ``json`` and its URL contains one of ``details_types``; the
    first matching term, in list order, names the file. Only the parsed
    response body is written. Name clashes inside a folder are resolved by
    appending ``_2``, ``_3``, ... to the file name.

    Progress messages and a final summary are printed to stdout.

    Args:
        har_file: Path to .har file
        localidades: Iterable of string terms for folder-level organization
        details_types: Iterable of string terms for file-level naming; it is
            scanned once per candidate entry, so it must be re-iterable
        base_output_dir: Base directory for output (created with any missing
            parents)

    Returns:
        None. A missing or non-JSON HAR file and an empty entry list are
        reported with a printed message rather than an exception.
    """
    try:
        # Load HAR file
        with open(har_file, 'r', encoding='utf-8') as f:
            har_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File '{har_file}' not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: '{har_file}' is not a valid JSON file.")
        return

    # The folder terms are walked several times below (processing loop and
    # both summary loops); materialize them so one-shot iterables work too.
    localidades = list(localidades)

    # Create base output directory, including missing parents
    base_dir = Path(base_output_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    # Extract entries from HAR file
    entries = har_data.get('log', {}).get('entries', [])
    if not entries:
        print("No entries found in HAR file.")
        return

    summary = {}
    total_saved = 0
    skipped_no_second_term = 0
    skipped_not_json = 0

    for localidad in localidades:
        print(f"\n{'='*50}")
        print(f"Processing term: '{localidad}'")
        print(f"{'='*50}")

        # Create folder for this term
        folder_path = base_dir / sanitize_filename(localidad)
        folder_path.mkdir(exist_ok=True)

        # Track files for this term
        summary[localidad] = {
            'folder': str(folder_path),
            'files': {},
            'count': 0
        }

        # First pass: POST entries whose (lower-cased) URL contains the term
        folder_needle = localidad.lower()
        matching_entries = []
        for entry in entries:
            request = entry.get('request', {})
            url = request.get('url', '').lower()
            if folder_needle in url and request.get('method', '') == 'POST':
                matching_entries.append((entry, url))

        print(f"Found {len(matching_entries)} entries containing '{localidad}'")

        # Second pass: pick a file name for each candidate and save it
        entries_saved_this_term = 0
        for entry, url in matching_entries:
            content = entry.get('response', {}).get('content', {})
            # `or ''` guards against an explicit null mimeType in the HAR
            mime_type = (content.get('mimeType') or '').lower()
            text = content.get('text', '')

            # Only process JSON responses
            if 'json' not in mime_type:
                skipped_not_json += 1
                continue

            # First second-term found in the URL wins (url is already lower-case)
            file_term = next(
                (term for term in details_types if term.lower() in url),
                None,
            )

            # SKIP if no second term found
            if not file_term:
                skipped_no_second_term += 1
                continue

            # Find the first free file name: term.json, term_2.json, ...
            base_filename = sanitize_filename(file_term)
            file_counter = 1
            while True:
                filename = f"{base_filename}.json" if file_counter == 1 else f"{base_filename}_{file_counter}.json"
                filepath = folder_path / filename
                if not filepath.exists():
                    break
                file_counter += 1

            try:
                # Parse before opening the file so a bad body leaves no file behind
                json_content = json.loads(text) if text else {}
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(json_content, f, indent=2, ensure_ascii=False)
            except json.JSONDecodeError:
                print(f"  ✗ Could not parse JSON from: {url[:80]}...")
                continue
            except Exception as e:
                print(f"  ✗ Error saving file: {str(e)}")
                continue

            # Update summary
            files_by_term = summary[localidad]['files']
            files_by_term[file_term] = files_by_term.get(file_term, 0) + 1
            summary[localidad]['count'] += 1
            total_saved += 1
            entries_saved_this_term += 1

            print(f"  ✓ Saved: {filename} (matched: '{file_term}')")

        if entries_saved_this_term == 0:
            print(f"  No entries saved for '{localidad}' (no matching second terms found)")

    # Print summary
    print(f"\n{'='*60}")
    print("ORGANIZATION SUMMARY")
    print(f"{'='*60}")
    print(f"Total entries processed: {len(entries)}")
    print(f"Total JSON responses saved: {total_saved}")
    print(f"Skipped (no second term): {skipped_no_second_term}")
    print(f"Skipped (not JSON): {skipped_not_json}")
    print(f"Base output directory: {os.path.abspath(base_output_dir)}")
    print()

    # Only show folders that have saved files
    for localidad in localidades:
        if summary[localidad]['count'] > 0:
            print(f"Folder: '{localidad}' ({summary[localidad]['count']} files)")
            print(f"  Path: {summary[localidad]['folder']}")

            if summary[localidad]['files']:
                print("  Files organized by second term:")
                for file_term, count in summary[localidad]['files'].items():
                    print(f"    - '{file_term}': {count} file(s)")
            print()

    # Print empty folders warning
    empty_folders = [term for term in localidades if summary[term]['count'] == 0]
    if empty_folders:
        print(f"Empty folders (no matches with second terms): {', '.join(empty_folders)}")

def sanitize_filename(filename):
    """Reduce a string to a form that is safe to use as a file name.

    Steps, in order: characters in ``<>:"/\\|?*`` become ``_``; ASCII control
    characters are dropped; the result is cut to 100 characters; leading and
    trailing dots and spaces are removed. If nothing is left, ``"file"`` is
    returned.
    """
    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r'[<>:"/\\|?*]', '_'),      # problematic characters
        (r'[\x00-\x1f\x7f]', ''),    # control characters
    )

    cleaned = filename
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Truncate first, then trim, so the trim applies to the final edges
    cleaned = cleaned[:100].strip('. ')

    return cleaned if cleaned else "file"

# Alternative version with more options

def save_har_entry(entry, localidad, file_term, folder_path, stats):
    """Save a single HAR entry to file.

    The output is a JSON document with a ``metadata`` object (URL, method,
    status, timestamp, folder term, file term) and a ``content`` value
    holding the response body. The body is parsed as JSON when possible;
    if it cannot be parsed the raw text is stored instead, and an empty body
    is stored as ``{}``. The file is named after the sanitized ``file_term``,
    with ``_2``, ``_3``, ... appended while a file of that name already
    exists in ``folder_path``.

    Args:
        entry: HAR entry mapping; ``request``, ``response`` and
            ``startedDateTime`` are read from it.
        localidad: Folder-level term; used as the key into
            ``stats['by_folder']`` and recorded in the metadata.
        file_term: File-level term; names the file and keys the per-folder
            ``files`` counter.
        folder_path: Directory to write into. It is joined with ``/`` and
            queried with ``.exists()``, so a ``pathlib.Path`` is expected.
            It is not created here.
        stats: Mapping in which ``stats['by_folder'][localidad]['files']``
            already exists; its counter for ``file_term`` is incremented
            after a successful write.

    Returns:
        True if the file was written and the counter updated, False if
        either step raised (the error is printed).
    """
    request = entry.get('request', {})
    response = entry.get('response', {})
    url = request.get('url', '')
    content = response.get('content', {})
    text = content.get('text', '')

    # Create filename: first free name among term.json, term_2.json, ...
    base_name = sanitize_filename(file_term)
    counter = 1
    while True:
        filename = f"{base_name}.json" if counter == 1 else f"{base_name}_{counter}.json"
        filepath = folder_path / filename
        if not filepath.exists():
            break
        counter += 1

    # Prepare data. Fall back to the raw text when the body is not JSON.
    # Only the failures json.loads can raise are caught, so KeyboardInterrupt
    # and SystemExit still propagate.
    try:
        json_content = json.loads(text) if text else {}
    except (ValueError, TypeError, RecursionError):
        json_content = text

    data = {
        'metadata': {
            'url': url,
            'method': request.get('method'),
            'status': response.get('status'),
            'timestamp': entry.get('startedDateTime'),
            'folder_term': localidad,
            'file_term': file_term
        },
        'content': json_content
    }

    # Save file
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        # Update stats
        files_by_term = stats['by_folder'][localidad]['files']
        files_by_term[file_term] = files_by_term.get(file_term, 0) + 1

        print(f"  ✓ {filename}")
        return True
    except Exception as e:
        print(f"  ✗ Error saving {filename}: {str(e)}")
        return False

def extract_fallback_name(url, request, response):
    """Derive a stand-in name for an entry when no second term is found.

    The last non-empty segment of the URL path is preferred, with a trailing
    alphanumeric ``.ext`` suffix removed. If the path has no segments, or the
    segment is empty once the suffix is removed, the name is built from the
    request method (lower-cased, default ``unknown``) and the response status
    (default ``0``) as ``"<method>_<status>"``.

    Args:
        url: URL whose path supplies the preferred name.
        request: Mapping from which ``method`` is read.
        response: Mapping from which ``status`` is read.

    Returns:
        The derived name as a string.
    """
    segments = [segment for segment in urlparse(url).path.split('/') if segment]

    if segments:
        # Cut anything from a '?' or '#' onwards, then drop the file extension
        candidate = segments[-1].partition('?')[0].partition('#')[0]
        candidate = re.sub(r'\.[a-zA-Z0-9]+$', '', candidate)
        if candidate:
            return candidate

    # Nothing usable in the path: use method + status
    http_method = request.get('method', 'unknown').lower()
    http_status = response.get('status', 0)
    return f"{http_method}_{http_status}"

def print_summary(stats, base_output_dir):
    """Print the organization summary to stdout.

    Args:
        stats: Mapping with the counters ``total_processed``, ``saved``,
            ``skipped_no_second_term`` and ``skipped_not_json``, plus
            ``by_folder``, a mapping of folder name to a dict holding
            ``count`` and a ``files`` mapping of file term to count.
        base_output_dir: Output directory; shown as an absolute path.
    """
    # Overall counters
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total entries processed: {stats['total_processed']}")
    print(f"Saved: {stats['saved']}")
    print(f"Skipped - no second term: {stats['skipped_no_second_term']}")
    print(f"Skipped - not JSON: {stats['skipped_not_json']}")
    print(f"Output directory: {os.path.abspath(base_output_dir)}")

    # Per-folder breakdown; folders with nothing saved are left out
    print("\nBy folder:")
    for folder_name, folder_stats in stats['by_folder'].items():
        if not (folder_stats['count'] > 0):
            continue
        print(f"  {folder_name}: {folder_stats['count']} files")
        for term, saved_count in folder_stats['files'].items():
            print(f"    {term}: {saved_count}")

# def main():
#     # Example configuration
#     har_file = "../sources/har/airdna.har"
    
#     # First series of terms - creates folders
#     localidades = ["142649","142651"]
    
#     # Second series of terms - names files (must be found to save entry)
#     details_types = ["listing_type","bedrooms","minimum_stay"]
    
#     # Run organization
#     print("Organizing HAR file (skipping entries without second term)...")
#     organize_har_by_terms(
#         har_file=har_file,
#         localidades=localidades,
#         details_types=details_types,
#         base_output_dir="har_organized"
#     )

# if __name__ == "__main__":
#     # Command line usage
#     import sys
#     if len(sys.argv) > 1:
#         har_file = sys.argv[1]
        
#         # Parse terms from command line
#         if len(sys.argv) > 2:
#             localidades = [t.strip() for t in sys.argv[2].split(',')]
#         else:
#             localidades = ["api", "user"]
            
#         if len(sys.argv) > 3:
#             details_types = [t.strip() for t in sys.argv[3].split(',')]
#         else:
#             details_types = ["profile", "settings"]
            
#         base_output_dir = sys.argv[4] if len(sys.argv) > 4 else "har_results"
        
#         organize_har_by_terms(har_file, localidades, details_types, base_output_dir)
#     else:
#         main()

In [7]:
# NOTE(review): `har_file` is not referenced by the organize_har_by_terms call
# visible later in this notebook, which passes its own path literal — confirm
# which file is intended.
har_file = "../sources/har/airdna.har"

# First series of terms - the `id` column, cast to str
localidades = pd.read_csv('../sources/localidades.csv')['id'].astype('str')

# Second series of terms - names files (must be found to save entry)
details_types = ["listing_type", "bedrooms", "minimum_stay"]

localidades

0     142664
1     248952
2     142650
3     142649
4     142663
5     142654
6     142660
7     142658
8     249130
9     142656
10    141883
11    142652
12    142661
13    141029
14    142655
15    142659
16    142651
17    142665
Name: id, dtype: object

In [8]:
# Run the organizer: one folder per id in `localidades` is created directly
# under ../sources, and matching JSON responses are written into them.
# NOTE(review): the HAR path is given as a literal here instead of the
# `har_file` variable assigned in the previous cell (which names a different
# file) — confirm which file is intended.
print("Organizing HAR file (skipping entries without second term)...")
organize_har_by_terms(
    har_file='../sources/har/chapinero.har',
    localidades=localidades,
    details_types=details_types,
    base_output_dir="../sources"
)

Organizing HAR file (skipping entries without second term)...

Processing term: '142664'
Found 0 entries containing '142664'
  No entries saved for '142664' (no matching second terms found)

Processing term: '248952'
Found 0 entries containing '248952'
  No entries saved for '248952' (no matching second terms found)

Processing term: '142650'
Found 0 entries containing '142650'
  No entries saved for '142650' (no matching second terms found)

Processing term: '142649'
Found 0 entries containing '142649'
  No entries saved for '142649' (no matching second terms found)

Processing term: '142663'
Found 0 entries containing '142663'
  No entries saved for '142663' (no matching second terms found)

Processing term: '142654'
Found 0 entries containing '142654'
  No entries saved for '142654' (no matching second terms found)

Processing term: '142660'
Found 0 entries containing '142660'
  No entries saved for '142660' (no matching second terms found)

Processing term: '142658'
Found 0 entries