# RVX Data Catalog & Exploration

Comprehensive exploration of all datasets in the RVX folders:
- `traveling_survey/` - National travel survey data
- `zonal_register_data/` - Zonal statistical data (SDAT files)

In [1]:
import os
from pathlib import Path
import pandas as pd
import json
from datetime import datetime
from collections import defaultdict

# Setup
from synlab.utils import get_project_root

project_root = get_project_root()
rvx_path = project_root / 'data' / 'raw' / 'population' / 'rvx'

print(f"Project root: {project_root}")
print(f"RVX path: {rvx_path}")
print(f"Path exists: {rvx_path.exists()}")

Project root: /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab
RVX path: /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/data/raw/population/rvx
Path exists: True


## 1. Folder Overview

List all files and their sizes in both folders.

In [2]:
def get_folder_structure(folder_path):
    """
    Walk through folder and collect all files with metadata.
    Returns list of dicts with file info.
    """
    files_info = []
    
    for root, dirs, files in os.walk(folder_path):
        # Skip .DS_Store and other system files
        files = [f for f in files if not f.startswith('.')]
        
        for file in files:
            file_path = Path(root) / file
            size_bytes = os.path.getsize(file_path)
            size_mb = size_bytes / (1024 * 1024)
            
            rel_path = file_path.relative_to(folder_path)
            
            files_info.append({
                'filename': file,
                'relative_path': str(rel_path),
                'full_path': str(file_path),
                'size_bytes': size_bytes,
                'size_mb': round(size_mb, 2),
                'extension': Path(file).suffix
            })
    
    return sorted(files_info, key=lambda x: x['filename'])

# Explore both folders
folders = ['traveling_survey', 'zonal_register_data']
all_files = {}

for folder_name in folders:
    folder_path = rvx_path / folder_name
    print(f"\n{'='*80}")
    print(f"üìÅ {folder_name.upper()}")
    print(f"{'='*80}")
    
    files = get_folder_structure(folder_path)
    all_files[folder_name] = files
    
    print(f"\nTotal files: {len(files)}")
    print(f"\nFile listing:")
    print("-" * 100)
    
    for f in files:
        print(f"{f['filename']:<60} {f['size_mb']:>10.2f} MB  {f['extension']}")
    
    # Summary by extension
    by_ext = defaultdict(int)
    for f in files:
        by_ext[f['extension']] += 1
    
    print(f"\nBy extension:")
    for ext, count in sorted(by_ext.items()):
        print(f"  {ext if ext else '[no ext]':<15} {count:>3} files")


üìÅ TRAVELING_SURVEY

Total files: 8

File listing:
----------------------------------------------------------------------------------------------------
Filemail.com - Nasjonal RVU akkumulert data.zip                  228.44 MB  .zip
Filemail.com - RVU 2025.zip                                      128.01 MB  .zip
Nasjonal_RVU_PERSON_Nov26_0901.sav                                63.65 MB  .sav
Nasjonal_RVU_REISER_Nov26_0901.sav                                64.36 MB  .sav
Oppdatert skjema RVU_2025.docx                                     0.37 MB  .docx
RVU 2019-2024 Personfil Vektet 251125.sav                         93.56 MB  .sav
RVU 2019_2024 Reisefil 251107.sav                                134.88 MB  .sav
Sp√∏rreskjema_RVU_2021_2024.docx                                    0.20 MB  .docx

By extension:
  .docx             2 files
  .sav              4 files
  .zip              2 files

üìÅ ZONAL_REGISTER_DATA

Total files: 28

File listing:
-------------------------------------

## 2. Explore Data File Types

Understand the structure of different data formats (.dbf, .sav, .xlsx)

In [5]:
# Install required libraries for reading different formats
import subprocess
import sys

packages = ['pyreadstat', 'openpyxl', 'dbfread']

for package in packages:
    try:
        __import__(package)
        print(f"‚úì {package} already installed")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
        print(f"‚úì {package} installed")

‚úì pyreadstat already installed
‚úì openpyxl already installed
‚úì dbfread already installed


In [21]:
# Try reading different file types
import pyreadstat
from dbfread import DBF
import warnings
warnings.filterwarnings('ignore')

file_schemas = {}

# 1. DBF files (zonal_register_data)
print("="*80)
print("üìä DBF FILES (zonal_register_data)")
print("="*80)

dbf_files = [f for f in all_files['zonal_register_data'] if f['extension'] == '.dbf']
print(f"\nTotal DBF files: {len(dbf_files)}")

for dbf_file in dbf_files:
    key = dbf_file['relative_path']
    sample_path = dbf_file['full_path']
    
    try:
        table = DBF(sample_path, encoding='latin-1')
        columns_detail = [
            {
                'name': field.name,
                'type': field.type,
                'length': field.length,
                'decimals': field.decimal_count
            }
            for field in table.fields
        ]
        
        rows = len(table)
        file_schemas[key] = {
            'file_type': 'DBF',
            'rows': rows,
            'columns': len(columns_detail),
            'columns_detail': columns_detail
        }
    except Exception as e:
        file_schemas[key] = {
            'file_type': 'DBF',
            'error': str(e)
        }

print("‚úì DBF schema extraction complete")

üìä DBF FILES (zonal_register_data)

Total DBF files: 26
‚úì DBF schema extraction complete


In [22]:
# 2. SPSS/SAV files (traveling_survey)
print("\n" + "="*80)
print("üìä SPSS SAV FILES (traveling_survey)")
print("="*80)

sav_files = [f for f in all_files['traveling_survey'] if f['extension'].lower() == '.sav']
print(f"\nTotal SAV files: {len(sav_files)}")

for sav_file in sav_files:
    key = sav_file['relative_path']
    try:
        try:
            df, meta = pyreadstat.read_sav(sav_file['full_path'], row_limit=100)
            rows = meta.number_rows if hasattr(meta, 'number_rows') else len(df)
        except TypeError:
            df, meta = pyreadstat.read_sav(sav_file['full_path'])
            rows = len(df)
        
        columns_detail = [
            {'name': col, 'type': str(df[col].dtype)}
            for col in df.columns
        ]
        
        file_schemas[key] = {
            'file_type': 'SAV (SPSS)',
            'rows': rows,
            'columns': len(df.columns),
            'columns_detail': columns_detail
        }
    except Exception as e:
        file_schemas[key] = {
            'file_type': 'SAV (SPSS)',
            'error': str(e)
        }

print("‚úì SAV schema extraction complete")


üìä SPSS SAV FILES (traveling_survey)

Total SAV files: 4
‚úì SAV schema extraction complete


In [23]:
# 3. XLSX files
print("\n" + "="*80)
print("üìä EXCEL FILES (.xlsx)")
print("="*80)

from openpyxl import load_workbook

xlsx_files = [f for f in all_files['zonal_register_data'] if f['extension'].lower() == '.xlsx']
print(f"\nTotal XLSX files: {len(xlsx_files)}")

for xlsx_file in xlsx_files:
    key = xlsx_file['relative_path']
    try:
        wb = load_workbook(xlsx_file['full_path'], read_only=True, data_only=True)
        sheets_info = []
        
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True), [])
            header = [str(h) if h is not None else '' for h in header_row]
            
            sheets_info.append({
                'sheet': sheet_name,
                'rows': ws.max_row,
                'columns': len(header),
                'column_names': header
            })
        
        file_schemas[key] = {
            'file_type': 'XLSX',
            'sheets': sheets_info
        }
    except Exception as e:
        file_schemas[key] = {
            'file_type': 'XLSX',
            'error': str(e)
        }

print("‚úì XLSX schema extraction complete")


üìä EXCEL FILES (.xlsx)

Total XLSX files: 2
‚úì XLSX schema extraction complete


In [24]:
# 4. ZIP and other archives
print("\n" + "="*80)
print("üì¶ ARCHIVES")
print("="*80)

zip_files = [f for f in all_files['traveling_survey'] if f['extension'].lower() == '.zip']
print(f"\nTotal ZIP files: {len(zip_files)}")

import zipfile
for zip_file in zip_files:
    print(f"\nüîç {zip_file['filename']} ({zip_file['size_mb']:.2f} MB)")
    try:
        with zipfile.ZipFile(zip_file['full_path'], 'r') as z:
            file_list = z.namelist()
            print(f"  Contains {len(file_list)} files:")
            for fname in sorted(file_list)[:10]:  # First 10
                info = z.getinfo(fname)
                size_mb = info.file_size / (1024*1024)
                print(f"    - {fname:<50} {size_mb:>8.2f} MB")
            if len(file_list) > 10:
                print(f"    ... and {len(file_list) - 10} more files")
    except Exception as e:
        print(f"  ‚ö†Ô∏è Could not read: {e}")


üì¶ ARCHIVES

Total ZIP files: 2

üîç Filemail.com - Nasjonal RVU akkumulert data.zip (228.44 MB)
  Contains 2 files:
    - RVU 2019-2024 Personfil Vektet 251125.sav             93.56 MB
    - RVU 2019_2024 Reisefil 251107.sav                    134.88 MB

üîç Filemail.com - RVU 2025.zip (128.01 MB)
  Contains 2 files:
    - Nasjonal_RVU_PERSON_Nov26_0901.sav                    63.65 MB
    - Nasjonal_RVU_REISER_Nov26_0901.sav                    64.36 MB


## 3. Generate Data Catalog Markdown

Create a comprehensive markdown document of all datasets.

In [25]:
# Generate comprehensive markdown catalog
catalog_md = f"""# RVX Data Catalog

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Complete inventory and documentation of all datasets in the RVX folders.

## Overview

The RVX folder contains two main data sources:

1. **traveling_survey/** - National travel survey (RVU - Reisevaneunders√∏kelsen)
2. **zonal_register_data/** - Zonal statistical data (SDAT files) from Statistics Norway

## Folder Structure

"""

# Add folder summaries
for folder_name in folders:
    files = all_files[folder_name]
    total_size = sum(f['size_mb'] for f in files)
    
    catalog_md += f"\n### {folder_name}/\n\n"
    
    if folder_name == 'traveling_survey':
        catalog_md += """**National Travel Survey Data (RVU)**

Contains survey responses about travel behavior of Norwegian households.
- **Format:** SPSS (.sav), ZIP archives, documentation (.docx)
- **Source:** Statistics Norway (SSB)
- **Coverage:** Years 2019-2025

"""
    else:
        catalog_md += """**Zonal Statistical Data (SDAT)**

Grid-based statistical data at different geographic resolutions.
- **Format:** DBF (dBase), XLSX
- **Source:** Statistics Norway (TRAMOD/RVX)
- **Coverage:** Multiple grid resolutions (grunnkrets, delomr, etc.)
- **Data years:** 2020-2024

"""
    
    catalog_md += f"**Statistics:**\n"
    catalog_md += f"- Total files: {len(files)}\n"
    catalog_md += f"- Total size: {total_size:.2f} MB\n\n"
    
    # List files
    catalog_md += "**Files:**\n\n"
    catalog_md += "| Filename | Size (MB) | Type | Rows | Columns |\n"
    catalog_md += "|----------|-----------|------|------|---------|\n"
    
    for f in files:
        schema = file_schemas.get(f['relative_path'], {})
        rows = schema.get('rows', 'n/a')
        cols = schema.get('columns', 'n/a')
        catalog_md += f"| `{f['relative_path']}` | {f['size_mb']:.2f} | {f['extension'] or 'dir'} | {rows} | {cols} |\n"
    
    catalog_md += "\n"

# Add detailed schema section
catalog_md += "## Dataset Schemas\n\n"

for folder_name in folders:
    catalog_md += f"### {folder_name}/\n\n"
    files = all_files[folder_name]
    
    for f in files:
        key = f['relative_path']
        schema = file_schemas.get(key)
        if not schema:
            continue
        
        catalog_md += f"#### {f['filename']}\n\n"
        catalog_md += f"- **Path:** `{key}`\n"
        catalog_md += f"- **Type:** {schema.get('file_type', 'Unknown')}\n"
        
        if 'error' in schema:
            catalog_md += f"- **Error:** {schema['error']}\n\n"
            continue
        
        if schema.get('file_type') == 'XLSX':
            catalog_md += f"- **Sheets:** {len(schema.get('sheets', []))}\n\n"
            for sheet in schema.get('sheets', []):
                catalog_md += f"  - **Sheet:** {sheet['sheet']}\n"
                catalog_md += f"    - Rows: {sheet['rows']}\n"
                catalog_md += f"    - Columns: {sheet['columns']}\n"
                catalog_md += f"    - Column names: {', '.join(sheet['column_names'])}\n"
            catalog_md += "\n"
        else:
            catalog_md += f"- **Rows:** {schema.get('rows', 'n/a')}\n"
            catalog_md += f"- **Columns:** {schema.get('columns', 'n/a')}\n\n"
            
            columns_detail = schema.get('columns_detail', [])
            if columns_detail:
                catalog_md += "**Column details:**\n\n"
                catalog_md += "| Column | Type |\n"
                catalog_md += "|--------|------|\n"
                for col in columns_detail:
                    col_name = col.get('name', '')
                    col_type = col.get('type', '')
                    catalog_md += f"| {col_name} | {col_type} |\n"
                catalog_md += "\n"

print("Generated catalog (first 2000 chars):")
print(catalog_md[:2000])

Generated catalog (first 2000 chars):
# RVX Data Catalog

**Generated:** 2026-02-02 16:34:31

Complete inventory and documentation of all datasets in the RVX folders.

## Overview

The RVX folder contains two main data sources:

1. **traveling_survey/** - National travel survey (RVU - Reisevaneunders√∏kelsen)
2. **zonal_register_data/** - Zonal statistical data (SDAT files) from Statistics Norway

## Folder Structure


### traveling_survey/

**National Travel Survey Data (RVU)**

Contains survey responses about travel behavior of Norwegian households.
- **Format:** SPSS (.sav), ZIP archives, documentation (.docx)
- **Source:** Statistics Norway (SSB)
- **Coverage:** Years 2019-2025

**Statistics:**
- Total files: 8
- Total size: 713.47 MB

**Files:**

| Filename | Size (MB) | Type | Rows | Columns |
|----------|-----------|------|------|---------|
| `Filemail.com - Nasjonal RVU akkumulert data.zip` | 228.44 | .zip | n/a | n/a |
| `Filemail.com - RVU 2025.zip` | 128.01 | .zip | n/a | n/

In [26]:
# Save the catalog markdown
catalog_path = project_root / 'docs' / 'DATA_CATALOG_RVX.md'
catalog_path.parent.mkdir(parents=True, exist_ok=True)

with open(catalog_path, 'w') as f:
    f.write(catalog_md)

print(f"‚úì Saved catalog to: {catalog_path}")
print(f"\nFile size: {catalog_path.stat().st_size / 1024:.2f} KB")

‚úì Saved catalog to: /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/docs/DATA_CATALOG_RVX.md

File size: 66.75 KB


## 4. Summary

Check what we've discovered

In [27]:
print("\n" + "="*80)
print("DATA CATALOG SUMMARY")
print("="*80)

for folder_name in folders:
    files = all_files[folder_name]
    total_size = sum(f['size_mb'] for f in files)
    
    print(f"\nüìÅ {folder_name}:")
    print(f"   Files: {len(files)}")
    print(f"   Total size: {total_size:.2f} MB")
    
    by_ext = defaultdict(int)
    for f in files:
        by_ext[f['extension']] += 1
    
    # Build type summary string
    type_summary = ', '.join(f"{ext or '[none]'}({c})" for ext, c in sorted(by_ext.items()))
    print(f"   Types: {type_summary}")

print(f"\n‚úÖ Full catalog saved to: {catalog_path}")


DATA CATALOG SUMMARY

üìÅ traveling_survey:
   Files: 8
   Total size: 713.47 MB
   Types: .docx(2), .sav(4), .zip(2)

üìÅ zonal_register_data:
   Files: 28
   Total size: 79.42 MB
   Types: .dbf(26), .xlsx(2)

‚úÖ Full catalog saved to: /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/docs/DATA_CATALOG_RVX.md


## 5. Create Pretty Outputs

Convert the markdown catalog to HTML and PDF formats for viewing/printing.

In [31]:
# 1. Preview in notebook (renders the markdown beautifully)
from IPython.display import Markdown, display

print("üìñ Rendering markdown preview...")
# display(Markdown(catalog_md))

üìñ Rendering markdown preview...


In [29]:
# 2. Create styled HTML version (beautiful webpage)
# Convert markdown to HTML first
import markdown
html_content = markdown.markdown(catalog_md, extensions=['tables'])

# Create full HTML page with styling (using f-string to avoid format conflicts)
full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>RVX Data Catalog</title>
    <style>
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
            line-height: 1.6;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }}
        .container {{
            background-color: white;
            padding: 40px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }}
        h1 {{
            color: #2c3e50;
            border-bottom: 3px solid #3498db;
            padding-bottom: 10px;
        }}
        h2 {{
            color: #34495e;
            margin-top: 30px;
            border-bottom: 2px solid #ecf0f1;
            padding-bottom: 8px;
        }}
        h3 {{
            color: #7f8c8d;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background-color: white;
        }}
        th {{
            background-color: #3498db;
            color: white;
            padding: 12px;
            text-align: left;
            font-weight: 600;
        }}
        td {{
            padding: 10px 12px;
            border-bottom: 1px solid #ecf0f1;
        }}
        tr:hover {{
            background-color: #f8f9fa;
        }}
        code {{
            background-color: #f4f4f4;
            padding: 2px 6px;
            border-radius: 3px;
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
        }}
        ul {{
            padding-left: 25px;
        }}
        li {{
            margin: 8px 0;
        }}
        .generated-date {{
            color: #95a5a6;
            font-style: italic;
        }}
    </style>
</head>
<body>
    <div class="container">
        {html_content}
    </div>
</body>
</html>
"""

# Save HTML file
html_path = project_root / 'docs' / 'DATA_CATALOG_RVX.html'
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(full_html)

print(f"‚úÖ Saved styled HTML to: {html_path}")
print(f"   Open in browser: file://{html_path}")
print(f"   File size: {html_path.stat().st_size / 1024:.2f} KB")

‚úÖ Saved styled HTML to: /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/docs/DATA_CATALOG_RVX.html
   Open in browser: file:///Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/docs/DATA_CATALOG_RVX.html
   File size: 103.71 KB


In [30]:
# 3. Create PDF version (for printing)
print("\nüìÑ Creating PDF version...")

try:
    # Check if markdown package is available
    try:
        import markdown
        print("‚úì markdown package available")
    except ImportError:
        print("Installing markdown...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "markdown", "-q"])
        import markdown
        print("‚úì markdown installed")
    
    # Try using weasyprint (best option for HTML‚ÜíPDF)
    try:
        from weasyprint import HTML
        
        pdf_path = project_root / 'docs' / 'DATA_CATALOG_RVX.pdf'
        HTML(string=full_html).write_pdf(pdf_path)
        
        print(f"‚úÖ Saved PDF to: {pdf_path}")
        print(f"   File size: {pdf_path.stat().st_size / 1024:.2f} KB")
        print(f"   Ready to print!")
        
    except ImportError:
        print("‚ö†Ô∏è  WeasyPrint not available - trying alternative...")
        
        # Alternative: Use pdfkit (requires wkhtmltopdf installed)
        try:
            import pdfkit
            pdf_path = project_root / 'docs' / 'DATA_CATALOG_RVX.pdf'
            pdfkit.from_string(full_html, str(pdf_path))
            print(f"‚úÖ Saved PDF to: {pdf_path}")
        except:
            print("‚ö†Ô∏è  PDF generation requires additional tools.")
            print("   Install with: pip install weasyprint")
            print("   Or use: pip install pdfkit + install wkhtmltopdf")
            print("   For now, use the HTML version for printing (browser ‚Üí Print ‚Üí Save as PDF)")
            
except Exception as e:
    print(f"‚ö†Ô∏è  PDF creation skipped: {e}")
    print("   You can print the HTML version from your browser (Cmd+P ‚Üí Save as PDF)")


üìÑ Creating PDF version...
‚úì markdown package available
‚ö†Ô∏è  WeasyPrint not available - trying alternative...
‚ö†Ô∏è  PDF generation requires additional tools.
   Install with: pip install weasyprint
   Or use: pip install pdfkit + install wkhtmltopdf
   For now, use the HTML version for printing (browser ‚Üí Print ‚Üí Save as PDF)


In [32]:
# 4. Summary of outputs
print("\n" + "="*80)
print("üìö DATA CATALOG OUTPUTS")
print("="*80)

outputs = [
    ("Markdown", catalog_path, "Edit and version control"),
    ("HTML", html_path, "Open in browser, share online"),
]

# Check if PDF was created
pdf_path = project_root / 'docs' / 'DATA_CATALOG_RVX.pdf'
if pdf_path.exists():
    outputs.append(("PDF", pdf_path, "Print or share as document"))

for format_name, path, use_case in outputs:
    size_kb = path.stat().st_size / 1024
    print(f"\n{format_name}:")
    print(f"  üìç {path}")
    print(f"  üìä {size_kb:.2f} KB")
    print(f"  üí° {use_case}")

print("\n" + "="*80)
print("üéâ All done! You have a complete data catalog ready to use.")
print("="*80)


üìö DATA CATALOG OUTPUTS

Markdown:
  üìç /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/docs/DATA_CATALOG_RVX.md
  üìä 66.82 KB
  üí° Edit and version control

HTML:
  üìç /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/docs/DATA_CATALOG_RVX.html
  üìä 103.71 KB
  üí° Open in browser, share online

PDF:
  üìç /Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/docs/DATA_CATALOG_RVX.pdf
  üìä 156.26 KB
  üí° Print or share as document

üéâ All done! You have a complete data catalog ready to use.


## 6. Interactive Data Inspector

Browse through DBF files to verify schema and content.

In [33]:
# List all available DBF files
print("="*80)
print("üìã AVAILABLE DBF FILES")
print("="*80)
print("\nChange the file_index variable below to inspect different files:\n")

dbf_file_list = [f for f in all_files['zonal_register_data'] if f['extension'] == '.dbf']

for idx, f in enumerate(dbf_file_list):
    print(f"{idx:2d}. {f['filename']:<50} ({f['size_mb']:.2f} MB, {file_schemas.get(f['relative_path'], {}).get('rows', '?')} rows)")

print(f"\nTotal: {len(dbf_file_list)} DBF files")

üìã AVAILABLE DBF FILES

Change the file_index variable below to inspect different files:

 0. sdat1_d2024_g2020.dbf                              (4.42 MB, 14097 rows)
 1. sdat1_d2024_g2021.dbf                              (4.42 MB, 14097 rows)
 2. sdat1_d2024_g2023.dbf                              (4.43 MB, 14101 rows)
 3. sdat1_d2024_g2024.dbf                              (4.43 MB, 14101 rows)
 4. sdat3_d2023x_g2020.dbf                             (2.38 MB, 14097 rows)
 5. sdat3_d2023x_g2021.dbf                             (2.38 MB, 14097 rows)
 6. sdat3_d2023x_g2023.dbf                             (2.38 MB, 14101 rows)
 7. sdat3_d2023x_g2024.dbf                             (2.38 MB, 14101 rows)
 8. sdat4_d2024_g2020.dbf                              (2.60 MB, 14097 rows)
 9. sdat4_d2024_g2021.dbf                              (2.60 MB, 14097 rows)
10. sdat4_d2024_g2023.dbf                              (2.60 MB, 14101 rows)
11. sdat4_d2024_g2024.dbf                              (2.60 

In [56]:
# Change this number to inspect different files (0 to 25)
file_index = 1

# Load and display
selected = dbf_file_list[file_index]
print(selected)
table = DBF(selected['full_path'], encoding='latin-1')
df = pd.DataFrame(list(table))

print(f"{selected['filename']} ({selected['size_mb']:.2f} MB)\n")
print(f"{df.shape[0]} rows x {df.shape[1]} columns\n")

# Generic schema check: column names and DBF types (2-row table)
row_names = [field.name for field in table.fields]
row_types = [field.type for field in table.fields]

schema_check = pd.DataFrame([row_names, row_types], index=["Column", "DBF Type"])
print("Schema check (columns + DBF type):")
display(schema_check)

# Raw preview
df.head(6)

{'filename': 'sdat1_d2024_g2021.dbf', 'relative_path': 'sdat1_d2024_g2021.dbf', 'full_path': '/Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/data/raw/population/rvx/zonal_register_data/sdat1_d2024_g2021.dbf', 'size_bytes': 4639259, 'size_mb': 4.42, 'extension': '.dbf'}
sdat1_d2024_g2021.dbf (4.42 MB)

14097 rows x 41 columns

Schema check (columns + DBF type):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
Column,GRUNNKRETS,M_0_4,M_5_9,M_10_14,M_15_19,M_20_24,M_25_29,M_30_34,M_35_39,M_40_44,...,K_50_54,K_55_59,K_60_64,K_65_69,K_70_74,K_75_79,K_80_84,K_85_89,K_90_94,K_95_UP
DBF Type,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N


Unnamed: 0,GRUNNKRETS,M_0_4,M_5_9,M_10_14,M_15_19,M_20_24,M_25_29,M_30_34,M_35_39,M_40_44,...,K_50_54,K_55_59,K_60_64,K_65_69,K_70_74,K_75_79,K_80_84,K_85_89,K_90_94,K_95_UP
0,3010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3010102,3.0,0.0,0.0,0.0,15.0,39.0,27.0,29.0,15.0,...,4.0,3.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0
2,3010103,0.0,0.0,0.0,0.0,3.0,5.0,6.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3010104,3.0,0.0,0.0,3.0,45.0,78.0,42.0,29.0,12.0,...,3.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
4,3010105,4.0,3.0,0.0,4.0,26.0,66.0,55.0,44.0,20.0,...,9.0,12.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0
5,3010201,3.0,0.0,0.0,3.0,31.0,75.0,53.0,26.0,11.0,...,5.0,3.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
dbf_file_list


[{'filename': 'sdat1_d2024_g2020.dbf',
  'relative_path': 'sdat1_d2024_g2020.dbf',
  'full_path': '/Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/data/raw/population/rvx/zonal_register_data/sdat1_d2024_g2020.dbf',
  'size_bytes': 4639259,
  'size_mb': 4.42,
  'extension': '.dbf'},
 {'filename': 'sdat1_d2024_g2021.dbf',
  'relative_path': 'sdat1_d2024_g2021.dbf',
  'full_path': '/Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/data/raw/population/rvx/zonal_register_data/sdat1_d2024_g2021.dbf',
  'size_bytes': 4639259,
  'size_mb': 4.42,
  'extension': '.dbf'},
 {'filename': 'sdat1_d2024_g2023.dbf',
  'relative_path': 'sdat1_d2024_g2023.dbf',
  'full_path': '/Users/anderskielland/Documents/Synthetic data/code/synthetic-lab/data/raw/population/rvx/zonal_register_data/sdat1_d2024_g2023.dbf',
  'size_bytes': 4640575,
  'size_mb': 4.43,
  'extension': '.dbf'},
 {'filename': 'sdat1_d2024_g2024.dbf',
  'relative_path': 'sdat1_d2024_g2024.dbf',
  'full_path':