In [4]:
import os
from pathlib import Path

from dbfread import DBF

# Root of the synthetic-lab checkout. Set the SYNTHETIC_LAB_ROOT environment
# variable to run this notebook against another checkout or machine; when the
# variable is unset (or empty) the original location is used.
DEFAULT_PROJECT_ROOT = (
    "/Users/anderskielland/Documents/Synthetic data/code/synthetic-lab"
)
PROJECT_ROOT = Path(os.environ.get("SYNTHETIC_LAB_ROOT") or DEFAULT_PROJECT_ROOT)

# Path to the raw DBF file, relative to the project root
data_path = PROJECT_ROOT / "data/raw/population/tramod/sdat3_d2023x_g2020.dbf"

# Load DBF (no records loaded yet)
table = DBF(data_path, load=False)

# --- Metadata ---
# Summary of the file taken from the table object: file name, record count
# from the header, the encoding used for text, and one entry per field
# describing its name, type code, width and number of decimals.
metadata = {
    "file_name": data_path.name,
    "num_records": table.header.numrecords,
    "encoding": table.encoding,
    "fields": [
        {
            "name": field.name,
            "type": field.type,
            "length": field.length,
            "decimal_count": field.decimal_count,
        }
        for field in table.fields
    ],
}

# Bare expression so the notebook renders the dictionary as the cell output
metadata


{'file_name': 'sdat3_d2023x_g2020.dbf',
 'num_records': 14097,
 'encoding': 'ascii',
 'fields': [{'name': 'GRUNNKRETS',
   'type': 'N',
   'length': 8,
   'decimal_count': 0},
  {'name': 'SYBLU', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYBMU', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYBHU', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYB', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYALU', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYAMU', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYAHU', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYA1524', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYA2534', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYA3554', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYA5566', 'type': 'N', 'length': 10, 'decimal_count': 4},
  {'name': 'SYA67UP', 'type': 'N', 'length': 10, 'decimal_count': 4},


In [5]:
import pandas as pd

# Read every record of the DBF file into a pandas DataFrame
df = pd.DataFrame(iter(DBF(data_path)))

# Profile the INNT_IDX column; bind it once so each check reads the same Series
innt_idx = df['INNT_IDX']

print("=== INNT_IDX Field Analysis ===\n")
print(f"Data type: {innt_idx.dtype}")
print(f"Total records: {len(df)}")
print(f"Unique values: {innt_idx.nunique()}")
print(f"Missing values: {innt_idx.isna().sum()}\n")

# The 20 most frequent values, each heading on its own line above the table
print("Value counts:", innt_idx.value_counts().head(20), sep="\n")

# Summary statistics as computed by describe()
print("\nBasic statistics:", innt_idx.describe(), sep="\n")


=== INNT_IDX Field Analysis ===

Data type: float64
Total records: 14097
Unique values: 72
Missing values: 0

Value counts:
INNT_IDX
0.91    773
0.96    740
0.97    734
0.94    660
0.93    658
0.95    625
0.92    583
0.90    567
0.98    545
0.99    540
0.86    500
0.88    474
1.03    468
0.89    454
0.87    439
1.00    413
1.01    375
1.02    323
1.04    309
0.85    284
Name: count, dtype: int64

Basic statistics:
count    14097.000000
mean         0.970969
std          0.106760
min          0.000000
25%          0.900000
50%          0.960000
75%          1.020000
max          1.500000
Name: INNT_IDX, dtype: float64


In [6]:
# Primary statistics for INNT_IDX
innt_idx = df['INNT_IDX']
innt_idx_stats = {
    # Stored as a plain int so it is reported as a whole number below
    "Count": int(innt_idx.count()),
    "Min": innt_idx.min(),
    "Max": innt_idx.max(),
    "Mean": innt_idx.mean(),
    "Median": innt_idx.median(),
    "Std Dev": innt_idx.std(),
    "25th Percentile": innt_idx.quantile(0.25),
    "75th Percentile": innt_idx.quantile(0.75),
}

print("=== Primary Statistics for INNT_IDX ===\n")
for key, value in innt_idx_stats.items():
    # Integer statistics (the record count) are shown without decimals;
    # every other statistic keeps two decimals. Both are right-aligned in
    # the same 15-character column.
    number_format = ",d" if isinstance(value, int) else ",.2f"
    print(f"{key:20} {format(value, number_format):>15}")


=== Primary Statistics for INNT_IDX ===

Count                      14,097.00
Min                             0.00
Max                             1.50
Mean                            0.97
Median                          0.96
Std Dev                         0.11
25th Percentile                 0.90
75th Percentile                 1.02
