In [0]:
from pyspark.sql import SparkSession
import pandas as pd
from clinical_codes import map, load_map
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Any, Optional, Union
import random
from logger import get_logger

In [0]:
# Module-level logger from the project's `logger` helper; MedicareClaimsDataset
# below reports load progress, warnings and errors through it.
logger = get_logger()
# Obtain the active Spark session, creating one if none exists.
# NOTE(review): `spark` is not referenced by any other cell visible here - confirm
# whether it is still needed.
spark = SparkSession.builder.getOrCreate()

In [0]:
@dataclass
class ClaimLine:
    """Represents a single claim line in Medicare claims data.

    Besides the declared fields, ``MedicareClaimsDataset`` attaches an
    ``associated_claim_lines`` attribute (the other lines of the same claim)
    to each instance after loading. It is deliberately *not* a dataclass
    field, so it takes no part in equality or in ``dataclasses.asdict``.
    """
    claim_id: str
    service_line_num: int
    procedure_code: str
    procedure_description: str
    modifiers: List[str]
    modifier_descriptions: List[str]
    diagnosis_codes: List[str]
    diagnosis_codes_descriptions: List[str]
    amount: float
    provider_specialty: str
    provider_specialty_description: str

    date_of_service: str
    place_of_service: str
    approval_status: str = "UNKNOWN"  # "APPROVED", "DENIED" or "UNKNOWN"
    revenue_code: Optional[str] = None  # None when the source row carries no revenue code

    # The constants below are intentionally un-annotated so that @dataclass
    # treats them as plain class attributes rather than instance fields.
    _AMBULANCE_HCPCS = frozenset({
        "A0426", "A0427", "A0428", "A0429", "A0430", "A0431",
        "A0432", "A0433", "A0434", "A0380", "A0390", "A0435", "A0436",
        "A0030", "A0040", "A0050", "A0320", "A0322", "A0324",
        "A0326", "A0328", "A0330",
    })
    _VALID_AMBULANCE_REVENUE_CODES = frozenset({"0540", "0542", "0543", "0545", "0546", "0548"})
    _INVALID_AMBULANCE_REVENUE_CODES = frozenset({"0541", "0544", "0547"})
    _ANATOMIC_MODIFIERS = frozenset({"RT", "LT"})
    _BYPASS_MODIFIERS = frozenset({"GX", "GY", "GZ", "RB"})

    def _normalized_revenue_code(self) -> Optional[str]:
        """Return the revenue code as a zero-padded 4-character string.

        Codes that went through numeric parsing lose their leading zero and
        may gain a ``.0`` suffix ("0540" -> "540" / "540.0"); both forms are
        mapped back to "0540". Non-numeric values are returned stripped but
        otherwise unchanged. Returns None when no revenue code is set.
        """
        if self.revenue_code is None:
            return None
        code = str(self.revenue_code).strip()
        if code.endswith(".0"):
            code = code[:-2]
        if code.isdigit():
            code = code.zfill(4)
        return code

    @property
    def is_ambulance_claim(self) -> bool:
        """Check if claim is ambulance claim."""
        return self.procedure_code in self._AMBULANCE_HCPCS

    @property
    def has_valid_ambulance_revenue_code(self) -> bool:
        """Check if revenue code is valid for ambulance services.

        Returns False when no revenue code is present.
        """
        code = self._normalized_revenue_code()

        if code in self._INVALID_AMBULANCE_REVENUE_CODES:
            return False

        # For claims after Jan 1, 2001, only 0540 is valid.
        # NOTE(review): this is a plain string comparison, so it is only
        # meaningful when date_of_service is formatted as YYYY-MM-DD.
        if self.date_of_service > "2001-01-01" and code != "0540":
            return False

        return code in self._VALID_AMBULANCE_REVENUE_CODES

    @property
    def has_death_pronouncement_modifier(self) -> bool:
        """Check if this claim has the QL modifier for death pronouncement."""
        return "QL" in self.modifiers

    @property
    def has_anatomic_modifiers(self) -> bool:
        """Check if claim has anatomic modifiers (RT, LT)."""
        return any(mod in self._ANATOMIC_MODIFIERS for mod in self.modifiers)

    @property
    def has_bypass_modifiers(self) -> bool:
        """Check if claim has bypass modifiers (GX, GY, GZ, RB)."""
        return any(mod in self._BYPASS_MODIFIERS for mod in self.modifiers)

    @property
    def has_kx_modifier(self) -> bool:
        """Check if claim has KX modifier which can override certain edits."""
        return "KX" in self.modifiers

    @property
    def has_59_modifier(self) -> bool:
        """Check if claim has modifier 59 which indicates distinct procedural service."""
        return "59" in self.modifiers

    def has_bypass_modifier_for_ncci(self) -> bool:
        """Check if claim has any modifier that can bypass NCCI edits.

        The bypass list is currently empty, so this always returns False.
        Unlike the checks above this is a regular method, not a property.
        """
        # Common modifiers that can bypass NCCI edits
        # bypass_modifiers = ["59", "XE", "XP", "XS", "XU", "25", "91"]
        bypass_modifiers = []
        return any(mod in bypass_modifiers for mod in self.modifiers)

    def to_prompt_format(self) -> str:
        """Format claim data for inclusion in a prompt.

        Returns:
            A multi-line string, one "Label: value" pair per line.
        """
        return f"""Claim ID: {self.claim_id}
Date of Service: {self.date_of_service}
Provider Type: {self.provider_specialty}
Procedure Codes: {self.procedure_code}
Modifiers: {', '.join(self.modifiers) if self.modifiers else 'None'}
Diagnosis Codes: {', '.join(self.diagnosis_codes)}
Place of Service: {self.place_of_service}
Amount: ${self.amount:.2f}"""


In [0]:
class MedicareClaimsDataset:
    """Handles loading and preprocessing of Medicare claims data.

    Constructing an instance loads the four code-description lookups via
    ``load_map`` and then parses the file at ``data_path`` into ``ClaimLine``
    objects, which are stored in ``self.claims``. After loading, every claim
    line gets an ``associated_claim_lines`` attribute holding the other lines
    that share its claim ID.
    """

    def __init__(self, data_path: str):
        """Initialize the dataset with a path to the data file.

        Args:
            data_path: Path to a .csv, .json, .parquet, .xlsx or .xls file.

        Raises:
            ValueError: If the file extension is not supported.
        """
        self.data_path = data_path
        self.claims = []

        self.diagnosis_mappings = load_map("DGNS.csv")
        self.provider_mappings = load_map("PRVDR.csv")
        self.hcpcs_mappings = load_map("HCPCS.csv")
        self.modifier_mappings = load_map("MODIFIERS.csv")
        self.load_data()

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _first_present(row, candidates):
        """Return the first non-null value of ``row`` among ``candidates``, else None."""
        for name in candidates:
            if name in row.index and pd.notna(row[name]):
                return row[name]
        return None

    @staticmethod
    def _normalize_status(value) -> str:
        """Map a raw status cell to "APPROVED", "DENIED" or "UNKNOWN"."""
        if value is None or pd.isna(value):
            return "UNKNOWN"
        status = str(value).strip().upper()
        return status if status in ("APPROVED", "DENIED") else "UNKNOWN"

    @staticmethod
    def _split_codes(value) -> List[str]:
        """Turn a modifiers / diagnosis cell into a list of code strings.

        Accepts None/NaN (-> []), a comma separated string, a list-like of
        codes, or any other scalar (-> single-element list). Blank entries
        are dropped.
        """
        if value is None:
            return []
        # List-likes must be handled before pd.isna, which would return an
        # array for them and make the truth test ambiguous.
        if not pd.api.types.is_scalar(value):
            return [str(item).strip() for item in value
                    if pd.notna(item) and str(item).strip()]
        if pd.isna(value):
            return []
        if isinstance(value, str):
            return [part.strip() for part in value.split(',') if part.strip()]
        return [str(value)]

    def _make_claim_line(self, *, claim_id, service_line_num, procedure_code,
                         modifiers, diagnosis_codes, amount, provider_specialty,
                         date_of_service, place_of_service,
                         approval_status="UNKNOWN", revenue_code=None) -> "ClaimLine":
        """Build a ClaimLine, filling the description fields from the lookups.

        Centralising construction keeps the keyword names identical across
        all loaders.
        """
        return ClaimLine(
            claim_id=claim_id,
            service_line_num=service_line_num,
            procedure_code=procedure_code,
            procedure_description=map(procedure_code, self.hcpcs_mappings),
            modifiers=modifiers,
            modifier_descriptions=map(modifiers, self.modifier_mappings),
            diagnosis_codes=diagnosis_codes,
            diagnosis_codes_descriptions=map(diagnosis_codes, self.diagnosis_mappings),
            amount=amount,
            provider_specialty=provider_specialty,
            provider_specialty_description=map(provider_specialty, self.provider_mappings),
            date_of_service=date_of_service,
            place_of_service=place_of_service,
            approval_status=approval_status,
            revenue_code=revenue_code,
        )

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    def load_data(self):
        """Load claims data from various possible formats.

        Dispatches on the file extension of ``self.data_path`` and then links
        claim lines that share a claim ID.

        Raises:
            ValueError: If the file extension is not supported.
            Exception: Any loader error is logged and re-raised unchanged.
        """
        logger.info(f"Loading claims data from {self.data_path}")

        file_extension = Path(self.data_path).suffix.lower()

        try:
            if file_extension == '.csv':
                self._load_from_csv()
            elif file_extension == '.json':
                self._load_from_json()
            elif file_extension == '.parquet':
                self._load_from_parquet()
            elif file_extension in ['.xlsx', '.xls']:
                self._load_from_excel()
            else:
                raise ValueError(f"Unsupported file format: {file_extension}")

            # Group claim lines by claim ID after loading
            self._associate_claim_lines()

            logger.info(f"Successfully loaded {len(self.claims)} claim lines")
        except Exception as e:
            logger.error(f"Error loading claims data: {str(e)}")
            raise

    def _associate_claim_lines(self):
        """Associate claim lines with the same claim ID with each other.

        Sets ``associated_claim_lines`` on every claim to the list of the
        *other* lines sharing its claim ID, then prints summary statistics.
        """
        claims_by_id = {}
        for claim in self.claims:
            claims_by_id.setdefault(claim.claim_id, []).append(claim)

        # Identity, not equality: dataclass equality compares field values, so
        # two lines with identical content would otherwise hide each other.
        for claim in self.claims:
            claim.associated_claim_lines = [
                other for other in claims_by_id[claim.claim_id] if other is not claim
            ]

        # Print statistics about associated claim lines
        claims_with_associates = sum(1 for c in self.claims if c.associated_claim_lines)
        avg_associates = (
            sum(len(c.associated_claim_lines) for c in self.claims) / len(self.claims)
            if self.claims else 0
        )
        print(f"Claims with associated lines: {claims_with_associates}")
        print(f"Average associated lines per claim: {avg_associates:.2f}")

    def load_ambulance_csv(self, file_path: str):
        """Load ambulance claims data with specific structure.

        Reads the columns ``claim_id``, ``hcpcs``, ``mod1``-``mod4``,
        ``line_num``, ``charge``, ``service_date`` and ``rev_code`` (all but
        ``claim_id`` optional), appends one ClaimLine per row that has a
        procedure code to ``self.claims`` and prints summary statistics.

        Args:
            file_path: Path of the ambulance CSV file.
        """
        print(f"Loading ambulance data from {file_path}")
        # dtype=str keeps codes verbatim (leading zeros, no ".0" suffix).
        df = pd.read_csv(file_path, dtype=str)

        # Group by claim ID
        claims_grouped = df.groupby('claim_id')
        print(f"Found {len(claims_grouped)} unique claims")

        for claim_id, group in claims_grouped:
            claim_lines = []

            for _, row in group.iterrows():
                # Skip rows without a procedure code (HCPCS); the null check
                # has to happen before the value is converted to a string.
                raw_code = self._first_present(row, ['hcpcs'])
                if raw_code is None or not str(raw_code).strip():
                    continue
                procedure_code = str(raw_code)

                modifiers = [
                    str(row[mod_col])
                    for mod_col in ['mod1', 'mod2', 'mod3', 'mod4']
                    if mod_col in row.index and pd.notna(row[mod_col]) and row[mod_col]
                ]

                raw_line_num = self._first_present(row, ['line_num'])
                raw_charge = self._first_present(row, ['charge'])
                raw_date = self._first_present(row, ['service_date'])
                raw_rev = self._first_present(row, ['rev_code'])

                claim_lines.append(self._make_claim_line(
                    claim_id=str(claim_id),
                    service_line_num=int(float(raw_line_num)) if raw_line_num is not None else 0,
                    procedure_code=procedure_code,
                    modifiers=modifiers,
                    diagnosis_codes=[],
                    amount=float(raw_charge) if raw_charge is not None else 0.0,
                    provider_specialty="Ambulance",
                    date_of_service=str(raw_date) if raw_date is not None else "2025-01-01",
                    place_of_service="",
                    approval_status="APPROVED",  # Default for testing
                    revenue_code=str(raw_rev) if raw_rev is not None else None,
                ))

            # Associate claim lines with each other (by identity, see
            # _associate_claim_lines)
            for claim_line in claim_lines:
                claim_line.associated_claim_lines = [
                    other for other in claim_lines if other is not claim_line
                ]
                self.claims.append(claim_line)

        # Print statistics
        ambulance_codes = ["A0426", "A0427", "A0428", "A0429", "A0430", "A0431", "A0432", "A0433", "A0434"]
        mileage_codes = ["A0380", "A0390", "A0435", "A0436", "A0425"]

        transport_claims = sum(1 for c in self.claims if c.procedure_code in ambulance_codes)
        mileage_claims = sum(1 for c in self.claims if c.procedure_code in mileage_codes)

        print(f"Loaded {len(self.claims)} total claim lines")
        print(f"  - Transport code lines: {transport_claims}")
        print(f"  - Mileage code lines: {mileage_claims}")
        print(f"  - Claims with revenue codes: {sum(1 for c in self.claims if c.revenue_code is not None)}")

    def _load_from_csv(self):
        """Load claims from a CSV file with special handling for the FPS88 claims format.

        Column roles (procedure code, modifiers, diagnoses, revenue code,
        approval status) are detected heuristically from the header and a
        sample of values. Rows that fail to parse are reported and skipped.
        If no procedure code column can be identified, an error is printed
        and nothing is loaded.
        """
        print(f"Loading data from {self.data_path}")
        # dtype=str keeps every code exactly as written in the file; numeric
        # inference would turn "0540" into 540 and "59" into 59.0.
        df = pd.read_csv(self.data_path, dtype=str)

        # Debug data structure
        print(f"CSV columns count: {len(df.columns)}")
        columns_sample = df.columns.tolist()[:10]  # First 10 column names
        print(f"First 10 columns: {columns_sample}")

        # First column whose name contains "REV" (case-insensitive).
        # NOTE(review): this also matches unrelated names such as "PREV_..." -
        # confirm against the real schema.
        revenue_code_col = next((col for col in df.columns if 'REV' in col.upper()), None)

        # Find procedure code column: known names first ...
        proc_code_col = next(
            (name for name in ['procCd', 'procedure_code', 'PRCDR_CD', 'proc_cd', 'hcpcs_cd']
             if name in df.columns),
            None,
        )

        if not proc_code_col:
            # ... then the first column whose leading 50 values contain
            # something starting with "L" (L-codes) ...
            for col in df.columns:
                sample_values = df[col].astype(str).head(50).tolist()
                if any(val.startswith('L') for val in sample_values):
                    proc_code_col = col
                    print(f"Found procedure code column by L-code detection: {proc_code_col}")
                    break

        if not proc_code_col and len(df.columns) > 51:
            proc_code_col = df.columns[51]  # Try position 51 as fallback
            print(f"Using fallback procedure code column: {proc_code_col}")

        if not proc_code_col:
            print("ERROR: Could not identify procedure code column!")
            return

        # Find modifier columns
        modifier_cols = [
            name for name in ['procModifier1', 'procModifier2', 'procModifier3', 'procModifier4', 'procModifier5',
                              'PRCDR_1_MDFR_TXT', 'PRCDR_2_MDFR_TXT', 'PRCDR_3_MDFR_TXT', 'PRCDR_4_MDFR_TXT', 'PRCDR_5_MDFR_TXT']
            if name in df.columns
        ]
        if not modifier_cols:
            # Try to find modifier columns by pattern
            modifier_cols = [col for col in df.columns
                             if 'MDFR' in col or 'modifier' in col.lower() or 'MOD' in col]

        print(f"Using procedure code column: {proc_code_col}")
        print(f"Using modifier columns: {modifier_cols}")

        # Find diagnosis code columns
        diag_cols = [name for name in ['diagCd1', 'diagCd2', 'diagCd3', 'DGNS_1_CD', 'DGNS_2_CD', 'DGNS_3_CD']
                     if name in df.columns]
        if not diag_cols:
            # Try to find diagnosis columns by pattern
            diag_cols = [col for col in df.columns if 'DGNS' in col or 'diag' in col.lower()]

        # Find status column (usually the last column). Values are normalised
        # the same way as during per-row parsing.
        status_col = None
        last_col = df.columns[-1]
        sample_values = df[last_col].dropna().astype(str).head(10).tolist()
        if any(val.strip().upper() in ('APPROVED', 'DENIED') for val in sample_values):
            status_col = last_col
            print(f"Found approval status column: {status_col}")

        # Process each row into a ClaimLine object
        for idx, row in df.iterrows():
            try:
                raw_id = self._first_present(row, ['clm_id', 'claimId', 'ENHCNCMT_ID', 'claim_id'])
                claim_id = str(raw_id) if raw_id is not None else ""
                if not claim_id:
                    claim_id = f"CLAIM_{idx}"

                raw_line = self._first_present(row, ['serviceLineNum', 'SRVC_LINE_NUM', 'service_line_num'])
                # int(float(...)) accepts both "3" and "3.0"
                service_line_num = int(float(raw_line)) if raw_line is not None else idx + 1

                procedure_code = str(row[proc_code_col]) if pd.notna(row[proc_code_col]) else ""

                modifiers = [str(row[col]) for col in modifier_cols
                             if pd.notna(row[col]) and row[col]]
                diagnosis_codes = [str(row[col]) for col in diag_cols
                                   if pd.notna(row[col]) and row[col]]

                raw_amount = self._first_present(row, ['ALOWD_CHRG_AMT', 'amount', 'allowChgAmt'])
                amount = float(raw_amount) if raw_amount is not None else 0.0

                revenue_code_value = None
                if revenue_code_col and pd.notna(row[revenue_code_col]):
                    revenue_code_value = str(row[revenue_code_col])

                raw_specialty = self._first_present(row, ['provSpecCd', 'provider_specialty', 'PRVDR_SPCLTY_CD'])
                provider_specialty = str(raw_specialty) if raw_specialty is not None else "Unknown"

                raw_date = self._first_present(
                    row, ['enhancementTime', 'date_of_service', 'serviceFromDt', 'ENHNCMT_TIME'])
                if raw_date is None:
                    date_of_service = datetime.now().strftime("%Y-%m-%d")
                else:
                    # Keep only the date part of "<date>T<time>" timestamps
                    date_of_service = str(raw_date).split('T')[0]

                raw_pos = self._first_present(row, ['placeServCd', 'place_of_service', 'PLC_OD_SRVC_CD'])
                place_of_service = str(raw_pos) if raw_pos is not None else "11"  # Default

                approval_status = self._normalize_status(row[status_col]) if status_col else "UNKNOWN"

                self.claims.append(self._make_claim_line(
                    claim_id=claim_id,
                    service_line_num=service_line_num,
                    procedure_code=procedure_code,
                    modifiers=modifiers,
                    diagnosis_codes=diagnosis_codes,
                    amount=amount,
                    provider_specialty=provider_specialty,
                    date_of_service=date_of_service,
                    place_of_service=place_of_service,
                    approval_status=approval_status,
                    revenue_code=revenue_code_value,
                ))
            except Exception as e:
                # Best effort: report the bad row and continue with the rest.
                print(f"Error processing row {idx}: {e}")

        print(f"Successfully loaded {len(self.claims)} claim lines")

        # Analyze the loaded data
        procedures = set(c.procedure_code for c in self.claims)
        l_codes = [c.procedure_code for c in self.claims if c.procedure_code.startswith('L')]
        approved_claims = sum(1 for c in self.claims if c.approval_status == "APPROVED")
        denied_claims = sum(1 for c in self.claims if c.approval_status == "DENIED")
        rt_lt_claims = sum(1 for c in self.claims if c.has_anatomic_modifiers)
        bypass_claims = sum(1 for c in self.claims if c.has_bypass_modifiers)
        ncci_bypass_claims = sum(
            1 for c in self.claims
            if callable(getattr(c, 'has_bypass_modifier_for_ncci', None))
            and c.has_bypass_modifier_for_ncci()
        )

        print(f"Unique procedure codes: {len(procedures)}")
        print(f"L-codes found: {len(set(l_codes))} unique codes")
        if l_codes:
            print(f"Sample L-codes: {sorted(list(set(l_codes)))[:5]}")
        print(f"Approved claims: {approved_claims}, Denied claims: {denied_claims}")
        print(f"Claims with RT/LT modifiers: {rt_lt_claims}")
        print(f"Claims with standard bypass modifiers (GX, GY, GZ, RB): {bypass_claims}")
        print(f"Claims with NCCI bypass modifiers (59, XE, etc.): {ncci_bypass_claims}")

    def _load_from_json(self):
        """Load claims from a JSON file."""
        df = pd.read_json(self.data_path)
        self._process_dataframe(df)

    def _load_from_parquet(self):
        """Load claims from a Parquet file."""
        df = pd.read_parquet(self.data_path)
        self._process_dataframe(df)

    def _load_from_excel(self):
        """Load claims from an Excel file."""
        df = pd.read_excel(self.data_path)
        self._process_dataframe(df)

    def _process_dataframe(self, df: pd.DataFrame):
        """Process a DataFrame into ClaimLine objects.

        Column name variations are first renamed to the ClaimLine attribute
        names; for each attribute only the first matching variation is used.
        Missing optional columns are filled with defaults.

        Args:
            df: Frame read from JSON, Parquet or Excel.

        Raises:
            ValueError: If claim_id, service_line_num or procedure_code
                cannot be found under any known column name.
        """
        # Map DataFrame columns to ClaimLine attributes
        # Handle variations in column naming
        column_mappings = {
            'claim_id': ['claim_id', 'clm_id', 'clm_ID', 'ENHCNCMT_ID'],
            'service_line_num': ['service_line_num', 'line_num', 'SRVC_LINE_NUM'],
            'procedure_code': ['procedure_code', 'hcpcs_cd', 'hipps_cd', 'cpt_code', 'PRCDR_CD'],
            'modifiers': ['modifiers', 'mod', 'modifier', 'PRCDR_1_MDFR_TXT', 'PRCDR_2_MDFR_TXT', 'PRCDR_3_MDFR_TXT'],
            'diagnosis_codes': ['diagnosis_codes', 'diag_cd', 'icd_code'],
            'amount': ['amount', 'charge_amt', 'ALOWD_CHRG_AMT'],
            'provider_specialty': ['provider_specialty', 'prvdr_spclty'],
            'date_of_service': ['date_of_service', 'srvc_dt', 'ENHNCMT_TIME'],
            'place_of_service': ['place_of_service', 'pos_cd'],
            'approval_status': ['approval_status', 'status', 'claim_status']
        }
        # Standardize column names
        standardized_columns = {}
        for standard_name, variations in column_mappings.items():
            for variation in variations:
                if variation in df.columns:
                    standardized_columns[variation] = standard_name
                    break

        df = df.rename(columns=standardized_columns)

        for col in ['claim_id', 'service_line_num', 'procedure_code']:
            if col not in df.columns:
                raise ValueError(f"Required column '{col}' not found in data")

        # Handle optional columns
        optional_defaults = {
            'modifiers': None,
            'diagnosis_codes': None,
            'amount': 0.0,
            'provider_specialty': "Unknown",
            'date_of_service': datetime.now().strftime("%Y-%m-%d"),
            'place_of_service': "11",  # Office
            'approval_status': "UNKNOWN",
        }
        for col, default in optional_defaults.items():
            if col not in df.columns:
                df[col] = default

        # Process each row into a ClaimLine object
        for _, row in df.iterrows():
            self.claims.append(self._make_claim_line(
                claim_id=str(row['claim_id']),
                service_line_num=int(row['service_line_num']),
                procedure_code=str(row['procedure_code']),
                modifiers=self._split_codes(row['modifiers']),
                diagnosis_codes=self._split_codes(row['diagnosis_codes']),
                amount=float(row['amount']),
                provider_specialty=str(row['provider_specialty']),
                date_of_service=str(row['date_of_service']),
                place_of_service=str(row['place_of_service']),
                approval_status=self._normalize_status(row['approval_status']),
            ))

    # ------------------------------------------------------------------
    # Querying
    # ------------------------------------------------------------------
    def filter_claims(self, **kwargs) -> List["ClaimLine"]:
        """Filter claims based on criteria.

        Args:
            **kwargs: ``attribute=value`` pairs; a claim is kept when every
                recognised attribute equals the given value. Names that are
                neither a ClaimLine field nor a ClaimLine class attribute are
                ignored (a warning is logged).

        Returns:
            The claims matching all recognised criteria.
        """
        filtered_claims = self.claims
        # Dataclass fields without a default are not class attributes, so
        # hasattr(ClaimLine, ...) alone would miss e.g. "procedure_code".
        field_names = getattr(ClaimLine, "__dataclass_fields__", {})

        for key, value in kwargs.items():
            if key in field_names or hasattr(ClaimLine, key):
                filtered_claims = [claim for claim in filtered_claims
                                   if getattr(claim, key) == value]
            else:
                logger.warning(f"Ignoring unknown filter criterion: {key}")

        return filtered_claims

    def get_orthotic_claims(self) -> List["ClaimLine"]:
        """Get claims related to orthotics (L-codes)."""
        return [claim for claim in self.claims
                if claim.procedure_code.startswith('L')]

    def create_evaluation_dataset(self, sample_size: int = 100,
                                  seed: Optional[int] = None) -> List["ClaimLine"]:
        """Create a balanced evaluation dataset.

        Samples orthotic (L-code) claims, aiming for an even split between
        APPROVED and DENIED lines when both are present.

        Args:
            sample_size: Maximum number of claims to return.
            seed: Optional seed for reproducible sampling. When None the
                module-level ``random`` generator is used.

        Returns:
            The sampled claims; an empty list if there are no orthotic claims
            or none of them is labelled APPROVED or DENIED.
        """
        rng = random.Random(seed) if seed is not None else random

        # For orthotic claims, use a mix of approved and denied claims
        orthotic_claims = self.get_orthotic_claims()

        if not orthotic_claims:
            logger.warning("No orthotic claims found in dataset")
            return []

        # Ensure we don't sample more than available
        sample_size = min(sample_size, len(orthotic_claims))

        approved_claims = [c for c in orthotic_claims if c.approval_status == "APPROVED"]
        denied_claims = [c for c in orthotic_claims if c.approval_status == "DENIED"]

        print(f"Available for sampling: {len(approved_claims)} approved, {len(denied_claims)} denied")

        # If we have both types, create a balanced sample
        if approved_claims and denied_claims:
            # Determine how many of each to sample
            approved_sample_size = min(sample_size // 2, len(approved_claims))
            denied_sample_size = min(sample_size - approved_sample_size, len(denied_claims))

            # Adjust approved sample if we couldn't get enough denied samples
            if denied_sample_size < (sample_size // 2):
                approved_sample_size = min(sample_size - denied_sample_size, len(approved_claims))

            sampled_approved = rng.sample(approved_claims, approved_sample_size)
            sampled_denied = rng.sample(denied_claims, denied_sample_size)

            sampled_claims = sampled_approved + sampled_denied
            rng.shuffle(sampled_claims)
        else:
            # Only one type (or none) is labelled. sample_size was capped by
            # the number of orthotic claims, which can exceed the labelled
            # ones when UNKNOWN lines exist, so cap it again here.
            available_claims = approved_claims or denied_claims
            sampled_claims = rng.sample(available_claims, min(sample_size, len(available_claims)))

        print(f"Created evaluation dataset with {len(sampled_claims)} claims")
        print(f"  - Approved: {sum(1 for c in sampled_claims if c.approval_status == 'APPROVED')}")
        print(f"  - Denied: {sum(1 for c in sampled_claims if c.approval_status == 'DENIED')}")

        return sampled_claims


In [0]:
# Load the labeled FPS17 (ambulance) claim lines and tabulate them, one row per ClaimLine.
fps17_claim_lines = MedicareClaimsDataset('/Workspace/Users/benjamin.wynn@peraton.com/GlobalEditsModel/testing/test_lines/fps17_labeled_clmlines.csv').claims
ambulance_dataset = pd.DataFrame(fps17_claim_lines)
# Same for the labeled FPS88 (orthotics) claim lines.
fps88_claim_lines = MedicareClaimsDataset('/Workspace/Users/benjamin.wynn@peraton.com/GlobalEditsModel/testing/test_lines/fps88_labeled_clmlines.csv').claims
orthotics_dataset = pd.DataFrame(fps88_claim_lines)

In [0]:

# Attach the policy text and the edit number to each dataset.
# Order matters only for the printed output: FPS88 first, then FPS17.
for policy_path, claims_frame, edit_number in (
    ('/Workspace/Users/benjamin.wynn@peraton.com/GlobalEditsModel/testing/test_policy/fps88_policy.md', orthotics_dataset, 88),
    ('/Workspace/Users/benjamin.wynn@peraton.com/GlobalEditsModel/testing/test_policy/fps17_policy.md', ambulance_dataset, 17),
):
    with open(policy_path, 'r', encoding='utf-8') as file:
        markdown_text = file.read()

    print(markdown_text)
    # A scalar assignment broadcasts the same value to every row.
    claims_frame["policy"] = markdown_text
    claims_frame["edit"] = edit_number

In [0]:
# Stack both datasets (each keeps its own row index) and persist the result.
frames_to_combine = [ambulance_dataset, orthotics_dataset]
combined = pd.concat(frames_to_combine)
testing_data_path = '/Workspace/Users/benjamin.wynn@peraton.com/GlobalEditsModel/testing/testing_data.csv'
combined.to_csv(testing_data_path)
combined.head()

In [0]:
# Share of rows whose revenue_code is not the literal string "None".
not_none_string = combined['revenue_code'] != "None"
sum(not_none_string) / combined.shape[0]

In [0]:
# Keep the first row's revenue code around for inspection.
first_row = combined.iloc[0]
rev_val = first_row['revenue_code']
# Share of rows that carry a non-null revenue code.
has_revenue_code = combined['revenue_code'].notnull()
sum(has_revenue_code) / combined.shape[0]

In [0]:
# Show the Python type of the first stored revenue_code value.
first_revenue_code = combined['revenue_code'].values[0]
print(type(first_revenue_code))