In [1]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

sns.set(style='whitegrid')

In [2]:
use_cols = ['establishment_id','NEW_INCIDENT_LOCATION','NEW_INCIDENT_DESCRIPTION',"NEW_NAR_BEFORE_INCIDENT",
            "NEW_NAR_WHAT_HAPPENED","NEW_NAR_INJURY_ILLNESS", "NEW_NAR_OBJECT_SUBSTANCE"]
df = pd.read_csv('ITA Case Detail Data.csv', usecols=use_cols, encoding='latin1')

# Display the first few rows
df.head()

Unnamed: 0,establishment_id,NEW_NAR_WHAT_HAPPENED,NEW_NAR_BEFORE_INCIDENT,NEW_INCIDENT_LOCATION,NEW_NAR_INJURY_ILLNESS,NEW_NAR_OBJECT_SUBSTANCE,NEW_INCIDENT_DESCRIPTION
0,41940,needle stick to left thumb after administering...,getting medication ready to administer to resi...,Resident room,needle stick,needle,left thumb needle stick after giving medicatio...
1,41940,While attempting to give care for resident emp...,Giving care to the resident,Resident room,Left shoulder strain,resident,left should strain from trying to turn resident
2,41940,While reaching employee tried to brace herself...,Trying to reach to put her schedule on the table,rehab office,right wrist and shoulder strain left knee pain,the chair,right shoulder and arm strain and left knee pa...
3,41940,Employee entered shower room to wash hands and...,entered shower room to wash hands,shower room,Lower back pain left side numbness,wet floor,strain of lumbar region and neck muscle
4,41940,Administered insulin injection to resident upo...,Getting ready to administer insulin to resident,Resident room,Needle stick,needle,left second finger needle stick


In [7]:
# Import necessary NLP libraries
import re
from collections import Counter

# Function to identify gas-related injuries based on text description
def identify_gas_incidents(df):
    # List of gas-related keywords (can be expanded)
    gas_keywords = [
        'gas', 'carbon monoxide', 'co2', 'methane', 'propane', 'natural gas', 
        'hydrogen', 'oxygen', 'nitrogen', 'ammonia', 'chlorine', 'hydrogen sulfide',
        'h2s', 'fumes', 'toxic vapor', 'chemical vapor', 'benzene', 'vapor', 
        'co poisoning', 'sulphur dioxide', 'so2', 'combustible', 'flammable gas'
    ]
    
    # Create a regex pattern for case-insensitive matching of gas terms
    pattern = re.compile(r'\b(' + '|'.join(gas_keywords) + r')\b', re.IGNORECASE)
    
    # Gas-related illness/injury patterns - these indicate actual health effects
    illness_patterns = [
        r'expos\w+ to', r'inhal\w+', r'breath\w+', r'inhaled', 
        r'overcome by', r'affected by', r'sick from', r'ill from', 
        r'irritat\w+ by', r'reaction to', r'burn\w+ by', 
        r'headache from', r'dizz\w+', r'unconscious', r'asphyxiat\w+',
        r'poison\w+', r'nausea', r'vomit\w+', r'respiratory', r'breathing difficulty',
        r'breathing problem', r'chest pain', r'lung', r'passed out', r'faint\w+',
        r'suffocate', r'chok\w+', r'cough\w+', r'sick\w+'
    ]
    
    illness_pattern = re.compile(r'\b(' + '|'.join(illness_patterns) + r')\b', re.IGNORECASE)
    
    # Causal connection patterns that link gas and illness
    causal_patterns = [
        r'due to', r'caused by', r'because of', r'from', r'result of', 
        r'exposed to', r'after', r'following'
    ]
    
    causal_pattern = re.compile(r'\b(' + '|'.join(causal_patterns) + r')\b', re.IGNORECASE)
    
    # Non-injury gas incidents like leaks
    gas_incident_patterns = [
        r'leak\w*', r'gas leak', r'explosion'
    ]
    
    gas_incident_pattern = re.compile(r'\b(' + '|'.join(gas_incident_patterns) + r')\b', re.IGNORECASE)
    
    # Function to check if text indicates a gas-related illness/injury
    def is_gas_illness(text):
        if pd.isna(text):
            return False
        text = str(text)
        
        # Exclude COVID-19 cases that might be detected due to "CO"
        if re.search(r'\bcovid|\bcorona|\bsars-cov', text, re.IGNORECASE):
            return False
        
        # Check if gas term exists
        gas_match = pattern.search(text)
        if not gas_match:
            return False
            
        # Check for exclusion patterns - actions not resulting in gas exposure
        exclusion_patterns = [
            r'carrying', r'lift\w+', r'mov\w+', r'handl\w+', r'strain', 
            r'sprain', r'muscle', r'filling', r'transport\w+', r'empty',
            r'cylinder', r'tank', r'container', r'bottle', r'canister',
            r'heavy', r'weight', r'dropped', r'fell', r'loading'
        ]
        exclusion_pattern = re.compile(r'\b(' + '|'.join(exclusion_patterns) + r')\b', re.IGNORECASE)
        
        # If exclusion pattern is present and no illness indicator, exclude
        if exclusion_pattern.search(text) and not illness_pattern.search(text):
            return False
        
        # Check for gas leaks or explosions (which are gas incidents even without explicit mention of illness)
        if gas_incident_pattern.search(text):
            return True
            
        # Check for illness patterns
        if illness_pattern.search(text):
            # Look for causal connection between gas and illness
            # If there's both gas and illness mentioned, we'll assume there's a connection
            return True
            
        return False
    
    # Apply the refined check to relevant columns
    columns_to_check = ['NEW_NAR_OBJECT_SUBSTANCE', 'NEW_INCIDENT_DESCRIPTION', 
                       'NEW_NAR_WHAT_HAPPENED', 'NEW_NAR_INJURY_ILLNESS']
    is_gas_related = df.apply(
        lambda row: any(is_gas_illness(row[col]) for col in columns_to_check if pd.notna(row[col])), 
        axis=1
    )
    
    # Return the subset of gas-related incidents
    gas_incidents = df[is_gas_related].copy()
    
    return gas_incidents

gas_incidents_nlp = identify_gas_incidents(df)

In [8]:
# Save gas-related incidents to a new CSV file
print("Total number of incidents: ", len(df))
print(f"Total gas-related incidents identified via NLP: {len(gas_incidents_nlp)}")
print(f"Percentage of incidents that are gas-related: {(len(gas_incidents_nlp) / len(df) * 100):.2f}%")

output_filename = 'gas_related_incidents.csv'
gas_incidents_nlp.to_csv(output_filename, index=False)
print(f"\nGas-related incidents saved to '{output_filename}'")

# Display a few examples of gas-related incidents
print("\nExample gas-related incidents:")
sample_cols = ['NEW_NAR_WHAT_HAPPENED', 'NEW_NAR_OBJECT_SUBSTANCE']
display(gas_incidents_nlp[sample_cols].sample(5))

Total number of incidents:  890934
Total gas-related incidents identified via NLP: 1170
Percentage of incidents that are gas-related: 0.13%

Gas-related incidents saved to 'gas_related_incidents.csv'

Example gas-related incidents:


Unnamed: 0,NEW_NAR_WHAT_HAPPENED,NEW_NAR_OBJECT_SUBSTANCE
325878,The AA was processing packages at the dock fla...,Debris Other
129817,Cleaning up a spill of UN3352 a Pyrethroid Pes...,UN3352 a Pyrethroid Pesticide Liquid.
139457,Inhalation Exposure Head Freight 3 chemical co...,Freight 3 chemical compound gas liquid
393135,There was an ammonia leak and the smell bother...,N A
351616,[REDACTED] stated feeling dizzy and chest pain...,Propane
