Import libraries

In [1]:
!pip install faker



In [2]:
import os
import json
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, time
import random
import re

fake = Faker()

In [3]:
# Define the folder path
folder_path = r"C:\Users\yomid\OneDrive - Case Western Reserve University\Desktop\KaggleX\data_trial"

# Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

In [4]:
# Define equipment types and their characteristics (as before)
equipment_types = {
    'Bar Screen': {'lifespan': 15, 'maintenance_interval': 30, 'failure_rate': 0.005},
    'Grit Chamber': {'lifespan': 20, 'maintenance_interval': 60, 'failure_rate': 0.003},
    'Primary Clarifier': {'lifespan': 30, 'maintenance_interval': 90, 'failure_rate': 0.002},
    'Aeration Tank': {'lifespan': 25, 'maintenance_interval': 45, 'failure_rate': 0.004},
    'Secondary Clarifier': {'lifespan': 30, 'maintenance_interval': 90, 'failure_rate': 0.002},
    'Activated Sludge System': {'lifespan': 20, 'maintenance_interval': 30, 'failure_rate': 0.006},
    'Trickling Filter': {'lifespan': 25, 'maintenance_interval': 60, 'failure_rate': 0.003},
    'Membrane Bioreactor': {'lifespan': 15, 'maintenance_interval': 30, 'failure_rate': 0.007},
    'Sand Filter': {'lifespan': 20, 'maintenance_interval': 45, 'failure_rate': 0.004},
    'UV Disinfection Unit': {'lifespan': 10, 'maintenance_interval': 30, 'failure_rate': 0.005},
    'Chemical Treatment System': {'lifespan': 15, 'maintenance_interval': 30, 'failure_rate': 0.004},
    'Sludge Thickener': {'lifespan': 20, 'maintenance_interval': 60, 'failure_rate': 0.003},
    'Anaerobic Digester': {'lifespan': 25, 'maintenance_interval': 90, 'failure_rate': 0.002},
    'Centrifuge': {'lifespan': 15, 'maintenance_interval': 45, 'failure_rate': 0.005},
    'Reverse Osmosis System': {'lifespan': 10, 'maintenance_interval': 30, 'failure_rate': 0.006},
    'Activated Carbon Filter': {'lifespan': 5, 'maintenance_interval': 60, 'failure_rate': 0.003},
    'Lift Station': {'lifespan': 25, 'maintenance_interval': 60, 'failure_rate': 0.004},
}

In [5]:
# equipment_parts
equipment_parts = {
    'Bar Screen': ['Motor', 'Screen bars', 'Scraper arms', 'Chain drive', 'Bearings', 'Gearbox'],
    'Grit Chamber': ['Grit pump', 'Air diffusers', 'Screw conveyor', 'Valves', 'Bearings', 'Piping'],
    'Primary Clarifier': ['Sludge scraper mechanism', 'Drive motor', 'Gearbox', 'Skimmer arm', 'Chain and sprockets', 'Bearings'],
    'Aeration Tank': ['Diffusers', 'Blowers', 'Valves', 'Air piping', 'Motors and fans', 'Membranes'],
    'Secondary Clarifier': ['Sludge removal mechanisms', 'Drive motors', 'Skimmer blades', 'Gearbox', 'Bearings', 'Chains and sprockets'],
    'Activated Sludge System': ['Aeration diffusers', 'Blowers', 'Pumps', 'Valves', 'Sensors', 'Motors'],
    'Trickling Filter': ['Distribution arms', 'Filter media', 'Spray nozzles', 'Pumps', 'Valves', 'Bearings'],
    'Membrane Bioreactor': ['Membranes', 'Permeate pump', 'Valves', 'Air diffusers', 'Blowers', 'Membrane cleaning system parts'],
    'Sand Filter': ['Filter media', 'Backwash pumps', 'Valves', 'Piping', 'Flow meters', 'Sensors'],
    'UV Disinfection Unit': ['UV lamps', 'Quartz sleeves', 'Ballasts', 'Power supplies', 'Sensors', 'Wipers'],
    'Chemical Treatment System': ['Chemical dosing pumps', 'Valves', 'Mixing tanks', 'Sensors', 'Injection nozzles', 'Tubing and piping'],
    'Sludge Thickener': ['Drive motors', 'Scrapers', 'Bearings', 'Gearboxes', 'Pumps', 'Valves'],
    'Anaerobic Digester': ['Mixing systems', 'Gas blowers', 'Heat exchangers', 'Valves', 'Pumps', 'Pressure relief valves'],
    'Centrifuge': ['Bearings', 'Drive belts', 'Feed pumps', 'Bowl and scroll assembly', 'Gearbox', 'Seals'],
    'Reverse Osmosis System': ['Membranes', 'Feed pumps', 'Pressure vessels', 'Valves', 'O-rings and gaskets', 'Sensors'],
    'Activated Carbon Filter': ['Filter media', 'Pumps', 'Valves', 'Backwash system components', 'Piping', 'Flow meters'],
    'Lift Station': ['Pumps', 'Valves', 'Level sensors', 'Float switches', 'Control panels', 'Bearings']
}

In [6]:
 #Define realistic actions for each type of maintenance
routine_maintenance_actions = [
    "Inspected equipment for wear and tear.",
    "Lubricated moving parts to ensure smooth operation.",
    "Tightened all bolts and fasteners.",
    "Replaced air filters and checked system airflow.",
    "Calibrated sensors to ensure accurate readings.",
    "Cleaned sediment from filters and chambers.",
    "Checked for leaks and repaired minor cracks.",
    "Replaced worn-out gaskets and seals.",
    "Flushed system to remove debris buildup.",
    "Tested performance metrics and adjusted settings.",
    "Replaced worn belts and hoses.",
    "Cleaned and aligned drive chains.",
    "Performed system diagnostics and recalibrated equipment.",
    "Tested alarm and safety systems for functionality.",
    "Checked and refilled hydraulic fluid.",
    "Cleared blockages from intake valves.",
    "Replaced faulty pressure sensors.",
    "Tightened or replaced loose fittings and connections.",
    "Checked for abnormal vibrations in rotating equipment.",
    "Reprogrammed system software for optimal performance."
    "Inspected and cleaned UV lamps for optimal performance.",
    "Checked and adjusted pH levels in treatment tanks.",
    "Replaced worn-out bearings in rotating equipment.",
    "Tested backup generators to ensure readiness.",
    "Cleaned and inspected sludge dewatering equipment.",
    "Checked and calibrated flow meters.",
    "Inspected and cleaned aeration diffusers.",
    "Replaced worn-out impellers in pumps.",
    "Checked and cleaned chemical dosing systems.",
    "Inspected and tightened electrical connections.",
    "Tested and adjusted chemical feed pumps.",
    "Cleaned and inspected grit removal systems.",
    "Checked and replaced worn-out seals in valves.",
    "Inspected and cleaned scum removal equipment.",
    "Tested and calibrated dissolved oxygen sensors.",
    "Checked and cleaned level sensors in tanks.",
    "Inspected and lubricated gearboxes.",
    "Checked and adjusted blower systems.",
    "Cleaned and inspected heat exchangers.",
    "Tested and adjusted sludge blanket levels."
]

emergency_repair_actions = [
    "Repaired broken pump due to motor failure.",
    "Replaced damaged filter cartridge caused by system overload.",
    "Fixed electrical wiring issue leading to system shutdown.",
    "Replaced cracked housing on UV Disinfection Unit.",
    "Repaired leaking valve in aeration system.",
    "Replaced motor on Centrifuge after overheating.",
    "Repaired mechanical failure in Lift Station pump.",
    "Emergency repair of membrane in Membrane Bioreactor.",
    "Replaced burnt-out fuse causing equipment to malfunction.",
    "Performed immediate repairs on a cracked pipe.",
    "Replaced damaged fan blades in the cooling system.",
    "Sealed a major leak in the chemical treatment system.",
    "Replaced malfunctioning PLC module.",
    "Repaired corroded electrical connections.",
    "Replaced faulty relay switch causing intermittent shutdowns.",
    "Repaired malfunctioning flow meters.",
    "Rebuilt motor drive system after critical failure.",
    "Fixed broken pressure relief valve."
    "Replaced failed blower motor causing aeration system shutdown.",
    "Fixed burst pipe in sludge transfer line.",
    "Repaired malfunctioning chemical dosing pump.",
    "Replaced damaged control panel due to electrical surge.",
    "Fixed broken agitator in mixing tank.",
    "Repaired leaking gasket in high-pressure pump.",
    "Replaced burnt-out UV lamp in disinfection unit.",
    "Fixed jammed conveyor belt in sludge handling system.",
    "Repaired cracked tank wall in primary clarifier.",
    "Replaced faulty PLC module causing system errors.",
    "Fixed broken drive shaft in aeration blower.",
    "Repaired damaged float switch in lift station.",
    "Replaced failed solenoid valve in chemical feed system.",
    "Fixed broken impeller in recirculation pump.",
    "Repaired leaking flange in effluent discharge line.",
    "Replaced damaged motor starter in control panel.",
    "Fixed broken chain in grit removal system.",
    "Repaired malfunctioning level sensor in digester.",
    "Replaced failed pressure relief valve in sludge system.",
    "Fixed broken coupling in aeration blower."
]

inspection_actions = [
    "Completed routine inspection of equipment.",
    "Checked sensor readings for accuracy.",
    "Inspected for any signs of corrosion or damage.",
    "Performed safety check on all electrical connections.",
    "Tested emergency shutdown system functionality.",
    "Checked fluid levels and replenished as needed.",
    "Reviewed equipment logs for irregularities.",
    "Conducted visual inspection for any loose connections.",
    "Tested pH levels in water treatment system.",
    "Reviewed recent equipment performance logs for anomalies.",
    "Inspected filter cartridges for clogs or damage.",
    "Verified calibration of pressure and flow sensors.",
    "Checked alignment of drive shafts and pumps.",
    "Reviewed backup system readiness in case of failure.",
    "Checked for excessive heat generation in electrical panels.",
    "Confirmed air pressure levels in pneumatic systems.",
    "Verified system alarms and communication systems are functioning properly."
    "Visually inspected all tanks for signs of corrosion or damage.",
    "Checked all pumps for abnormal noise or vibration.",
    "Inspected electrical panels for signs of overheating or loose connections.",
    "Reviewed system logs for any unusual activity or error messages.",
    "Checked all valves for leaks or signs of wear.",
    "Inspected aeration system for proper operation and airflow.",
    "Checked chemical storage tanks for leaks or contamination.",
    "Inspected UV disinfection units for proper operation and cleanliness.",
    "Checked all sensors for accurate readings and recalibrated if necessary.",
    "Inspected sludge handling equipment for blockages or wear.",
    "Checked all safety equipment for proper operation and readiness.",
    "Inspected piping for signs of leaks or damage.",
    "Checked all motors and gearboxes for proper lubrication and operation.",
    "Inspected heat exchangers for signs of fouling or leaks.",
    "Checked all alarms and emergency systems for proper operation.",
     "Inspected grit removal systems for blockages or wear.",
    "Checked all flow meters for accurate readings and recalibrated if necessary.",
    "Inspected chemical dosing pumps for proper operation and leaks.",
    "Checked all filters for debris buildup and cleaned or replaced as necessary.",
    "Inspected all drive chains and belts for proper tension and wear."
]

In [7]:
emergency_repair_notes = [
    "Repaired broken pump due to motor failure. Pump tested and returned to normal operation.",
    "Replaced damaged filter cartridge caused by system overload. System performance restored.",
    "Fixed electrical wiring issue leading to system shutdown. System restarted without issues.",
    "Replaced cracked housing on UV Disinfection Unit. Resumed normal operations after testing.",
    "Repaired leaking valve in aeration system. Leak stopped, system operating at optimal levels.",
    "Replaced motor on Centrifuge after overheating. Equipment tested for heat tolerance post-repair.",
    "Repaired mechanical failure in Lift Station pump. Pump back online and functioning properly.",
    "Emergency repair of membrane in Membrane Bioreactor. Checked and confirmed proper filtration.",
    "Replaced burnt-out fuse causing equipment to malfunction. Fuse box inspected for additional damage.",
    "Performed immediate repairs on a cracked pipe. Water flow restored and checked for further leaks.",
    "Replaced damaged fan blades in the cooling system. Tested cooling efficiency post-repair.",
    "Sealed a major leak in the chemical treatment system. Chemical levels stabilized after fix.",
    "Replaced malfunctioning PLC module. System reprogrammed and operational after repairs.",
    "Repaired corroded electrical connections. Inspected other connections for early signs of corrosion.",
    "Replaced faulty relay switch causing intermittent shutdowns. Continuous operation verified.",
    "Repaired malfunctioning flow meters. Flow readings back within acceptable ranges.",
    "Rebuilt motor drive system after critical failure. Motor functioning well post-repair.",
    "Fixed broken pressure relief valve. Pressure levels normalized after the repair.",
    "Blower motor replaced due to failure. Aeration system back online.",
    "Burst pipe in sludge transfer line fixed. System operating normally.",
    "Chemical dosing pump repaired. No further issues detected.",
    "Control panel replaced after electrical surge. System functioning properly.",
    "Agitator in mixing tank fixed. No further malfunctions.",
    "Leaking gasket in high-pressure pump repaired. No leaks detected.",
    "Burnt-out UV lamp replaced. Disinfection unit operational.",
    "Jammed conveyor belt in sludge handling system fixed. Operating smoothly.",
    "Cracked tank wall in primary clarifier repaired. No further issues.",
    "Faulty PLC module replaced. System errors resolved.",
    "Broken drive shaft in aeration blower fixed. System back online.",
    "Damaged float switch in lift station repaired. Functioning properly.",
    "Failed solenoid valve in chemical feed system replaced. No further issues.",
    "Broken impeller in recirculation pump fixed. System operating normally.",
    "Leaking flange in effluent discharge line repaired. No leaks detected.",
     "Damaged motor starter in control panel replaced. System functioning properly.",
    "Broken chain in grit removal system fixed. Operating smoothly.",
    "Malfunctioning level sensor in digester repaired. Accurate readings.",
    "Failed pressure relief valve in sludge system replaced. No further issues.",
    "Broken coupling in aeration blower fixed. System back online."
]

routine_maintenance_notes = [
    "Inspected equipment for wear and tear. No significant issues detected; equipment is in good condition.",
    "Lubricated moving parts to ensure smooth operation. Reduced friction and noise levels; operation is quieter.",
    "Tightened all bolts and fasteners. No loose components were found; equipment stability improved.",
    "Replaced air filters and checked system airflow. Airflow optimized; replaced filters were significantly clogged.",
    "Calibrated sensors to ensure accurate readings. Sensors are now providing accurate and consistent data.",
    "Cleaned sediment from filters and chambers. Improved filtration efficiency; system performance enhanced.",
    "Checked for leaks and repaired minor cracks. All leaks fixed; system pressure and performance stabilized.",
    "Replaced worn-out gaskets and seals. Sealed components effectively; prevented future leaks.",
    "Flushed system to remove debris buildup. Flow rate improved; system free of obstructions.",
    "Tested performance metrics and adjusted settings. Performance metrics within optimal ranges; settings fine-tuned.",
    "Replaced worn belts and hoses. Equipment now operates smoothly without slipping or leaks.",
    "Cleaned and aligned drive chains. Improved power transmission and reduced noise; alignment verified.",
    "Performed system diagnostics and recalibrated equipment. Equipment functioning correctly with updated diagnostics.",
    "Tested alarm and safety systems for functionality. All systems operational; no faults detected.",
    "Checked and refilled hydraulic fluid. Fluid levels are now at optimal levels; system pressure restored.",
    "Cleared blockages from intake valves. Increased intake efficiency; system running at full capacity.",
    "Replaced faulty pressure sensors. Sensors are now accurate; system pressure readings are reliable.",
    "Tightened or replaced loose fittings and connections. No further leaks or operational issues observed.",
    "Checked for abnormal vibrations in rotating equipment. Vibration levels normal; no imbalance detected.",
    "Reprogrammed system software for optimal performance. Software updated and running efficiently; performance improved.",
    "UV lamps inspected and cleaned. No signs of wear or damage.",
    "pH levels checked and adjusted. All readings within acceptable range.",
    "Bearings in rotating equipment replaced. No further issues detected.",
    "Backup generators tested and ready. No faults found.",
    "Sludge dewatering equipment cleaned and inspected. Operating normally.",
    "Flow meters checked and calibrated. Readings accurate.",
    "Aeration diffusers inspected and cleaned. No blockages found.",
    "Impellers in pumps replaced. System functioning optimally.",
    "Chemical dosing systems checked and cleaned. No issues detected.",
    "Electrical connections inspected and tightened. All secure.",
    "Chemical feed pumps tested and adjusted. Operating within parameters.",
    "Grit removal systems cleaned and inspected. No debris buildup.",
    "Seals in valves checked and replaced. No leaks detected.",
    "Scum removal equipment inspected and cleaned. Functioning properly.",
    "Dissolved oxygen sensors tested and calibrated. Accurate readings.",
    "Level sensors in tanks checked and cleaned. No issues found.",
    "Gearboxes inspected and lubricated. Operating smoothly.",
    "Blower systems checked and adjusted. No abnormalities detected.",
    "Heat exchangers cleaned and inspected. No signs of wear.",
    "Sludge blanket levels tested and adjusted. Within optimal range."
]

inspection_notes = [
    "Completed routine inspection of equipment. All units are operating within normal parameters; no immediate issues detected.",
    "Checked sensor readings for accuracy. Readings are accurate; recalibration is not required at this time.",
    "Inspected for any signs of corrosion or damage. Minor corrosion found on metal parts; scheduled for cleaning and treatment.",
    "Performed safety check on all electrical connections. Connections are secure; no safety hazards identified.",
    "Tested emergency shutdown system functionality. System tested successfully; emergency shutdown is operational.",
    "Checked fluid levels and replenished as needed. Fluid levels are optimal; no leaks or deficiencies found.",
    "Reviewed equipment logs for irregularities. Logs show normal operation; no irregularities detected.",
    "Conducted visual inspection for any loose connections. All connections are tight and secure; no maintenance required.",
    "Tested pH levels in water treatment system. pH levels are within the acceptable range; no action required.",
    "Reviewed recent equipment performance logs for anomalies. Performance is consistent with expected norms; no anomalies found.",
    "Inspected filter cartridges for clogs or damage. Cartridges are clean with minimal debris; replaced as a precaution.",
    "Verified calibration of pressure and flow sensors. Sensors are properly calibrated; readings are within expected ranges.",
    "Checked alignment of drive shafts and pumps. Alignment is correct; no signs of misalignment or wear.",
    "Reviewed backup system readiness in case of failure. Backup systems are functional and ready for use if needed.",
    "Checked for excessive heat generation in electrical panels. No excessive heat detected; panels are operating within safe temperatures.",
    "Confirmed air pressure levels in pneumatic systems. Air pressure is within optimal range; systems are functioning correctly.",
    "Verified system alarms and communication systems are functioning properly. All alarms and communication systems are operational with no faults.",
     "Tanks visually inspected. No signs of corrosion or damage detected.",
    "Pumps checked. No abnormal noise or vibration observed.",
    "Electrical panels inspected. No signs of overheating or loose connections.",
    "System logs reviewed. No unusual activity or error messages found.",
    "Valves checked. No leaks or signs of wear detected.",
    "Aeration system inspected. Proper operation and airflow confirmed.",
    "Chemical storage tanks checked. No leaks or contamination detected.",
    "UV disinfection units inspected. Proper operation and cleanliness confirmed.",
    "Sensors checked. Accurate readings, no recalibration needed.",
    "Sludge handling equipment inspected. No blockages or wear detected.",
    "Safety equipment checked. All systems operational and ready.",
    "Piping inspected. No signs of leaks or damage detected.",
    "Motors and gearboxes checked. Proper lubrication and operation confirmed.",
    "Heat exchangers inspected. No signs of fouling or leaks detected.",
    "Alarms and emergency systems checked. All systems operational.",
    "Grit removal systems inspected. No blockages or wear detected.",
    "Flow meters checked. Accurate readings, no recalibration needed.",
    "Chemical dosing pumps inspected. Proper operation and no leaks detected.",
    "Filters checked. Minor debris buildup, cleaned and replaced as necessary.",
    "Drive chains and belts inspected. Proper tension and no wear detected."
]

In [8]:
def get_relevant_parts(equipment_type, action, num_parts=3):

    all_parts = equipment_parts[equipment_type]
    action_words = set(re.findall(r'\b\w+\b', action.lower()))

    # First, get parts that match the action
    relevant_parts = [part for part in all_parts if any(word in part.lower() for word in action_words)]

    # If we don't have enough relevant parts, add random parts from the equipment
    while len(relevant_parts) < num_parts and len(relevant_parts) < len(all_parts):
        random_part = random.choice(all_parts)
        if random_part not in relevant_parts:
            relevant_parts.append(random_part)

    return relevant_parts[:num_parts]

In [9]:
def create_action_note_pairs(actions, notes):
    pairs = []
    for action in actions:
        best_match = max(notes, key=lambda note: len(set(action.lower().split()) & set(note.lower().split())))
        pairs.append((action, best_match))
    return pairs

routine_maintenance_pairs = create_action_note_pairs(routine_maintenance_actions, routine_maintenance_notes)
emergency_repair_pairs = create_action_note_pairs(emergency_repair_actions, emergency_repair_notes)
inspection_pairs = create_action_note_pairs(inspection_actions, inspection_notes)

In [10]:
import random
from datetime import datetime, time

def create_observation(equipment, maintenance_type, date):
    work_order_id = str(fake.uuid4())
    maintenance_log_id = str(fake.uuid4())
    status_report_id = str(fake.uuid4())

    if maintenance_type == 'Routine Maintenance':
        action, note = random.choice(routine_maintenance_pairs)
    elif maintenance_type == 'Emergency Repair':
        action, note = random.choice(emergency_repair_pairs)
    else:  # Inspection
        action, note = random.choice(inspection_pairs)

    relevant_parts = get_relevant_parts(equipment['type'], action)
    parts_used = ', '.join(relevant_parts) if relevant_parts else 'No specific parts'

    # Generate a random time
    random_time = time(random.randint(0, 23), random.randint(0, 59), random.randint(0, 59))

    # Combine date and random time
    timestamp = datetime.combine(date, random_time)

    # Format timestamp to include a space instead of 'T'
    formatted_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")

    return {
        'work_order_id': work_order_id,
        'equipment_id': equipment['id'],
        'maintenance_log_id': maintenance_log_id,
        'status_report_id': status_report_id,
        'timestamp': formatted_timestamp,  # Use the formatted timestamp
        'technician': fake.name(),
        'maintenance_type': maintenance_type,
        'action_taken': action,
        'parts_used': parts_used,
        'notes': note,
        'equipment_name': equipment['name'],
        'equipment_type': equipment['type'],
        'equipment_status': equipment['status'],
        'performance_metric': round(random.uniform(0.7, 1.0), 2)
    }

In [11]:
def generate_data(num_equipment, start_date, end_date, target_observations):
    combined_data = []
    total_observations = 0

    start_date = datetime.combine(start_date, datetime.min.time())
    end_date = datetime.combine(end_date, datetime.min.time())

    for _ in range(num_equipment):
        equip_type = random.choice(list(equipment_types.keys()))
        equipment = {
            'id': str(fake.uuid4()),
            'name': f"{equip_type}-{fake.random_int(min=1000, max=999999)}",
            'type': equip_type,
            'installation_date': fake.date_between(start_date='-100y', end_date='-1y'),
            'last_maintenance': fake.date_between(start_date='-1y', end_date='today'),
            'status': random.choices(['Operational', 'Needs Maintenance', 'Under Repair'], weights=[0.8, 0.15, 0.05])[0]
        }

        maintenance_date = datetime.combine(equipment['last_maintenance'], datetime.min.time())
        while maintenance_date <= end_date:
            if maintenance_date >= start_date:
                # Routine Maintenance
                observation = create_observation(equipment, 'Routine Maintenance', maintenance_date)
                combined_data.append(observation)
                total_observations += 1

                # Inspection (50% chance)
                if random.random() < 0.5:
                    inspection_observation = create_observation(equipment, 'Inspection', maintenance_date)
                    combined_data.append(inspection_observation)
                    total_observations += 1

            maintenance_date += timedelta(days=equipment_types[equipment['type']]['maintenance_interval'])

        # Emergency repairs
        current_date = start_date
        while current_date <= end_date:
            if random.random() < equipment_types[equipment['type']]['failure_rate']:
                emergency_observation = create_observation(equipment, 'Emergency Repair', current_date)
                combined_data.append(emergency_observation)
                total_observations += 1

            current_date += timedelta(days=1)

        if total_observations >= target_observations:
            break

    return combined_data, total_observations

In [12]:
# Main execution
num_equipment = 1000
start_date = datetime(1700, 1, 1)
end_date = datetime(2024, 5, 31)
target_observations = 1000000

In [13]:
print("Generating data...")
combined_data, total_observations = generate_data(num_equipment, start_date, end_date, target_observations)

print(f"Total observations generated: {total_observations}")

Generating data...
Total observations generated: 470580


In [14]:
# Save the combined JSON file
json_file_path = os.path.join(folder_path, 'wastewater_treatment_data.json')
with open(json_file_path, 'w') as f:
    json.dump(combined_data, f, default=str, indent=2)

print(f"Combined JSON data saved to: {json_file_path}")


Combined JSON data saved to: C:\Users\yomid\OneDrive - Case Western Reserve University\Desktop\KaggleX\data_trial\wastewater_treatment_data.json


In [15]:
# Save individual CSV files
pd.DataFrame(combined_data).to_csv(os.path.join(folder_path, 'wastewater_treatment_data.csv'), index=False)

print("CSV file saved in the specified folder.")

CSV file saved in the specified folder.


In [16]:
# Separate the data into different categories
'''
The several data categories are saved into individual CSV files before being merged together.
'''
equipment_data = []
work_orders_data = []
maintenance_logs_data = []
status_reports_data = []

for item in combined_data:
    equipment_data.append({
        'id': item['equipment_id'],
        'name': item['equipment_name'],
        'type': item['equipment_type'],
        'status': item['equipment_status']
    })
    work_orders_data.append({
        'id': item['work_order_id'],
        'equipment_id': item['equipment_id'],
        'timestamp': item['timestamp'],
        'maintenance_type': item['maintenance_type']
    })
    maintenance_logs_data.append({
        'id': item['maintenance_log_id'],
        'work_order_id': item['work_order_id'],
        'technician': item['technician'],
        'action_taken': item['action_taken'],
        'parts_used': item['parts_used'],
        'notes': item['notes']
    })
    status_reports_data.append({
        'id': item['status_report_id'],
        'equipment_id': item['equipment_id'],
        'timestamp': item['timestamp'],
        'performance_metric': item['performance_metric']
    })

# Save separate CSV files
pd.DataFrame(equipment_data).to_csv(os.path.join(folder_path, 'equipment.csv'), index=False)
pd.DataFrame(work_orders_data).to_csv(os.path.join(folder_path, 'work_orders.csv'), index=False)
pd.DataFrame(maintenance_logs_data).to_csv(os.path.join(folder_path, 'maintenance_logs.csv'), index=False)
pd.DataFrame(status_reports_data).to_csv(os.path.join(folder_path, 'status_reports.csv'), index=False)

print("CSV files saved in the specified folder.")

CSV files saved in the specified folder.


Fine_Tuned_Dataset

In [17]:
def create_fine_tuning_entry(maintenance_log):
    # Generate a question based on the maintenance action
    '''
    This funtion is used to create a fine-tuned entry for training the gemma llm model
    it follows the actual format that was used.
    '''
    instruction = f"What maintenance action was performed on the {maintenance_log['equipment_name']} on {maintenance_log['timestamp'].split()[0]}?"

    # Create a detailed context from the maintenance log
    context = (f"On {maintenance_log['timestamp']}, technician {maintenance_log['technician']} performed a "
               f"{maintenance_log['maintenance_type'].lower()} on {maintenance_log['equipment_name']}. "
               f"The action taken was to {maintenance_log['action_taken'].lower()} "
               f"Parts used included {maintenance_log['parts_used']}. "
               f"The notes state: '{maintenance_log['notes']}' "
               f"The equipment status was {maintenance_log['equipment_status']} with a performance metric of {maintenance_log['performance_metric']}.")

    # Generate a concise response
    response = f"A {maintenance_log['maintenance_type'].lower()} was performed on {maintenance_log['equipment_name']}, where {maintenance_log['action_taken'].lower()}"

    # Set the category
    category = "closed_qa"

    return {
        "instruction": instruction,
        "context": context,
        "response": response,
        "category": category
    }

In [None]:
# def create_fine_tuning_entry(maintenance_log):
#     '''
#     This function is used to create a fine-tuned entry for training the gemma llm model.
#     It follows the actual format that was used and includes various types of questions.
#     '''
#     question_type = random.choice(['action', 'next_maintenance', 'maintenance_type'])

#     if question_type == 'action':
#         instruction = f"What maintenance action was performed on the {maintenance_log['equipment_name']} on {maintenance_log['timestamp'].split()[0]}?"
#         response = f"A {maintenance_log['maintenance_type'].lower()} was performed on {maintenance_log['equipment_name']}, where {maintenance_log['action_taken'].lower()}"
#     elif question_type == 'next_maintenance':
#         instruction = f"When should the next maintenance be performed on the {maintenance_log['equipment_name']}?"
#         next_maintenance_date = (datetime.strptime(maintenance_log['timestamp'], "%Y-%m-%d %H:%M:%S") + 
#                                  timedelta(days=equipment_types[maintenance_log['equipment_type']]['maintenance_interval'])).strftime("%Y-%m-%d")
#         response = f"The next maintenance for {maintenance_log['equipment_name']} should be performed on or around {next_maintenance_date}, based on the standard maintenance interval for {maintenance_log['equipment_type']}."
#     else:  # maintenance_type
#         instruction = f"What kind of maintenance would be needed for {maintenance_log['equipment_name']} given its current status?"
#         if maintenance_log['equipment_status'] == 'Operational':
#             response = f"Given that the {maintenance_log['equipment_name']} is currently operational, it would likely need routine maintenance to ensure continued optimal performance."
#         elif maintenance_log['equipment_status'] == 'Needs Maintenance':
#             response = f"The {maintenance_log['equipment_name']} needs maintenance. A thorough inspection and necessary repairs or part replacements should be performed."
#         else:  # 'Under Repair'
#             response = f"The {maintenance_log['equipment_name']} is currently under repair. Once repairs are complete, a full inspection and test run should be conducted before returning it to operational status."

#     context = (f"On {maintenance_log['timestamp']}, technician {maintenance_log['technician']} performed a "
#                f"{maintenance_log['maintenance_type'].lower()} on {maintenance_log['equipment_name']}. "
#                f"The action taken was to {maintenance_log['action_taken'].lower()} "
#                f"Parts used included {maintenance_log['parts_used']}. "
#                f"The notes state: '{maintenance_log['notes']}' "
#                f"The equipment status was {maintenance_log['equipment_status']} with a performance metric of {maintenance_log['performance_metric']}. "
#                f"The standard maintenance interval for {maintenance_log['equipment_type']} is {equipment_types[maintenance_log['equipment_type']]['maintenance_interval']} days.")

#     category = "closed_qa"

#     return {
#         "instruction": instruction,
#         "context": context,
#         "response": response,
#         "category": category
#     }

In [20]:
def generate_fine_tuning_data(num_equipment, start_date, end_date, target_observations):
    '''
    This funtion is used to generate a fine-tuned dataset for training the gemma llm model
    it follows the actual format that was used.
    '''
    maintenance_logs, total_observations = generate_data(num_equipment, start_date, end_date, target_observations)

    # Convert to numpy array for shuffling
    maintenance_logs_array = np.array(maintenance_logs)

    # Shuffle the array
    np.random.shuffle(maintenance_logs_array)

    # Convert back to list
    shuffled_maintenance_logs = maintenance_logs_array.tolist()

    fine_tuning_data = [create_fine_tuning_entry(log) for log in shuffled_maintenance_logs]
    return fine_tuning_data, total_observations


# Usage
num_equipment = 1000
start_date = datetime(2012, 1, 1)
end_date = datetime(2023, 12, 31)
target_observations = 1000000

fine_tuning_data, total_observations = generate_fine_tuning_data(num_equipment, start_date, end_date, target_observations)

print(f"Total observations generated: {total_observations}")

# Save the fine-tuning data
with open(os.path.join(folder_path, 'fine_tuning_data.json'), 'w') as f:
    json.dump(fine_tuning_data, f, indent=2)

print(f"Fine-tuning data saved to: {os.path.join(folder_path, 'fine_tuning_data.json')}")

Total observations generated: 18768
Fine-tuning data saved to: C:\Users\yomid\OneDrive - Case Western Reserve University\Desktop\KaggleX\data_trial\fine_tuning_data.json


In [19]:
file_path = os.path.join(folder_path, 'fine_tuning_data.json')
with open(file_path, 'r') as f:
    data = json.load(f)

print(json.dumps(data[:5], indent=2))

[
  {
    "instruction": "What maintenance action was performed on the Anaerobic Digester-587249 on 1732-09-09?",
    "context": "On 1732-09-09 21:07:23, technician Steven Marshall performed a emergency repair on Anaerobic Digester-587249. The action taken was to repaired leaking gasket in high-pressure pump. Parts used included Mixing systems, Pumps, Pressure relief valves. The notes state: 'Leaking gasket in high-pressure pump repaired. No leaks detected.' The equipment status was Operational with a performance metric of 0.93.",
    "response": "A emergency repair was performed on Anaerobic Digester-587249, where repaired leaking gasket in high-pressure pump.",
    "category": "closed_qa"
  },
  {
    "instruction": "What maintenance action was performed on the Lift Station-195230 on 1835-11-19?",
    "context": "On 1835-11-19 05:45:58, technician Curtis Hernandez performed a emergency repair on Lift Station-195230. The action taken was to replaced failed solenoid valve in chemical f

In [22]:
# # Alternatively, if you want to print just the structure without all the data:
print("\nStructure of a single item:")
print(json.dumps({k: type(v).__name__ for k, v in data[0].items()}, indent=2))


Structure of a single item:
{
  "instruction": "str",
  "context": "str",
  "response": "str",
  "category": "str"
}


In [20]:
# def generate_fine_tuning_data(num_equipment, start_date, end_date, target_observations):
#     maintenance_logs, total_observations = generate_data(num_equipment, start_date, end_date, target_observations)
#     fine_tuning_data = [create_fine_tuning_entry(log) for log in maintenance_logs]
#     return fine_tuning_data, total_observations

# # Usage
# num_equipment = 1000
# start_date = datetime(1700, 1, 1)
# end_date = datetime(2023, 12, 31)
# target_observations = 1000000

# fine_tuning_data, total_observations = generate_fine_tuning_data(num_equipment, start_date, end_date, target_observations)

# print(f"Total observations generated: {total_observations}")

# # Save the fine-tuning data
# with open(os.path.join(folder_path, 'fine_tuning_data.json'), 'w') as f:
#     json.dump(fine_tuning_data, f, indent=2)

# print(f"Fine-tuning data saved to: {os.path.join(folder_path, 'fine_tuning_data.json')}")

In [21]:
# import json

# # After saving the JSON file
# json_file_path = os.path.join(folder_path, 'wastewater_treatment_data.json')

# # Read and print the first few lines of the JSON file
# with open(json_file_path, 'r') as f:
#     # Read the first 5 items from the JSON file
#     data = json.load(f)
#     print("First 5 items in the JSON file:")
#     print(json.dumps(data[:5], indent=2))

# # Alternatively, if you want to print just the structure without all the data:
# print("\nStructure of a single item:")
# print(json.dumps({k: type(v).__name__ for k, v in data[0].items()}, indent=2))