Import libraries

In [1]:
!pip install faker



In [1]:
import os
import json
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, time
import random
import re

fake = Faker()

In [2]:
# Define the folder path
folder_path = r"C:\Users\yomid\OneDrive - Case Western Reserve University\Desktop\KaggleX\Stormwater_Data_updated"

# Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

In [3]:
equipment_types = {
    'Flow Meters': {'lifespan': 15, 'maintenance_interval': 60, 'failure_rate': 0.005},
    'Inspection Cameras': {'lifespan': 10, 'maintenance_interval': 30, 'failure_rate': 0.008},
    'Drones': {'lifespan': 8, 'maintenance_interval': 20, 'failure_rate': 0.010},
    'Safety Harnesses': {'lifespan': 10, 'maintenance_interval': 24, 'failure_rate': 0.004},
    'Flashlights': {'lifespan': 5, 'maintenance_interval': 12, 'failure_rate': 0.003},
    'Manhole Covers': {'lifespan': 50, 'maintenance_interval': 120, 'failure_rate': 0.001},
    'Battery Backups': {'lifespan': 7, 'maintenance_interval': 24, 'failure_rate': 0.006},
    'Sediment Pumps': {'lifespan': 12, 'maintenance_interval': 36, 'failure_rate': 0.009},
    'Flow Control Valves': {'lifespan': 20, 'maintenance_interval': 48, 'failure_rate': 0.007},
    'Emergency Response Kits': {'lifespan': 5, 'maintenance_interval': 12, 'failure_rate': 0.002}
}

In [4]:
equipment_parts = {
    'Flow Meters': ['Sensor', 'Display unit', 'Calibration tools', 'Flow tube', 'Mounting brackets', 'Battery'],
    
    'Inspection Cameras': ['Camera head', 'Cable reel', 'Controller unit', 'Lighting system', 'Battery pack', 'Monitor'],
    
    'Drones': ['Propellers', 'Battery', 'Camera', 'GPS module', 'Remote controller', 'Sensors'],
    
    'Safety Harnesses': ['Straps', 'Buckle', 'D-ring', 'Padding', 'Connector'],
    
    'Flashlights': ['Bulb', 'Reflector', 'Lens', 'Body casing', 'Battery compartment'],
    
    'Manhole Covers': ['Lid', 'Frame', 'Lifting keyholes', 'Sealing gasket', 'Bolts', 'Safety grates'],
    
    'Battery Backups': ['Battery cells', 'Charger', 'Inverter', 'Battery management system', 'Housing'],
    
    'Sediment Pumps': ['Impeller', 'Motor', 'Discharge hose', 'Suction screen', 'Float switch'],
    
    'Flow Control Valves': ['Valve body', 'Actuator', 'Handle', 'Seals', 'Mounting flange'],
    
    'Emergency Response Kits': ['Flashlight', 'First aid supplies', 'Multi-tool', 'Warning signs', 'Emergency blankets']
}

In [5]:
monitoring_actions = [
    "Regularly monitoring inflow and outflow rates using flow meters to maintain system balance.",
    "Calibrating flow meters periodically to ensure precise water flow measurements.",
    "Scheduling emptying of retention basins to maintain capacity for stormwater runoff.",
    "Inspecting sensor data monthly to prevent inaccuracies during heavy rainfall events.",
    "Checking drainage systems after major storms to identify and clear blockages.",
    "Maintaining a performance log for all equipment to identify potential failures.",
    "Conducting quarterly tests of pump stations to confirm activation during high water levels.",
    "Verifying the functionality of battery backups to ensure operation during power outages.",
    "Inspecting joints and seals on pipes to detect wear and tear, preventing leakages.",
    "Conducting regular inspections of grates and screens to remove accumulated debris.",
    "Monitoring metal components for signs of corrosion, ensuring long-term reliability.",
    "Performing flow tests during dry periods to maintain consistent water movement.",
    "Carrying out annual audits of the stormwater system to assess performance and efficiency.",
    "Monitoring water quality in retention basins to detect and manage potential contaminants.",
    "Conducting functionality tests on alarm systems to ensure immediate flood alerts.",
    "Reviewing weekly data logs to identify unusual water level patterns for proactive measures.",
    "Checking sediment levels in detention ponds to schedule necessary removal.",
    "Ensuring all access points are clear and free from vegetation or obstructions.",
    "Monitoring the operation of control valves to manage water flow effectively.",
    "Inspecting safety barriers and warning signs around stormwater infrastructure for visibility."
]

monitoring_notes = [
    "Inflow and outflow rates are consistently monitored, ensuring system balance is maintained.",
    "Flow meters are successfully calibrated, achieving accurate water flow measurements.",
    "Retention basins are emptied as scheduled, maintaining optimal capacity for stormwater runoff.",
    "Monthly inspections of sensor data reveal no inaccuracies during heavy rainfall events.",
    "Post-storm checks of drainage systems identify and clear all blockages effectively.",
    "The performance log is maintained diligently, allowing for early detection of potential equipment failures.",
    "Quarterly tests confirm that pump stations activate properly during high water levels.",
    "Battery backups are verified to be fully operational, ensuring continuity during power outages.",
    "Inspections reveal no significant wear on joints and seals of pipes, preventing leakages.",
    "Regular inspections of grates and screens result in the removal of all accumulated debris.",
    "No signs of corrosion are detected on metal components, ensuring long-term reliability.",
    "Flow tests conducted during dry periods confirm consistent water movement.",
    "Annual audits indicate that the stormwater system is performing efficiently.",
    "Water quality in retention basins is monitored, revealing no significant contaminants.",
    "Functionality tests on alarm systems confirm that immediate flood alerts are operational.",
    "Weekly data log reviews identify no unusual patterns in water levels.",
    "Sediment levels in detention ponds are checked, and removal is scheduled for next week.",
    "All access points are clear and free from vegetation, ensuring easy accessibility.",
    "Operation of control valves is monitored, showing effective management of water flow.",
    "Safety barriers and warning signs are inspected, confirming visibility and readiness."
]

inspection_actions = [
    "Removing debris from multiple grates to restore normal water flow.",
    "Adjusting the flow meter calibration to correct the observed deviation.",
    "Inspecting all drainage channels to check for potential blockages.",
    "Securing and checking manhole covers for any signs of damage or tampering.",
    "Repairing the minor crack found on the concrete lining of the retention basin.",
    "Manually activating the pump station to test its response to simulated flooding.",
    "Inspecting safety harnesses and flashlights for readiness and functionality.",
    "Scheduling a cleanout operation for the sediment buildup in the detention pond.",
    "Testing the alarm system to ensure flood alerts are functioning correctly.",
    "Clearing vegetation around drainage outlets to prevent potential blockages.",
    "Checking metal components for corrosion to ensure structural integrity.",
    "Using inspection cameras to visualize internal pipe conditions for damage.",
    "Reinforcing eroded sections of the embankment to prevent further deterioration.",
    "Charging and testing the battery backup system for emergency power supply.",
    "Inspecting outfall structures to confirm they are free of obstructions.",
    "Cleaning minor debris from grate screens to maintain water flow.",
    "Verifying the contents of the emergency response kit and ensuring accessibility.",
    "Conducting a dry run of the flood response plan to check for leaks or issues.",
    "Opening all access points to verify ease of inspection.",
    "Testing the flow control valves for smooth operation and responsiveness."
]

inspection_notes = [
    "Debris is removed from multiple grates, restoring normal water flow.",
    "The flow meter calibration is adjusted to correct the observed deviation.",
    "All drainage channels are inspected, and no blockages are detected.",
    "Manhole covers are secured and checked for signs of damage or tampering.",
    "A minor crack is repaired on the concrete lining of the retention basin.",
    "The pump station responds correctly to the manual test activation.",
    "Safety harnesses and flashlights are found to be in good working condition.",
    "Sediment buildup is noted in the detention pond; a cleanout is scheduled.",
    "The alarm system is tested, and the alerts are received promptly.",
    "Vegetation around drainage outlets is cleared to prevent blockages.",
    "No signs of corrosion are detected on metal components during inspection.",
    "Inspection cameras provide clear visuals, indicating no internal pipe damage.",
    "Erosion is observed on a small section of the embankment and is reinforced.",
    "The battery backup system is fully charged and operational.",
    "Outfall structures are checked and found to be free of obstructions.",
    "Grate screens have minor debris removed, maintaining water flow.",
    "The emergency response kit is complete and easily accessible.",
    "No leaks are detected during the dry run of the flood response plan.",
    "All access points are clear and easy to open for inspection.",
    "The flow control valves are operating smoothly without any resistance."
]


In [6]:
def get_relevant_parts(stormwater_equipment, action, num_parts=3):

    all_parts = equipment_parts[stormwater_equipment]
    action_words = set(re.findall(r'\b\w+\b', action.lower()))

    # First, get parts that match the action
    relevant_parts = [part for part in all_parts if any(word in part.lower() for word in action_words)]

    # If we don't have enough relevant parts, add random parts from the equipment
    while len(relevant_parts) < num_parts and len(relevant_parts) < len(all_parts):
        random_part = random.choice(all_parts)
        if random_part not in relevant_parts:
            relevant_parts.append(random_part)

    return relevant_parts[:num_parts]

In [7]:
def create_action_note_pairs(actions, notes):
    pairs = []
    for action in actions:
        best_match = max(notes, key=lambda note: len(set(action.lower().split()) & set(note.lower().split())))
        pairs.append((action, best_match))
    return pairs


inspection_pairs = create_action_note_pairs(inspection_actions, inspection_notes)
monitoring_pairs = create_action_note_pairs(monitoring_actions, monitoring_notes)

In [8]:
def create_observation(equipment, maintenance_type, date):
    work_order_id = str(fake.uuid4())
    maintenance_log_id = str(fake.uuid4())
    status_report_id = str(fake.uuid4())

   
    if maintenance_type == "Monitoring":
        action, note = random.choice(monitoring_pairs)
    else:  # Inspection
        action, note = random.choice(inspection_pairs)

    relevant_parts = get_relevant_parts(equipment['type'], action)
    parts_used = ', '.join(relevant_parts) if relevant_parts else 'No specific parts'

    # Generate a random time
    random_time = time(random.randint(0, 23), random.randint(0, 59), random.randint(0, 59))

    # Combine date and random time
    timestamp = datetime.combine(date, random_time)

    return {
        'work_order_id': work_order_id,
        'equipment_id': equipment['id'],
        'maintenance_log_id': maintenance_log_id,
        'status_report_id': status_report_id,
        'timestamp': timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        'technician': fake.name(),
        'maintenance_type': maintenance_type,
        'action_taken': action,
        'parts_used': parts_used,
        'notes': note,
        'equipment_name': equipment['name'],
        'equipment_type': equipment['type'],
        'equipment_status': equipment['status'],
        'performance_metric': round(random.uniform(0.7, 1.0), 2),
        'installation_date': equipment['installation_date'].strftime("%Y-%m-%d")  # Add this line
    }

In [9]:
def generate_data(num_equipment, start_date, end_date, target_observations):
    combined_data = []
    total_observations = 0

    start_date = datetime.combine(start_date, datetime.min.time())
    end_date = datetime.combine(end_date, datetime.min.time())

    for _ in range(num_equipment):
        equip_type = random.choice(list(equipment_types.keys()))
        equipment = {
            'id': str(fake.uuid4()),
            'name': f"{equip_type}-{fake.random_int(min=1000, max=999999)}",
            'type': equip_type,
            'installation_date': fake.date_between(start_date='-100y', end_date='-1y'),
            'last_maintenance': fake.date_between(start_date='-1y', end_date='today'),
            'status': random.choices(['Operational', 'Needs Maintenance', 'Under Repair'], weights=[0.8, 0.15, 0.05])[0]
        }

        maintenance_date = datetime.combine(equipment['last_maintenance'], datetime.min.time())
        while maintenance_date <= end_date:
            if maintenance_date >= start_date:
                observation = create_observation(equipment, 'Routine Maintenance', maintenance_date)
                combined_data.append(observation)
                total_observations += 1

                if random.random() < 0.5:
                    inspection_observation = create_observation(equipment, 'Inspection', maintenance_date)
                    combined_data.append(inspection_observation)
                    total_observations += 1

            maintenance_date += timedelta(days=equipment_types[equipment['type']]['maintenance_interval'])

            
        # Monitoring
        current_date = start_date
        while current_date <= end_date:
            if random.random() < equipment_types[equipment['type']]['failure_rate']:
                monitoring_observation = create_observation(equipment, 'Monitoring', current_date)
                combined_data.append(monitoring_observation)
                total_observations += 1

            current_date += timedelta(days=1)

        if total_observations >= target_observations:
            break

    return combined_data, total_observations  # Make sure to return these values

In [10]:
# Main execution
num_equipment = 1000
start_date = datetime(2020, 1, 1)
end_date = datetime(2024, 5, 31)
target_observations = 1000000

In [11]:
print("Generating data...")
combined_data, total_observations = generate_data(num_equipment, start_date, end_date, target_observations)

print(f"Total observations generated: {total_observations}")

Generating data...
Total observations generated: 14691


In [12]:
# Save the combined JSON file
json_file_path = os.path.join(folder_path, 'stormwater_treatment_data.json')
with open(json_file_path, 'w') as f:
    json.dump(combined_data, f, default=str, indent=2)

print(f"Combined JSON data saved to: {json_file_path}")

Combined JSON data saved to: C:\Users\yomid\OneDrive - Case Western Reserve University\Desktop\KaggleX\Stormwater_Data_updated\stormwater_treatment_data.json


In [13]:
# Separate the data into different categories
'''
The several data categories are saved into individual CSV files before being merged together.
'''
equipment_data = []
work_orders_data = []
maintenance_logs_data = []
status_reports_data = []

for item in combined_data:
    equipment_data.append({
        'id': item['equipment_id'],
        'name': item['equipment_name'],
        'type': item['equipment_type'],
        'status': item['equipment_status']
    })
    work_orders_data.append({
        'id': item['work_order_id'],
        'equipment_id': item['equipment_id'],
        'timestamp': item['timestamp'],
        'maintenance_type': item['maintenance_type']
    })
    maintenance_logs_data.append({
        'id': item['maintenance_log_id'],
        'work_order_id': item['work_order_id'],
        'technician': item['technician'],
        'action_taken': item['action_taken'],
        'parts_used': item['parts_used'],
        'notes': item['notes']
    })
    status_reports_data.append({
        'id': item['status_report_id'],
        'equipment_id': item['equipment_id'],
        'timestamp': item['timestamp'],
        'performance_metric': item['performance_metric']
    })

# Save separate CSV files
pd.DataFrame(equipment_data).to_csv(os.path.join(folder_path, 'equipment.csv'), index=False)
pd.DataFrame(work_orders_data).to_csv(os.path.join(folder_path, 'work_orders.csv'), index=False)
pd.DataFrame(maintenance_logs_data).to_csv(os.path.join(folder_path, 'maintenance_logs.csv'), index=False)
pd.DataFrame(status_reports_data).to_csv(os.path.join(folder_path, 'status_reports.csv'), index=False)

print("CSV files saved in the specified folder.")

CSV files saved in the specified folder.


Fine_Tuned_Dataset

In [14]:
def create_fine_tuning_entry(maintenance_log):
    # Generate a question based on the maintenance action
    '''
    This funtion is used to create a fine-tuned entry for training the gemma llm model
    it follows the actual format that was used.
    '''
    instruction = f"What maintenance action was performed on the {maintenance_log['equipment_name']} on {maintenance_log['timestamp'].split()[0]}?"

    # Create a detailed context from the maintenance log
    context = (f"On {maintenance_log['timestamp']}, technician {maintenance_log['technician']} performed a "
               f"{maintenance_log['maintenance_type'].lower()} on {maintenance_log['equipment_name']}. "
               f"The action taken was to {maintenance_log['action_taken'].lower()} "
               f"Parts used included {maintenance_log['parts_used']}. "
               f"The notes state: '{maintenance_log['notes']}' "
               f"The equipment status was {maintenance_log['equipment_status']} with a performance metric of {maintenance_log['performance_metric']}.")

    # Generate a concise response
    response = f"A {maintenance_log['maintenance_type'].lower()} was performed on {maintenance_log['equipment_name']}, where {maintenance_log['action_taken'].lower()}"

    # Set the category
    category = "closed_qa"

    return {
        "instruction": instruction,
        "context": context,
        "response": response,
        "category": category
    }

In [15]:
# def create_fine_tuning_entry(maintenance_log):
#     '''
#     This function creates a fine-tuned entry for training the gemma llm model.
#     It incorporates equipment lifespan information for more realistic maintenance predictions.
#     '''
#     question_type = random.choice(['action', 'next_maintenance', 'maintenance_type', 'lifespan'])

#     equipment_type = maintenance_log['equipment_type']
#     equipment_name = maintenance_log['equipment_name']
#     last_service_date = datetime.strptime(maintenance_log['timestamp'].split()[0], "%Y-%m-%d")
#     installation_date = datetime.strptime(maintenance_log['installation_date'], "%Y-%m-%d")

#     lifespan = equipment_types[equipment_type]['lifespan']
#     maintenance_interval = equipment_types[equipment_type]['maintenance_interval']

#     if question_type == 'action':
#         instruction = f"What maintenance action was performed on the {equipment_name} on {last_service_date.strftime('%Y-%m-%d')}?"
#         response = f"A {maintenance_log['maintenance_type'].lower()} was performed on {equipment_name}, where {maintenance_log['action_taken'].lower()}"
#     elif question_type == "monitoring":
#         instruction = f"What monitoring data was recorded for the {equipment_name} on {last_monitoring_date.strftime('%Y-%m-%d')}?"
#         response = f"During the monitoring of {equipment_name}, the following was observed: {monitoring_log['observation'].lower()}. The recorded values were {monitoring_log['recorded_values']}."
#     elif question_type == 'outlet_control':
#         instruction = f"Based on the predicted rainfall, at what rate should the outlet for {equipment_name} be opened to prevent flooding?"
#         if predicted_rainfall < 50:  # Light rain
#             response = f"With {predicted_rainfall} mm of predicted rain, the outlet for {equipment_name} should be opened at a rate of 0.5 to 2 m³/s to allow slower drainage and prevent flooding."
#         elif 50 <= predicted_rainfall < 150:  # Moderate rain
#             response = f"With {predicted_rainfall} mm of predicted rain, the outlet for {equipment_name} should be opened at a rate of 5 to 10 m³/s to manage the water flow effectively and prevent flooding."
#         elif predicted_rainfall >= 150:  # Heavy rain
#             response = f"With {predicted_rainfall} mm of predicted rain, the outlet for {equipment_name} should be fully opened at a rate of 15 m³/s to allow maximum water flow and prevent flooding."
#     elif question_type == 'next_maintenance':
#         next_maintenance_date = last_service_date + timedelta(days=maintenance_interval)
#         instruction = f"When is the next maintenance due for {equipment_name} based on its last service date?"
#         response = f"Based on the last service date of {last_service_date.strftime('%Y-%m-%d')} and the standard maintenance interval of {maintenance_interval} days for {equipment_type}, the next maintenance for {equipment_name} is due on {next_maintenance_date.strftime('%Y-%m-%d')}."
#     elif question_type == 'maintenance_type':
#         instruction = f"What kind of maintenance would be needed for {equipment_name} given its current status?"
#         if maintenance_log['equipment_status'] == 'Operational':
#             response = f"Given that the {equipment_name} is currently operational, it would likely need routine maintenance to ensure continued optimal performance."
#         elif maintenance_log['equipment_status'] == 'Needs Maintenance':
#             response = f"The {equipment_name} needs maintenance. A thorough inspection and necessary repairs or part replacements should be performed."
#         else:  # 'Under Repair'
#             response = f"The {equipment_name} is currently under repair. Once repairs are complete, a full inspection and test run should be conducted before returning it to operational status."
#     else:  # lifespan
#         age = (last_service_date - installation_date).days / 365.25  # age in years
#         remaining_life = lifespan - age
#         instruction = f"Based on the lifespan of {equipment_type}, how much longer can {equipment_name} be expected to operate?"
#         if remaining_life > 0:
#             response = f"The {equipment_name} was installed on {installation_date.strftime('%Y-%m-%d')} and has been in operation for approximately {age:.1f} years. Given the typical lifespan of {lifespan} years for {equipment_type}, it can be expected to operate for about {remaining_life:.1f} more years, assuming proper maintenance. However, its performance should be closely monitored as it approaches the end of its expected lifespan."
#         else:
#             response = f"The {equipment_name} was installed on {installation_date.strftime('%Y-%m-%d')} and has been in operation for approximately {age:.1f} years, which exceeds the typical lifespan of {lifespan} years for {equipment_type}. It is recommended to plan for replacement soon, as the equipment may be operating beyond its designed lifespan and could be at higher risk of failure."

#     context = (f"On {maintenance_log['timestamp']}, technician {maintenance_log['technician']} performed a "
#                f"{maintenance_log['maintenance_type'].lower()} on {equipment_name}. "
#                f"The action taken was to {maintenance_log['action_taken'].lower()} "
#                f"Parts used included {maintenance_log['parts_used']}. "
#                f"The notes state: '{maintenance_log['notes']}' "
#                f"The equipment status was {maintenance_log['equipment_status']} with a performance metric of {maintenance_log['performance_metric']}. "
#                f"The standard maintenance interval for {equipment_type} is {maintenance_interval} days, "
#                f"and its typical lifespan is {lifespan} years. "
#                f"This {equipment_type} was installed on {installation_date.strftime('%Y-%m-%d')}.")

#     category = "closed_qa"

#     return {
#         "instruction": instruction,
#         "context": context,
#         "response": response,
#         "category": category
#     }

In [28]:
import random
from datetime import datetime, timedelta

def create_fine_tuning_entry(maintenance_log):
    '''
    This function creates a fine-tuned entry for training the Gemma LLM model.
    It incorporates equipment lifespan information for more realistic maintenance predictions.
    '''
    question_type = random.choice(['action', 'next_maintenance', 'maintenance_type', 'lifespan'])
    
    equipment_type = maintenance_log['equipment_type']
    equipment_name = maintenance_log['equipment_name']
    last_service_date = datetime.strptime(maintenance_log['timestamp'].split()[0], "%Y-%m-%d")
    installation_date = datetime.strptime(maintenance_log['installation_date'], "%Y-%m-%d")

    lifespan = equipment_types[equipment_type]['lifespan']
    maintenance_interval = equipment_types[equipment_type]['maintenance_interval']

    if question_type == 'action':
        instruction = f"What maintenance action was performed on the {equipment_name} on {last_service_date.strftime('%Y-%m-%d')}?"
        response = f"On {last_service_date.strftime('%Y-%m-%d')}, a {maintenance_log['maintenance_type'].lower()} was performed on the {equipment_name}, where {maintenance_log['action_taken'].lower()}."
    
    elif question_type == 'next_maintenance':
        next_maintenance_date = last_service_date + timedelta(days=maintenance_interval)
        instruction = f"When is the next maintenance due for {equipment_name}?"
        response = f"The next maintenance for {equipment_name} is due on {next_maintenance_date.strftime('%Y-%m-%d')} based on the last service date of {last_service_date.strftime('%Y-%m-%d')} and a maintenance interval of {maintenance_interval} days."

    elif question_type == 'maintenance_type':
        instruction = f"What kind of maintenance is needed for {equipment_name}?"
        if maintenance_log['equipment_status'] == 'Operational':
            response = f"The {equipment_name} is currently operational. Routine maintenance is recommended to ensure optimal performance."
        elif maintenance_log['equipment_status'] == 'Needs Maintenance':
            response = f"The {equipment_name} needs maintenance. A thorough inspection and necessary repairs should be performed."
        else:  # 'Under Repair'
            response = f"The {equipment_name} is under repair. Once repairs are complete, a full inspection should be conducted."

    elif question_type == 'lifespan':
        age = (last_service_date - installation_date).days / 365.25  # age in years
        remaining_life = lifespan - age
        instruction = f"How much longer can {equipment_name} be expected to operate based on its lifespan?"
        if remaining_life > 0:
            response = f"The {equipment_name} has been in operation for about {age:.1f} years. It can be expected to operate for another {remaining_life:.1f} years, assuming proper maintenance."
        else:
            response = f"The {equipment_name} has exceeded its typical lifespan of {lifespan} years. It's recommended to plan for replacement soon."

    # Generating context and conversation flow
    context = (f"Hi there! I see that maintenance was performed on the {equipment_name}. "
               f"What can I assist you with regarding its status today?\n"
               f"Hi! I’m checking on the status of the {equipment_name}. Is everything functioning properly?\n"
               f"{response}\n"
               f"Thanks! When is the next maintenance due?\n"
               f"{response}\n"
               f"What type of maintenance do we need for it?\n"
               f"{response}\n"
               f"How much longer can we expect it to operate?\n"
               f"{response}\n"
               f"If you need anything else, just ask! Have a great day!")

    category = "closed_qa"

    return {
        "instruction": instruction,
        "context": context,
        "response": response,
        "category": category
    }


In [29]:
def generate_fine_tuning_data(num_equipment, start_date, end_date, target_observations):
    '''
    This function generates fine-tuning data for the model.
    It follows the actual format that was used.
    '''
    maintenance_logs, total_observations = generate_data(num_equipment, start_date, end_date, target_observations)

    # Convert to numpy array for shuffling
    maintenance_logs_array = np.array(maintenance_logs)

    # Shuffle the array
    np.random.shuffle(maintenance_logs_array)

    # Convert back to list
    shuffled_maintenance_logs = maintenance_logs_array.tolist()

    fine_tuning_data = [create_fine_tuning_entry(log) for log in shuffled_maintenance_logs]
    return fine_tuning_data, total_observations

In [30]:
# Usage
num_equipment = 1000
start_date = datetime(1700, 1, 1)
end_date = datetime(2023, 12, 31)
target_observations = 1000000

fine_tuning_data, total_observations = generate_fine_tuning_data(num_equipment, start_date, end_date, target_observations)

print(f"Total observations generated: {total_observations}")

# Save the fine-tuning data
with open(os.path.join(folder_path, 'fine_tuning_data.json'), 'w') as f:
    json.dump(fine_tuning_data, f, indent=2)
    


print(f"Fine-tuning data saved to: {os.path.join(folder_path, 'fine_tuning_data.json')}")

Total observations generated: 645157
Fine-tuning data saved to: C:\Users\yomid\OneDrive - Case Western Reserve University\Desktop\KaggleX\Stormwater_Data_updated\fine_tuning_data.json


In [24]:
import json
import csv
import os

# Writing to JSON
with open(os.path.join(folder_path, 'fine_tuning_data.json'), 'w') as f:
    json.dump(fine_tuning_data, f, indent=2)

# Writing to CSV (assuming fine_tuning_data is a list of dictionaries)
with open(os.path.join(folder_path, 'fine_tuning_data.csv'), 'w', newline='') as f:
    if fine_tuning_data:
        writer = csv.DictWriter(f, fieldnames=fine_tuning_data[0].keys())
        writer.writeheader()
        writer.writerows(fine_tuning_data)

In [25]:
file_path = os.path.join(folder_path, 'fine_tuning_data.json')
with open(file_path, 'r') as f:
    data = json.load(f)

print(json.dumps(data[:5], indent=2))

[
  {
    "instruction": "What maintenance action was performed on the Flashlights-724675 on 1987-05-23?",
    "context": "Chatbot: Hi there! I see that maintenance was performed on the Flashlights-724675. What can I assist you with regarding its status today?\nTechnician: Hi! I\u2019m checking on the status of the Flashlights-724675. Is everything functioning properly?\nChatbot: On 1987-05-23, a monitoring was performed on the Flashlights-724675, where calibrating flow meters periodically to ensure precise water flow measurements..\nTechnician: Thanks! When is the next maintenance due?\nChatbot: On 1987-05-23, a monitoring was performed on the Flashlights-724675, where calibrating flow meters periodically to ensure precise water flow measurements..\nTechnician: What type of maintenance do we need for it?\nChatbot: On 1987-05-23, a monitoring was performed on the Flashlights-724675, where calibrating flow meters periodically to ensure precise water flow measurements..\nTechnician: How 

In [26]:
# # Alternatively, if you want to print just the structure without all the data:
print("\nStructure of a single item:")
print(json.dumps({k: type(v).__name__ for k, v in data[0].items()}, indent=2))


Structure of a single item:
{
  "instruction": "str",
  "context": "str",
  "response": "str",
  "category": "str"
}


In [None]:
pwd()