# In [1]:
import pandas as pd

# Load the raw records; the node tables below are all built from this frame.
df = pd.read_csv("Project2_Dataset_Corrected.csv")  # Update the path if needed

# Crash Node: distinct crash attribute rows, relabelled with snake_case headers.
crash_column_names = {
    'ID': 'crash_id',
    'Crash ID': 'crash_code',
    'Crash Type': 'crash_type',
    'Month': 'month',
    'Year': 'year',
    'Number Fatalities': 'num_fatalities',
}
crash_df = df[list(crash_column_names)].drop_duplicates()
crash_df = crash_df.rename(columns=crash_column_names)
crash_df.to_csv("nodes_crash.csv", index=False)

# Date Node: one row per distinct combination of the day/holiday columns,
# identified as d1, d2, ... in order of first appearance.
date_columns = ['Dayweek', 'Day of week', 'Christmas Period', 'Easter Period']
date_df = df[date_columns].drop_duplicates().reset_index(drop=True)
date_df['date_id'] = [f"d{position}" for position in range(1, len(date_df) + 1)]
date_df.to_csv("nodes_date.csv", index=False)

# Time Node: one row per distinct (Time, Time of day) pair, identified as
# t1, t2, ... in order of first appearance.
time_columns = ['Time', 'Time of day']
time_df = df[time_columns].drop_duplicates().reset_index(drop=True)
time_df['time_id'] = [f"t{position}" for position in range(1, len(time_df) + 1)]
time_df.to_csv("nodes_time.csv", index=False)

# Derived Vehicle Node
def determine_vehicle_type(row):
    """Classify a record by which heavy-vehicle involvement flags equal 'Yes'.

    Args:
        row: Record indexable by 'Bus Involvement',
            'Heavy Rigid Truck Involvement' and
            'Articulated Truck Involvement' (e.g. a DataFrame row).

    Returns:
        One of seven descriptive label strings. Any flag value other than
        the exact string 'Yes' is treated as "not involved".

    Note:
        When all three flags are 'Yes', the bus/articulated check takes
        precedence, so the label is "bus and articulated truck involved";
        there is no dedicated three-vehicle label.
    """
    has_bus = row['Bus Involvement'] == 'Yes'
    has_rigid = row['Heavy Rigid Truck Involvement'] == 'Yes'
    has_articulated = row['Articulated Truck Involvement'] == 'Yes'

    if has_bus:
        # Articulated is checked before heavy rigid, matching the label priority.
        if has_articulated:
            return "bus and articulated truck involved"
        if has_rigid:
            return "bus and heavy rigid truck involved"
        return "only bus involved"

    if has_rigid:
        if has_articulated:
            return "articulated and heavy rigid truck involved"
        return "only heavy rigid involved"

    if has_articulated:
        return "only articulated truck involved"

    return "no heavy vehicles involved"

# Tag every record with its heavy-vehicle category (computed row by row).
df['vehicle_type'] = df.apply(determine_vehicle_type, axis=1)


def _export_nodes(columns, id_column, prefix, csv_path):
    """Write the distinct combinations of *columns* in ``df`` to *csv_path*.

    Each distinct combination receives an identifier made of *prefix* plus
    its 1-based position (in order of first appearance), stored in
    *id_column*. Returns the resulting node frame.
    """
    nodes = df[columns].drop_duplicates().reset_index(drop=True)
    nodes[id_column] = [f"{prefix}{position}" for position in range(1, len(nodes) + 1)]
    nodes.to_csv(csv_path, index=False)
    return nodes


vehicle_df = _export_nodes(['vehicle_type'], 'vehicle_id', 'v', "nodes_vehicle.csv")

# Person Node
person_df = _export_nodes(
    ['Road User', 'Gender', 'Age', 'Age Group'],
    'person_id', 'p', "nodes_person.csv",
)

# Location Node
location_df = _export_nodes(
    ['State', 'National LGA Name 2024', 'SA4 Name 2021',
     'National Remoteness Areas', 'National Road Type'],
    'location_id', 'l', "nodes_location.csv",
)

# Speed Zone Node
speed_df = _export_nodes(['Speed Limit'], 'speed_id', 's', "nodes_speed.csv")

# Relationships
def _export_crash_links(node_df, keys, id_column, csv_path):
    """Join ``df`` to *node_df* on *keys* and write the crash-to-node links.

    The written file has two columns: 'crash_id' (taken from df's 'ID'
    column) and *id_column*. Returns ``(merged, links)``: the full merged
    frame and the two-column link frame that was written.
    """
    merged = df.merge(node_df, on=keys)
    links = merged[['ID', id_column]].rename(columns={'ID': 'crash_id'})
    links.to_csv(csv_path, index=False)
    return merged, links


rel_date, rel_date_df = _export_crash_links(
    date_df,
    ['Dayweek', 'Day of week', 'Christmas Period', 'Easter Period'],
    'date_id', "rel_happened_on.csv",
)

rel_time, rel_time_df = _export_crash_links(
    time_df, ['Time', 'Time of day'], 'time_id', "rel_occurred_at.csv",
)

rel_vehicle, rel_vehicle_df = _export_crash_links(
    vehicle_df, 'vehicle_type', 'vehicle_id', "rel_vehicle_used.csv",
)

rel_person, rel_person_df = _export_crash_links(
    person_df,
    ['Road User', 'Gender', 'Age', 'Age Group'],
    'person_id', "rel_involved.csv",
)

rel_location, rel_location_df = _export_crash_links(
    location_df,
    ['State', 'National LGA Name 2024', 'SA4 Name 2021',
     'National Remoteness Areas', 'National Road Type'],
    'location_id', "rel_located_in.csv",
)

rel_speed, rel_speed_df = _export_crash_links(
    speed_df, 'Speed Limit', 'speed_id', "rel_speed_zone.csv",
)
