In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
import scipy.stats as stats
from datetime import timedelta, datetime
import os
import seaborn as sns

In [2]:
DATA_PATH = '/Users/bugragorkem/Desktop/Uni/5. Semester /Data Science Projekt/archive/p_data/p_einzelnt/'
MONTHS = ["p01", "p02", "p03", "p04", "p05", "p06", "p07", "p08", "p09", "p10", "p11", "p12"]

In [3]:
def categorize_incident(description):
    if description in ["1144-Fatality", "YELLOW-Hit and Run Fatality", "1180-Trfc Collision-Major Inj"]:
        return "tote"
    elif description in ["1182-Trfc Collision-No Inj", "20002-Hit and Run No Injuries"]:
        return "keine_verletzte"
    elif description in ["20001-Hit and Run w/Injuries", "1181-Trfc Collision-Minor Inj"]:
        return "verletzte"
    else:
        return "unbekannt"

In [4]:
def find_closest_sensor(incidence_df, sensor_df, max_distance=0.3): 
    # Ensure the ABS_PM columns are sorted for faster searching
    sensor_df = sensor_df.sort_values(by=['Fwy', 'Abs PM'])

    closest_sensors = []

    for _, incident in incidence_df.iterrows():
        fwy = incident['Fwy']
        abs_pm = incident['Abs PM']

        # Filter sensors for the same freeway
        matching_sensors = sensor_df[(sensor_df['Fwy'] == fwy) & (sensor_df['Type'] == 'HOV' )]

        # Find the closest preceding sensor within the max distance
        preceding_sensors = matching_sensors[
            (matching_sensors['Abs PM'] <= abs_pm) & 
            (abs_pm - matching_sensors['Abs PM'] <= max_distance)
        ]

        if not preceding_sensors.empty:
            closest_sensor = preceding_sensors.iloc[-1]  # Last one is the closest preceding sensor
            closest_sensors.append({
                'Incident_ABS_PM': abs_pm,
                'Incident_Fwy': fwy,
                "station_id": closest_sensor['station_id'], 
                'Sensor_ABS_PM': closest_sensor['Abs PM'],
                'Sensor_Fwy': closest_sensor['Fwy'],
                'time': incident['dt'],
                'Distance': abs_pm - closest_sensor['Abs PM']
            })


    return pd.DataFrame(closest_sensors)


In [5]:
def cliffs_delta(x, y):
    """
    Berechnet Cliff's Delta als Effektstärkemaß.
    
    Parameter:
    x, y : array-like
        Die beiden Samples, zwischen denen der Effekt gemessen werden soll.
    
    Rückgabe:
    delta : float
        Der berechnete Cliff's Delta Wert.
    """
    x = np.array(x)
    y = np.array(y)
    n1 = len(x)
    n2 = len(y)
    # Vergleiche alle Paare (x, y)
    wins = np.sum(np.greater.outer(x, y))
    losses = np.sum(np.less.outer(x, y))
    delta = (wins - losses) / (n1 * n2)
    return delta

In [6]:
def datetime_to_timeslot(dt, month_start):
    """Konvertiert datetime zu Timeslot-Index relativ zum Monatsanfang"""
    delta = dt - month_start
    return int(delta.total_seconds() // 300)

In [7]:
def map_sensors_to_indices(sensor_df, monthly_station_ids):
    # Sicherstellen, dass der Datentyp von `station_id` mit `monthly_station_ids` übereinstimmt
    sensor_df['station_id'] = sensor_df['station_id'].astype(str)
    monthly_station_ids = [str(station_id) for station_id in monthly_station_ids]
    
    #Erstelle ein Mapping von `station_id` zu den entsprechenden Indizes
    sensor_to_index = {station_id: i for i, station_id in enumerate(monthly_station_ids)}
    
    # Füge eine neue Spalte mit den Indizes hinzu
    sensor_df['Index 12'] = sensor_df['station_id'].map(sensor_to_index)
    return sensor_df

In [8]:
def extract_traffic_data(npy_data, sensor_indices, time_window=1):
    extracted_data = {}
    for incident_time, sensor_index in sensor_indices:
        start_time = max(0, incident_time - time_window)  # Zeit vor dem Unfall
        end_time = incident_time + time_window            # Zeit nach dem Unfall

        # Extrahiere Daten für die jeweilige Zeitspanne
        extracted_data[(incident_time, sensor_index)] = npy_data[sensor_index, start_time:end_time]
    return extracted_data


In [9]:
def load_month_data(month):
    """Lädt Daten für einen bestimmten Monat"""
    traffic_data = np.load(os.path.join(DATA_PATH, f'{month}_merged.npy'))
    station_ids = np.load(os.path.join(DATA_PATH, f'{month}_node_order.npy'))
    return traffic_data, station_ids.astype(str)

In [12]:
# Lese Unfalldaten ein
incidents = pd.read_csv('/Users/bugragorkem/Desktop/Uni/5. Semester /Data Science Projekt/archive/incidents.csv', 
                        sep="\t", parse_dates=['dt'])
incidents['month'] = incidents['dt'].dt.to_period('M')

# Lese Sensormetadaten ein
sensor_meta = pd.read_csv('/Users/bugragorkem/Desktop/Uni/5. Semester /Data Science Projekt/archive/sensor_meta_feature.csv', 
                          sep="\t")


# Filter: Nur Unfälle und solche mit Duration >= 150 Minuten
incidents = incidents[incidents['type'] == "accident"]
incidents = incidents[incidents['Duration (mins)'] >= 15]

FileNotFoundError: [Errno 2] No such file or directory: '/Users/bugragorkem/Desktop/Uni/5. Semester /Data Science Projekt/archive/incidents.csv'

In [None]:
all_results = {}
# Gruppierung der Unfälle nach Injury-Status
for x in month:
    print(f"\n--- Analyse für Injury Status: {status} ---")
    all_data_status = []
    
    # Gruppierung nach Monat innerhalb des jeweiligen Injury-Status
    for month, month_incidents in status_incidents.groupby('month'):
        month_str = f"p{month.month:02d}"
        if month_str not in MONTHS:
            continue
        
        # Versuche, die Verkehrsdaten für den aktuellen Monat zu laden
        try:
            traffic_data = np.load(os.path.join(DATA_PATH, f'{month_str}_car.npy'))
            station_ids = np.load(os.path.join(DATA_PATH, f'{month_str}_car_node.npy')).astype(str)
            station_ids = np.delete(station_ids, 0)
        except FileNotFoundError:
            print(f"Dateien für {month_str} nicht gefunden")
            continue

        # Finde die zugehörigen Sensoren für die Unfälle im aktuellen Monat
        closest_sensors = find_closest_sensor(month_incidents, sensor_meta)
        
        # Prüfung: Falls keine Sensoren gefunden wurden, diesen Monat überspringen
        if closest_sensors.empty:
            print(f"Keine Sensoren gefunden für Monat {month_str} und Injury Status {status}")
            continue
        
        station_to_index = {sid: i for i, sid in enumerate(station_ids)}
        closest_sensors['sensor_index'] = closest_sensors['station_id'].astype(str).map(station_to_index)
        valid_sensors = closest_sensors.dropna(subset=['sensor_index']).copy()
        
        # Zeitkonvertierung: 5 Minuten vorverschieben und Timeslot relativ zum Monatsanfang berechnen
        month_start = datetime(month.year, month.month, 1)
        valid_sensors['time'] = pd.to_datetime(valid_sensors['time'], errors='coerce')
        valid_sensors['time'] = valid_sensors['time'] - pd.Timedelta(minutes=5)
        valid_sensors['timeslot'] = valid_sensors['time'].apply(lambda x: datetime_to_timeslot(x, month_start))
        
        time_window = 3  # Anzahl der 5-Minuten-Intervalle vor und nach dem Vorfall
        for _, row in valid_sensors.iterrows():
            idx = int(row['sensor_index'])
            ts = row['timeslot']
            if ts < 0 or ts >= traffic_data.shape[0]:
                continue

            start = max(0, ts - time_window)
            end = min(traffic_data.shape[0], ts + time_window + 1)  # +1, um inklusiv zu schneiden
            if (end - start) < (2 * time_window):
                continue

            traffic_slice = traffic_data[start:end, idx]
            if np.isnan(traffic_slice).any():
                continue

            all_data_status.append({
                'pre': traffic_slice[:time_window],
                'post': traffic_slice[time_window+1:],  # Zeitpunkt des Vorfalls überspringen
                'feature_names': ['Traffic Volume', 'Occupancy Rate', 'Speed']
            })
            print(f"{month_str} für Injury Status {status} verarbeitet")
            
    if len(all_data_status) == 0:
        print(f"Keine gültigen Daten für Injury Status {status}")
        continue

    # Aggregiere die Pre- und Post-Daten
    pre_data = []
    post_data = []
    for entry in all_data_status:
        pre_data.extend(entry['pre'][:])
        post_data.extend(entry['post'][:])
    
    # Optional: Entferne die letzten drei Elemente aus pre_data (wie im Originalcode)
    if len(pre_data) > 1:
        pre_data.pop()
        pre_data.pop()
        pre_data.pop()
    
    min_length = min(len(pre_data), len(post_data))
    if min_length == 0:
        print(f"Nicht genügend Daten für Injury Status {status}")
        continue

    # Wilcoxon-Test (alternative='greater')
    stat, p_value = stats.wilcoxon(pre_data[:min_length], post_data[:min_length], alternative='greater')
    # Berechnung von Cliff's Delta
    delta = cliffs_delta(pre_data[:min_length], post_data[:min_length])
    
    all_results[status] = {
        'statistic': stat,
        'p_value': p_value,
        'median_pre': np.median(pre_data[:min_length]),
        'median_post': np.median(post_data[:min_length]),
        'cliffs_delta': delta
    }
    
    # Konvertiere zu numpy-Arrays und kappe Extremwerte (z. B. 99%-Quantil)
    pre_data = np.array(pre_data[:min_length])
    post_data = np.array(post_data[:min_length])
    pre_data = np.clip(pre_data, 0, np.percentile(pre_data, 99))
    post_data = np.clip(post_data, 0, np.percentile(post_data, 99))
    
    # --- Boxplot Visualization ---
    plt.figure(figsize=(8, 6))
    plt.ylim(0, max(pre_data.max(), post_data.max()) * 1.1)
    sns.boxplot(data=[pre_data, post_data], palette=["#4c72b0", "#dd8452"])
    plt.xticks([0, 1], ['Vor Unfall', 'Nach Unfall'])
    plt.title("")
    plt.ylabel('Verkehrsaufkommen')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()
    
    # --- Violinplot Visualization ---
    data_plot = pd.DataFrame({
        'Traffic Volume': np.concatenate([pre_data, post_data]),
        'Period': ['Vor Unfall'] * len(pre_data) + ['Nach Unfall'] * len(post_data)
    })
    plt.figure(figsize=(10, 6))
    sns.violinplot(x='Period', y='Traffic Volume', data=data_plot, palette=["#4c72b0", "#dd8452"])
    plt.title(f'')
    plt.xlabel('')
    plt.ylabel('Verkehrsaufkommen')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()
    
    
    n_pre = len(pre_data[:min_length])
    n_post = len(post_data[:min_length])
    print(f"Anzahl der Datenpunkte für {status}: Pre = {n_pre}, Post = {n_post}")

print("Analyse Ergebnisse:")
print(all_results)

SyntaxError: invalid syntax (4116078327.py, line 3)

In [None]:
results = {}

for feature_idx in range(1):  # Für alle 3 Features, hier nur Feature 0 (Traffic Volume)
    pre_data = []
    post_data = []

    for entry in all_data:
        pre_data.extend(entry['pre'][:])
        post_data.extend(entry['post'][:])

    # Entferne das letzte Element von pre_data, falls es nicht leer ist
    if len(pre_data) > 1:
        pre_data.pop()
        pre_data.pop()
        pre_data.pop()

    print(f"Feature {feature_idx}: pre_data Länge = {len(pre_data)}, post_data Länge = {len(post_data)}")
    print(f"pre_data: {pre_data[:10]}")
    print(f"post_data: {post_data[:10]}")
    print(f"Einzigartige Werte in pre_data: {len(set(pre_data))}")
    print(f"Einzigartige Werte in post_data: {len(set(post_data))}")

    # Sicherstellen, dass beide Listen gleich lang sind
    min_length = min(len(pre_data), len(post_data))

    if min_length > 0:  # Verhindere Fehler durch leere Listen
        stat, p_value = stats.wilcoxon(pre_data[:min_length], post_data[:min_length], alternative='greater')

        feature_name = entry['feature_names'][feature_idx]  # Feature-Namen korrekt referenzieren
        results[feature_name] = {
            'statistic': stat,
            'p_value': p_value,
            'median_pre': np.median(pre_data[:min_length]),
            'median_post': np.median(post_data[:min_length])
        }

        # Konvertiere die Daten in numpy-Arrays
        pre_data = np.array(pre_data[:min_length])
        post_data = np.array(post_data[:min_length])

        # Optional: Extremwerte kappen (z.B. auf das 99%-Quantil)
        pre_data = np.clip(pre_data, 0, np.percentile(pre_data, 99))
        post_data = np.clip(post_data, 0, np.percentile(post_data, 99))

        # --- Boxplot Visualization ---
        plt.figure(figsize=(8, 6))
        plt.ylim(0, 750)  # oder ein anderes sinnvolles Limit
        sns.boxplot(data=[pre_data, post_data], palette=["#4c72b0", "#dd8452"])
        plt.xticks([0, 1], ['Pre-Incident', 'Post-Incident'])
        plt.title(f"Traffic Volume\np-value: {p_value:.4f}")
        plt.ylabel('Traffic Volume (Normalized)')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.show()

        # --- Violinplot Visualization ---
        # Daten in einen DataFrame packen
        data = pd.DataFrame({
            'Traffic Volume': np.concatenate([pre_data, post_data]),
            'Period': ['Pre-Incident'] * len(pre_data) + ['Post-Incident'] * len(post_data)
        })

        plt.figure(figsize=(10, 6))
        sns.violinplot(x='Period', y='Traffic Volume', data=data, palette=["#4c72b0", "#dd8452"])
        plt.title('Verteilung des Traffic Volume vor und nach dem Unfall')
        plt.xlabel('')
        plt.ylabel('Traffic Volume (Normalized)')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.show()

plt.tight_layout()
plt.plot

# Ausgabe der statistischen Ergebnisse
print("Statistische Ergebnisse:")
for feature, res in results.items():
    print(f"\n{feature}:")
    print(f"  Wilcoxon-Statistik: {res['statistic']}")
    print(f"  P-Wert: {res['p_value']:.4f}")
    print(f"  Median vorher: {res['median_pre']:.2f}")
    print(f"  Median nachher: {res['median_post']:.2f}")
    print(f"  Signifikant (p < 0.05): {'Ja' if res['p_value'] < 0.05 else 'Nein'}")

print("Statistische Ergebnisse:")
for feature, res in results.items():
    print(f"\n{feature}:")
    print(f"  Wilcoxon-Statistik: {res['statistic']}")
    print(f"  P-Wert: {res['p_value']:.4f}")
    print(f"  Median vorher: {res['median_pre']:.2f}")
    print(f"  Median nachher: {res['median_post']:.2f}")
    print(f"  Signifikant (p < 0.05): {'Ja' if res['p_value'] < 0.05 else 'Nein'}")