In [5]:
import pandas as pd
import re

# Load the raw tab-separated sensor log. Lines the parser considers malformed
# are skipped instead of aborting the load.
file_path = "data"
df = pd.read_csv(
    file_path,
    sep="\t",
    header=None,
    names=["timestamp", "sensor_id", "status", "activity"],
    engine="python",
    on_bad_lines="skip",
)

# Timestamp: values that cannot be parsed become NaT and those rows are dropped.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=["timestamp"])

# Remove exact duplicate rows.
df = df.drop_duplicates()

# Sensor id: flag ids made of "M" or "D" followed by exactly three digits.
# re.match() raises TypeError on non-string input, so a missing sensor_id
# (e.g. a row with too few fields) is flagged invalid instead of crashing.
sensor_pattern = r"^[MD]\d{3}$"
df["sensor_valid"] = df["sensor_id"].apply(
    lambda x: isinstance(x, str) and re.match(sensor_pattern, x) is not None
)

# Status
def validate_status(row):
    """Return True when the row's status is legal for its sensor type.

    Sensors whose id starts with "D" must report "OPEN" or "CLOSE"; every
    other sensor must report "ON" or "OFF".

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            "sensor_id" and "status" keys.

    Returns:
        bool: False when sensor_id is not a string (for example a missing
        value), because the sensor type cannot be determined; otherwise the
        result of the membership check described above.
    """
    sensor_id = row["sensor_id"]
    # A missing id is a float NaN / None and has no .startswith().
    if not isinstance(sensor_id, str):
        return False
    allowed = ("OPEN", "CLOSE") if sensor_id.startswith("D") else ("ON", "OFF")
    return row["status"] in allowed

df["status_valid"] = df.apply(validate_status, axis=1)

# Activity: a label is acceptable when it is absent (unlabelled rows carry no
# value) or when it is a string. Testing only isinstance(x, str) would mark
# every unlabelled row as invalid.
df["activity_valid"] = df["activity"].apply(lambda x: pd.isna(x) or isinstance(x, str))

# NOTE: sorting by timestamp is disabled; rows keep the order they had in the file.
#df = df.sort_values(by="timestamp")

df = df.reset_index(drop=True)

# Exported with the three *_valid helper columns still present.
df.to_csv("cleaned_sensor_data.csv", index=False)

# Keep only the rows that pass all three checks.
df_valid = df[df["sensor_valid"] & df["status_valid"] & df["activity_valid"]]

# Drop the temporary validation columns.
df_valid = df_valid.drop(["sensor_valid", "status_valid", "activity_valid"], axis=1)

print("Nettoyage terminé.")

Nettoyage terminé.


In [None]:
# Strip whichever validation helper columns are still present, then overwrite
# the cleaned export without them.
columns_to_drop = ["sensor_valid", "status_valid", "activity_valid"]
df = df.drop(columns=columns_to_drop, errors="ignore")
df.to_csv("cleaned_sensor_data.csv", index=False)

# end of cleaning

In [7]:
# Print the first 2000 rows of the cleaned frame as one plain-text table.
print(df.head(2000).to_string())

                      timestamp sensor_id status                activity
0    2009-10-16 00:01:04.000059      M017     ON                    None
1    2009-10-16 00:01:06.000046      M009     ON                    None
2    2009-10-16 00:01:07.000064      M017    OFF                    None
3    2009-10-16 00:01:08.000081      M019     ON                    None
4    2009-10-16 00:01:09.000028      M009    OFF                    None
5    2009-10-16 00:01:13.000051      M019    OFF                    None
6    2009-10-16 00:08:50.000081      M020     ON                    None
7    2009-10-16 00:08:55.000040      M020    OFF                    None
8    2009-10-16 00:24:51.000026      M020     ON                    None
9    2009-10-16 00:24:57.000084      M020    OFF                    None
10   2009-10-16 00:56:38.000060      M020     ON                    None
11   2009-10-16 00:56:41.000084      M020    OFF                    None
12   2009-10-16 01:12:27.000046      T001   20.5   

In [8]:

#labelisation

import pandas as pd
from collections import Counter

# Make sure the timestamp column holds datetimes, then work on an independent
# copy so the cleaned frame itself is not annotated.
df["timestamp"] = pd.to_datetime(df["timestamp"])

df_annotated = df.copy(deep=True)

# Sensors associated with each activity label. A sensor may appear under
# several activities. Entry order matters: code that scans this mapping for
# the largest overlap keeps the first activity reaching that overlap.
activity_sensor = dict([
    ("Bed_to_Toilet", ["M013"]),
    ("Chores", ["M025", "M028", "M027"]),
    ("Desk_Activity", ["M007"]),
    ("Dining_Rm_Activity", ["M003"]),
    ("Eve_Meds", ["M023", "M014"]),
    ("Guest_Bathroom", ["M018"]),
    ("Kitchen_Activity", ["M023", "M014"]),
    ("Leave_Home", ["M001", "M002"]),
    ("Master_Bathroom", ["M025"]),
    ("Meditate", ["M024"]),
    ("Watch_TV", ["M008"]),
    ("Sleep", ["M021", "M028"]),
    ("Read", ["M004"]),
    ("Morning_Meds", ["M023", "M014"]),
    ("Master_Bedroom_Activity", ["M025", "M028", "M020"]),
])

# Sensors grouped by place. Several sensors are listed under more than one
# place; a lookup that returns the first place containing a sensor therefore
# depends on entry order (e.g. "M027" is found under "Chores" before
# "Dining_room").
localisation_dict = dict([
    ("Master_Bedroom", ["M025", "M020", "M028", "M021"]),
    ("Chores", ["M025", "M028", "M027", "M020", "M021", "M026", "M023", "M008", "M003"]),
    ("Master_Bathroom", ["M013", "M025"]),
    ("Meditation_room", ["M024"]),
    ("Guest_Bathroom", ["M018", "M017"]),
    ("Hallway", ["M009", "M019"]),
    ("Kitchen", ["M023", "M014", "M022", "M012", "M015", "M016", "D003"]),
    ("Entrance", ["M001", "M002", "D001", "D002"]),
    ("Dining_room", ["M027", "M003"]),
    ("Living_room", ["M008", "M026", "M007", "M006"]),
    ("Reading", ["M004", "M027", "M006", "M005"]),
])

def get_location(sensor_id):
    """Return the first place in ``localisation_dict`` whose sensor list contains ``sensor_id``.

    Places are examined in the mapping's own order. The string "Unknown" is
    returned when no place lists the sensor.
    """
    matching_places = (
        place
        for place, members in localisation_dict.items()
        if sensor_id in members
    )
    return next(matching_places, "Unknown")

def determine_activity(active_sensors, segment_start_timestamp):
    """Guess the activity label for one segment of sensor activations.

    Args:
        active_sensors: Sensor ids that fired during the segment, repeats
            included (repeats drive the frequency ranking).
        segment_start_timestamp: Datetime-like object for the segment start;
            only its ``hour`` attribute is read.

    Returns:
        str: An activity name.

    Rules, applied in order:
      1. Night (hour >= 23 or hour < 8): "Sleep" if M021 or M028 fired;
         otherwise "Bed_to_Toilet" if both M013 and M025 fired; otherwise
         "Master_Bathroom" if M025 fired; otherwise "Sleep".
      2. Day: the first activity in ``activity_sensor`` with the largest
         overlap with the (up to) three most frequent sensors, or "Chores"
         when nothing overlaps.
      3. "Chores" / "Master_Bedroom_Activity" are disambiguated by whether
         M027 or M020 is among the top sensors (M027 wins ties on count).
      4. "Morning_Meds" / "Eve_Meds" / "Kitchen_Activity" are disambiguated
         by the hour of the segment start.
    """
    sensor_counts = Counter(active_sensors)
    most_common_sensors = [sensor for sensor, _ in sensor_counts.most_common(3)]

    hour = segment_start_timestamp.hour
    is_night_time = hour >= 23 or hour < 8

    if is_night_time:
        fired = set(sensor_counts)
        if fired & {"M021", "M028"}:
            return "Sleep"
        # Each membership test is spelled out: `"M013" and "M025" in x` would
        # only test M025, because a non-empty string literal is always truthy,
        # and the Master_Bathroom branch below could then never be reached.
        if "M013" in fired and "M025" in fired:
            return "Bed_to_Toilet"
        if "M025" in fired:
            return "Master_Bathroom"
        return "Sleep"

    top_sensors = set(most_common_sensors)
    best_match, max_overlap = None, 0
    for activity, sensors in activity_sensor.items():
        overlap = len(top_sensors & set(sensors))
        # Strict comparison keeps the first activity that reaches the maximum.
        if overlap > max_overlap:
            best_match, max_overlap = activity, overlap

    # No overlap at all means none of the top sensors is listed under any
    # activity, so a per-sensor lookup could not find one either.
    if best_match is None:
        best_match = "Chores"

    if best_match in ["Master_Bedroom_Activity", "Chores"]:
        has_m027 = "M027" in top_sensors
        has_m020 = "M020" in top_sensors
        if has_m027 and has_m020:
            if sensor_counts["M027"] >= sensor_counts["M020"]:
                best_match = "Chores"
            else:
                best_match = "Master_Bedroom_Activity"
        elif has_m027:
            best_match = "Chores"
        elif has_m020:
            best_match = "Master_Bedroom_Activity"

    if best_match in ["Morning_Meds", "Eve_Meds", "Kitchen_Activity"]:
        if 7 <= hour < 10:
            best_match = "Morning_Meds"
        elif 20 <= hour < 22:
            best_match = "Eve_Meds"
        else:
            best_match = "Kitchen_Activity"

    return best_match

# Longest silence, in seconds, tolerated inside one segment; a longer gap
# between two activations starts a new segment.
T_threshold = 3600

# Status values that count as an activation.
ACTIVE_STATUSES = ("ON", "OPEN")


def _label_segment(frame, start, end, sensors):
    """Write '<activity> begin' at row ``start`` and '<activity> end' at row ``end``.

    ``sensors`` are the ids activated inside the segment; nothing is written
    when the list is empty. When ``start == end`` the end marker overwrites
    the begin marker, since a row holds a single label.
    """
    if not sensors:
        return
    start_ts = pd.to_datetime(frame.loc[start, "timestamp"])
    activity = determine_activity(sensors, start_ts)
    frame.at[start, "activity"] = f"{activity} begin"
    frame.at[end, "activity"] = f"{activity} end"


previous_timestamp = None
previous_location = None
segment_start = None
segment_sensors = []  # sensor ids activated since segment_start (inclusive)
current_activity = None

# Walk the activations in row order and cut a new segment whenever the gap
# exceeds T_threshold, the location changes, or the row already carries an
# explicit "... begin" label.
# Assumes df_annotated has a contiguous integer index (i - 1 is used as the
# label of the previous row); the frame is reset_index(drop=True)'d upstream.
# Rows before the first change point belong to no segment and stay unlabelled.
for i, row in df_annotated.iterrows():
    if row["status"] not in ACTIVE_STATUSES:
        continue

    current_ts = row["timestamp"]
    current_loc = get_location(row["sensor_id"])

    # Missing labels are not strings; guard before the substring test.
    existing_label = row["activity"]
    explicit_begin = isinstance(existing_label, str) and "begin" in existing_label

    is_change_point = False
    if previous_timestamp is not None:
        time_diff = (current_ts - previous_timestamp).total_seconds()
        is_change_point = (
            time_diff > T_threshold
            or current_loc != previous_location
            or explicit_begin
        )

    if is_change_point:
        if segment_start is not None:
            # The finished segment stops at row i - 1. Row i opens the next
            # segment, so its sensor must not be counted here (a label slice
            # .loc[segment_start:i] would include it: .loc is end-inclusive).
            _label_segment(df_annotated, segment_start, i - 1, segment_sensors)
        segment_start = i
        segment_sensors = []

    if segment_start is not None:
        segment_sensors.append(row["sensor_id"])

    previous_timestamp = current_ts
    previous_location = current_loc

# Close the segment that is still open when the data ends.
if segment_start is not None and segment_start < df_annotated.index[-1]:
    _label_segment(df_annotated, segment_start, df_annotated.index[-1], segment_sensors)


        



In [9]:
df_annotated["timestamp"] = pd.to_datetime(df_annotated["timestamp"])

df_annotated1 = df_annotated.copy()


def _clear_inner_labels(frame, first, last):
    """Blank the activity of every row strictly between ``first`` and ``last``."""
    for j in range(first + 1, last):
        frame.at[j, "activity"] = None


# Merge back-to-back segments of the same activity into a single begin/end
# pair: the first "begin" and the last matching "end" of a run are kept and
# every label in between is blanked.
# Assumes a contiguous integer index, since rows are addressed by range().
current_activity = None  # activity of the run being merged
start_index = 0          # row holding the run's first "begin"
end_index = 0            # row holding the run's latest matching "end" (0 = none yet)

for i, row in df_annotated1.iterrows():
    activity_val = row["activity"]
    label_text = str(activity_val)

    if "begin" in label_text:
        activity_name = activity_val.replace(" begin", "")
        if current_activity != activity_name:
            # A different activity starts: collapse the run that just ended.
            if end_index > start_index:
                _clear_inner_labels(df_annotated1, start_index, end_index)
            current_activity = activity_name
            start_index = i
            end_index = 0

    elif "end" in label_text:
        if current_activity == activity_val.replace(" end", ""):
            end_index = i

# The last run is not followed by a different "begin", so collapse it here;
# otherwise its intermediate begin/end markers would be left in place.
if end_index > start_index:
    _clear_inner_labels(df_annotated1, start_index, end_index)


In [10]:
# Persist the merged annotations as a comma-separated file, without the row index.
output_file = "annotated_sensor_data.csv"
df_annotated1.to_csv(output_file, sep=",", index=False)
print(f"Annotation terminée et sauvegardée dans '{output_file}'")


Annotation terminée et sauvegardée dans 'annotated_sensor_data.csv'


In [11]:
# Print the first 1000 annotated rows as one plain-text table.
print(df_annotated1.head(1000).to_string())

                     timestamp sensor_id status                       activity
0   2009-10-16 00:01:04.000059      M017     ON                           None
1   2009-10-16 00:01:06.000046      M009     ON                    Sleep begin
2   2009-10-16 00:01:07.000064      M017    OFF                           None
3   2009-10-16 00:01:08.000081      M019     ON                           None
4   2009-10-16 00:01:09.000028      M009    OFF                           None
5   2009-10-16 00:01:13.000051      M019    OFF                           None
6   2009-10-16 00:08:50.000081      M020     ON                           None
7   2009-10-16 00:08:55.000040      M020    OFF                           None
8   2009-10-16 00:24:51.000026      M020     ON                           None
9   2009-10-16 00:24:57.000084      M020    OFF                           None
10  2009-10-16 00:56:38.000060      M020     ON                           None
11  2009-10-16 00:56:41.000084      M020    OFF     