In [1]:
import pandas as pd
import re

file_path = "data"
df = pd.read_csv(file_path, sep="\t", header=None, names=["timestamp", "sensor_id", "status", "activity"], engine='python' ,on_bad_lines='skip')

# timestamp
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=["timestamp"])

# Supprimer les doublons
df = df.drop_duplicates()

# Sensor_id
sensor_pattern = r"^[MD]\d{3}$"
df["sensor_valid"] = df["sensor_id"].apply(lambda x: bool(re.match(sensor_pattern, x)))

# Status
def validate_status(row):
    if row["sensor_id"].startswith("D"):
        return row["status"] in ["OPEN", "CLOSE"]
    return row["status"] in ["ON", "OFF"]

df["status_valid"] = df.apply(validate_status, axis=1)

# Activity
df["activity_valid"] = df["activity"].apply(lambda x: isinstance(x, str) or x == "")

#df = df.sort_values(by="timestamp")

df = df.reset_index(drop=True)

df.to_csv("cleaned_sensor_data.csv", index=False)

# Filtrer uniquement les lignes valides
df_valid = df[df["sensor_valid"] & df["status_valid"] & df["activity_valid"]]

# Supprimer les colonnes temporaires de validation
df_valid = df_valid.drop(["sensor_valid", "status_valid", "activity_valid"], axis=1)

print("Nettoyage terminé.")

Nettoyage terminé.


In [2]:
columns_to_drop = ["sensor_valid", "status_valid", "activity_valid"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
df.to_csv("cleaned_sensor_data.csv", index=False)

In [3]:
print(df.head(2000).to_string())

                      timestamp sensor_id status                activity
0    2009-10-16 00:01:04.000059      M017     ON                    None
1    2009-10-16 00:01:06.000046      M009     ON                    None
2    2009-10-16 00:01:07.000064      M017    OFF                    None
3    2009-10-16 00:01:08.000081      M019     ON                    None
4    2009-10-16 00:01:09.000028      M009    OFF                    None
5    2009-10-16 00:01:13.000051      M019    OFF                    None
6    2009-10-16 00:08:50.000081      M020     ON                    None
7    2009-10-16 00:08:55.000040      M020    OFF                    None
8    2009-10-16 00:24:51.000026      M020     ON                    None
9    2009-10-16 00:24:57.000084      M020    OFF                    None
10   2009-10-16 00:56:38.000060      M020     ON                    None
11   2009-10-16 00:56:41.000084      M020    OFF                    None
12   2009-10-16 01:12:27.000046      T001   20.5   

In [4]:
# Segmentation

df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

results = []

for index, row in df.iterrows():
    if isinstance(row["activity"], str) and "begin" in row["activity"]:
        activity_name = row["activity"].replace(" begin", "")

        # Trouver la ligne de fin
        end_row = df[(df["activity"] == f"{activity_name} end") & (df["timestamp"] > row["timestamp"])].head(1)

        if not end_row.empty:
            start_time = row["timestamp"]
            end_time = end_row["timestamp"].values[0]


            # Durée Activité
            duration = (end_row["timestamp"].values[0] - row["timestamp"])

            # Evénements entre "begin" et "end"
            event_window = df[(df["timestamp"] > row["timestamp"]) &
                              (df["timestamp"] < end_row["timestamp"].values[0])]

            # Evénements Actifs
            on_events = event_window[event_window["status"].isin(["ON", "OPEN"])]
            on_event_count = len(on_events)

            # Sensor Ids "ON"
            sensor_ids = on_events["sensor_id"].unique().tolist()

            results.append({
                "activity": activity_name,
                "activity_duration": duration,
                "active_events_count": on_event_count,
                "sensor_ids_on": sensor_ids
            })

print(results[:5])

[{'activity': 'Bed_to_Toilet', 'activity_duration': Timedelta('0 days 00:02:34.999922'), 'active_events_count': 6, 'sensor_ids_on': ['M028', 'M020', 'M025', 'M013']}, {'activity': 'Sleep', 'activity_duration': Timedelta('0 days 04:41:17.000007'), 'active_events_count': 106, 'sensor_ids_on': ['M020', 'M021', 'M028', 'M019', 'M009', 'M010', 'M003', 'M012', 'M022', 'M005', 'M006', 'M008', 'M013']}, {'activity': 'Morning_Meds', 'activity_duration': Timedelta('0 days 00:00:55.000004'), 'active_events_count': 11, 'sensor_ids_on': ['M023', 'M022', 'M014', 'M015', 'M016']}, {'activity': 'Watch_TV', 'activity_duration': Timedelta('0 days 00:00:30.000002'), 'active_events_count': 3, 'sensor_ids_on': ['M026', 'M008']}, {'activity': 'Kitchen_Activity', 'activity_duration': Timedelta('0 days 00:13:13.999928'), 'active_events_count': 120, 'sensor_ids_on': ['M003', 'M015', 'M023', 'M022', 'M016', 'M011', 'M017', 'M009', 'M019', 'M028', 'M025', 'M013', 'M008', 'M026', 'M007', 'M012', 'M018']}]


In [5]:
results_df = pd.DataFrame(results)

In [6]:
print(results_df.head(100).to_string())

            activity      activity_duration  active_events_count                                                                                                                                                 sensor_ids_on
0      Bed_to_Toilet 0 days 00:02:34.999922                    6                                                                                                                                      [M028, M020, M025, M013]
1              Sleep 0 days 04:41:17.000007                  106                                                                                [M020, M021, M028, M019, M009, M010, M003, M012, M022, M005, M006, M008, M013]
2       Morning_Meds 0 days 00:00:55.000004                   11                                                                                                                                [M023, M022, M014, M015, M016]
3           Watch_TV 0 days 00:00:30.000002                    3                                            

In [7]:
#Moyenne Occurence

results_df = pd.DataFrame(results)

results_df["activity_duration"] = results_df["activity_duration"].dt.total_seconds()

def filter_frequent_sensors(sensor_lists, threshold=0.8):
    sensor_counts = {}
    total_occurrences = len(sensor_lists)

    for sensor_list in sensor_lists:
        for sensor in set(sensor_list):
            sensor_counts[sensor] = sensor_counts.get(sensor, 0) + 1

    frequent_sensors = [sensor for sensor, count in sensor_counts.items() if count / total_occurrences >= threshold]

    return sorted(frequent_sensors)

summary_df2 = results_df.groupby("activity").agg(
    avg_duration=("activity_duration", "mean"),
    avg_active_events_count=("active_events_count", lambda x: round(x.mean())),
    merged_sensors=("sensor_ids_on", lambda x: filter_frequent_sensors(x, threshold=0.8))
).reset_index()

summary_df2["avg_duration"] = pd.to_timedelta(summary_df2["avg_duration"], unit='s')

summary_df2.to_csv("Solution.csv", index=False)


In [8]:
print(summary_df2.to_string())

                   activity              avg_duration  avg_active_events_count                        merged_sensors
0             Bed_to_Toilet 0 days 00:13:06.275862782                        7                                [M013]
1                    Chores 0 days 00:29:27.285718857                      163  [M003, M009, M019, M020, M025, M028]
2             Desk_Activity 0 days 00:13:46.370368648                       68                          [M007, M026]
3        Dining_Rm_Activity 0 days 00:14:18.571431333                       93                          [M003, M027]
4                  Eve_Meds 0 days 00:00:33.315775421                        5                                [D003]
5            Guest_Bathroom 0 days 00:03:46.242333794                       17                                [M018]
6          Kitchen_Activity 0 days 00:13:37.010852266                      113        [M012, M014, M015, M022, M023]
7                Leave_Home 0 days 00:25:05.611372938           

In [9]:
# Occurence Sensor%Activity
activities = [
    "Bed_to_Toilet", "Sleep", "Morning_Meds", "Watch_TV", "Kitchen_Activity",
    "Leave_Home", "Chores", "Guest_Bathroom", "Read", "Master_Bathroom",
    "Desk_Activity", "Eve_Meds", "Dining_Rm_Activity", "Meditate", "Master_Bedroom_Activity"
]

sensors = ["D001", "D002", "D003"] + [f"M{str(i).zfill(3)}" for i in range(1, 29)]

activity_sensor_table = pd.DataFrame(0, index=activities, columns=sensors)

df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

for idx, row in df.iterrows():
    if isinstance(row["activity"], str) and "begin" in row["activity"]:
        activity_name = row["activity"].replace(" begin", "")
        begin_time = row["timestamp"]

        end_df = df[(df["activity"] == f"{activity_name} end") & (df["timestamp"] > begin_time)]
        if end_df.empty:
            continue
        end_time = end_df.iloc[0]["timestamp"]

        window = df[(df["timestamp"] > begin_time) & (df["timestamp"] < end_time)]

        for _, event in window.iterrows():
            if event["status"] in ["ON", "OPEN"]:
                sensor_id = event["sensor_id"]
                if sensor_id in sensors and activity_name in activity_sensor_table.index:
                    activity_sensor_table.loc[activity_name, sensor_id] += 1



In [10]:
activity_sensor_table.to_csv("moyenne.csv", index=False)
print(activity_sensor_table.to_string())

                         D001  D002  D003  M001  M002  M003   M004  M005  M006  M007  M008  M009  M010  M011  M012  M013   M014  M015  M016  M017  M018  M019  M020  M021  M022   M023  M024  M025  M026  M027  M028
Bed_to_Toilet               0     0     0     0     0     5      0     5     4     0     4    10     3     7     2   265      0     4     5     7    27    13    39    44     3      1     0    93     3     6    72
Sleep                       0     0     1    17    15   150     93   273   167    62   201   193   105   129   107    91     47   103   141   138   123   409  1213  2396   145     89    28   160   121   235  2299
Morning_Meds                7     0    46    43    47   155    364    58    52    34   179    96    60    85   143    60    460   275    71    86   106    99   141   145   327    685     0   168    73   196   324
Watch_TV                    6     0     5   336   195   352     34    98   241   346  5495   220   194   207   228    26    168   196   156   238   

In [3]:
import pandas as pd
from collections import Counter

df["timestamp"] = pd.to_datetime(df["timestamp"])

df_annotated = df.copy()

activity_sensor = {
    "Bed_to_Toilet": ["M013"],
    "Chores": ["M025", "M028", "M027"],
    "Desk_Activity": ["M007"],
    "Dining_Rm_Activity": ["M003"],
    "Eve_Meds": ["M023", "M014"],
    "Guest_Bathroom": ["M018"],
    "Kitchen_Activity": ["M023", "M014"],
    "Leave_Home": ["M001", "M002"],
    "Master_Bathroom": ["M025"],
    "Meditate": ["M024"],
    "Watch_TV": ["M008"],
    "Sleep": ["M021", "M028"],
    "Read": ["M004"],
    "Morning_Meds": ["M023", "M014"],
    "Master_Bedroom_Activity": ["M025", "M028", "M020"]
}

localisation_dict = {
    "Master_Bedroom": ["M025", "M020", "M028", "M021"],
    "Chores": ["M025", "M028", "M027", "M020", "M021", "M026", "M023", "M008", "M003"],
    "Master_Bathroom": ["M013", "M025"],
    "Meditation_room": ["M024"],
    "Guest_Bathroom": ["M018", "M017"],
    "Hallway": ["M009", "M019"],
    "Kitchen": ["M023", "M014", "M022", "M012", "M015", "M016", "D003"],
    "Entrance": ["M001", "M002", "D001", "D002"],
    "Dining_room": ["M027", "M003"],
    "Living_room": ["M008", "M026", "M007", "M006"],
    "Reading": ["M004", "M027", "M006", "M005"]
}

def get_location(sensor_id):
    for loc, sensors in localisation_dict.items():
        if sensor_id in sensors:
            return loc
    return "Unknown"

def determine_activity(active_sensors, segment_start_timestamp):

    sensor_counts = Counter(active_sensors)
    most_common_sensors = [sensor for sensor, count in sensor_counts.most_common(3)]

    hour = segment_start_timestamp.hour
    is_night_time = (hour >= 23 or hour < 8)

    if is_night_time:
        if any(sensor in ["M021", "M028"] for sensor in active_sensors):
            return "Sleep"
        elif "M013" and "M025" in active_sensors:
            return "Bed_to_Toilet"
        elif "M025" in active_sensors:
            return "Master_Bathroom"
        return "Sleep"

    best_match, max_overlap = None, 0
    for activity, sensors in activity_sensor.items():
        overlap = len(set(most_common_sensors) & set(sensors))
        if overlap > max_overlap:
            best_match, max_overlap = activity, overlap

    if best_match is None or max_overlap == 0:
        for sensor in most_common_sensors:
            for activity, sensors in activity_sensor.items():
                if sensor in sensors:
                    best_match = activity
                    break
            if best_match is not None:
                break
        if best_match is None:
            best_match = "Chores"

    if best_match in ["Master_Bedroom_Activity", "Chores"]:
        if "M027" in most_common_sensors and "M020" not in most_common_sensors:
            best_match = "Chores"
        elif "M020" in most_common_sensors and "M027" not in most_common_sensors:
            best_match = "Master_Bedroom_Activity"
        elif "M027" in most_common_sensors and "M020" in most_common_sensors:
            best_match = "Chores" if sensor_counts["M027"] >= sensor_counts["M020"] else "Master_Bedroom_Activity"

    if best_match in ["Morning_Meds", "Eve_Meds", "Kitchen_Activity"]:
        if 7 <= hour < 10:
            best_match = "Morning_Meds"
        elif 20 <= hour < 22:
            best_match = "Eve_Meds"
        else:
            best_match = "Kitchen_Activity"

    return best_match

T_threshold = 3600

previous_timestamp = None
previous_location = None
segment_start = None
current_activity = None


for i, row in df_annotated.iterrows():
    if row["status"] not in ["ON", "OPEN"]:
        continue

    current_ts = row["timestamp"]
    current_loc = get_location(row["sensor_id"])
    time_diff = (current_ts - previous_timestamp).total_seconds() if previous_timestamp else 0

    explicit_begin = pd.notna(row["activity"]) and "begin" in row["activity"]

    is_change_point = False
    if previous_timestamp is not None:
        if time_diff > T_threshold:
            is_change_point = True
        elif previous_location is not None and current_loc != previous_location:
            is_change_point = True
        elif explicit_begin:
            is_change_point = True

    if is_change_point:
        if segment_start is not None:
            segment_rows = df_annotated.loc[segment_start:i]
            active_sensors = segment_rows[segment_rows["status"].isin(["ON", "OPEN"])]["sensor_id"].tolist()
            if active_sensors:
                segment_start_ts = pd.to_datetime(df_annotated.loc[segment_start, "timestamp"])
                detected_activity = determine_activity(active_sensors, segment_start_ts)
                df_annotated.at[segment_start, "activity"] = f"{detected_activity} begin"
                df_annotated.at[i - 1, "activity"] = f"{detected_activity} end"

        segment_start = i

    previous_timestamp = current_ts
    previous_location = current_loc

if segment_start is not None and segment_start < df_annotated.index[-1]:
    segment_rows = df_annotated.loc[segment_start:]
    active_sensors = segment_rows[segment_rows["status"].isin(["ON", "OPEN"])]["sensor_id"].tolist()
    if active_sensors:
        segment_start_ts = pd.to_datetime(df_annotated.loc[segment_start, "timestamp"])
        detected_activity = determine_activity(active_sensors, segment_start_ts)
        df_annotated.at[segment_start, "activity"] = f"{detected_activity} begin"
        df_annotated.at[df_annotated.index[-1], "activity"] = f"{detected_activity} end"

output_file = "annotated_sensor_data.csv"
df_annotated.to_csv(output_file, index=False, sep=",")
print(f"Annotation terminée et sauvegardée dans '{output_file}'")


Annotation terminée et sauvegardée dans 'annotated_sensor_data.csv'


In [12]:
print(df_annotated.head(1000).to_string())

                     timestamp sensor_id status                       activity
0   2009-10-16 00:01:04.000059      M017     ON                           None
1   2009-10-16 00:01:06.000046      M009     ON                    Sleep begin
2   2009-10-16 00:01:07.000064      M017    OFF                           None
3   2009-10-16 00:01:08.000081      M019     ON                           None
4   2009-10-16 00:01:09.000028      M009    OFF                           None
5   2009-10-16 00:01:13.000051      M019    OFF                      Sleep end
6   2009-10-16 00:08:50.000081      M020     ON                    Sleep begin
7   2009-10-16 00:08:55.000040      M020    OFF                           None
8   2009-10-16 00:24:51.000026      M020     ON                           None
9   2009-10-16 00:24:57.000084      M020    OFF                           None
10  2009-10-16 00:56:38.000060      M020     ON                           None
11  2009-10-16 00:56:41.000084      M020    OFF     

In [4]:
df_annotated["timestamp"] = pd.to_datetime(df_annotated["timestamp"])

df_annotated1 = df_annotated.copy()

current_activity = None
start_index = 0
new_index = 0
end_index = 0

current_activity = "sleeping"
start_index = 1
for i, row in df_annotated1.iterrows():
    activity_val = row["activity"]
    if "begin" in str(activity_val):

        if current_activity != activity_val.replace(" begin", "") :

            new_index = i

            if end_index > start_index:

                for j in range(start_index + 1, end_index):
                    df_annotated1.at[j, "activity"] = None

            current_activity = activity_val.replace(" begin", "")
            start_index = new_index
            new_index = 0
            end_index = 0

    elif "end" in str(activity_val):
        if current_activity == activity_val.replace(" end", ""):
            end_index = i


In [None]:

df_annotated1.to_csv("finaldata.csv", index=False, sep=",")


In [6]:
print(final_segments_df.head(100))


    segment_id                    start_time                      end_time  \
0            0 2009-10-16 00:01:04.000059000 2009-10-16 00:11:36.870083403   
1            1 2009-10-16 00:11:36.870083403 2009-10-16 00:22:09.740107806   
2            2 2009-10-16 00:22:09.740107806 2009-10-16 00:32:42.610132209   
3            3 2009-10-16 00:32:42.610132209 2009-10-16 00:43:15.480156612   
4            4 2009-10-16 00:43:15.480156612 2009-10-16 00:53:48.350181015   
..         ...                           ...                           ...   
95          95 2009-10-16 16:43:06.652377295 2009-10-16 16:53:39.522401698   
96          96 2009-10-16 16:53:39.522401698 2009-10-16 17:04:12.392426101   
97          97 2009-10-16 17:04:12.392426101 2009-10-16 17:14:45.262450504   
98          98 2009-10-16 17:14:45.262450504 2009-10-16 17:25:18.132474907   
99          99 2009-10-16 17:25:18.132474907 2009-10-16 17:35:51.002499310   

    M001  M002  M003  M004  M005  M006  M007  ...  M024  M025  

In [5]:
final_segments_df.to_csv("finaldata2.csv", index=False, sep=",")


In [None]:
import pandas as pd
import numpy as np

# Charger les données
df = pd.read_csv("finaldata.csv")

# Convertir la colonne timestamp en datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# 1. Identifier les activités et calculer leur durée
activities = []
current_activity = None
start_time = None

for idx, row in df.iterrows():
    if pd.notna(row['activity']) and 'begin' in row['activity']:
        # Début d'une nouvelle activité
        if current_activity is not None:
            duration = (row['timestamp'] - start_time).total_seconds()
            activities.append({
                'activity': current_activity.replace(' begin', ''),
                'start_time': start_time,
                'end_time': row['timestamp'],
                'duration': duration
            })

        current_activity = row['activity']
        start_time = row['timestamp']
    elif pd.notna(row['activity']) and 'end' in row['activity'] and current_activity is not None:
        # Fin de l'activité actuelle
        activity_name = current_activity.replace(' begin', '')
        end_activity_name = row['activity'].replace(' end', '')

        if activity_name == end_activity_name:
            duration = (row['timestamp'] - start_time).total_seconds()
            activities.append({
                'activity': activity_name,
                'start_time': start_time,
                'end_time': row['timestamp'],
                'duration': duration
            })
            current_activity = None
            start_time = None

# Créer un DataFrame des activités
activities_df = pd.DataFrame(activities)

if len(activities_df) == 0:
    print("Aucune activité trouvée avec des marqueurs begin/end")
else:
    # 2. Calculer la durée moyenne d'une activité (en excluant Sleep pour le calcul des segments)
    non_sleep_activities = activities_df[activities_df['activity'] != 'Sleep']
    if len(non_sleep_activities) > 0:
        mean_duration = non_sleep_activities['duration'].mean()
    else:
        mean_duration = activities_df['duration'].mean()

    print(f"Durée moyenne d'une activité (hors Sleep): {mean_duration} secondes")

    # 3. Créer des segments temporels de durée égale à la durée moyenne
    start_time = df['timestamp'].min()
    end_time = df['timestamp'].max()
    total_duration = (end_time - start_time).total_seconds()
    num_segments = int(np.ceil(total_duration / mean_duration))

    segments = []
    for i in range(num_segments):
        seg_start = start_time + pd.Timedelta(seconds=i*mean_duration)
        seg_end = start_time + pd.Timedelta(seconds=(i+1)*mean_duration)
        segments.append({
            'segment_id': i,
            'start_time': seg_start,
            'end_time': seg_end
        })

    segments_df = pd.DataFrame(segments)

    # 4. Assigner les activités et capteurs aux segments
    segment_activities = []
    sensor_list = [f"M{str(i).zfill(3)}" for i in range(1, 29)] + ["D001", "D002", "D003"]

    for _, seg in segments_df.iterrows():
        # Trouver les activités qui se chevauchent avec ce segment
        overlapping_activities = activities_df[
            (activities_df['end_time'] > seg['start_time']) &
            (activities_df['start_time'] < seg['end_time'])
        ].copy()

        # Calculer le nombre d'activités dans ce segment
        num_activities = len(overlapping_activities)
        
        # Trouver l'activité dominante (celle avec la plus grande durée dans le segment)
        if num_activities > 0:
            # Calculer la durée dans ce segment pour chaque activité
            overlap_durations = []
            for _, act in overlapping_activities.iterrows():
                start = max(act['start_time'], seg['start_time'])
                end = min(act['end_time'], seg['end_time'])
                overlap_duration = (end - start).total_seconds()
                overlap_durations.append(overlap_duration)

            overlapping_activities = overlapping_activities.assign(overlap_duration=overlap_durations)
            
            # Prioriser les activités non-Sleep
            non_sleep_activities = overlapping_activities[overlapping_activities['activity'] != 'Sleep']
            if len(non_sleep_activities) > 0:
                dominant_row = non_sleep_activities.loc[non_sleep_activities['overlap_duration'].idxmax()]
            else:
                dominant_row = overlapping_activities.loc[overlapping_activities['overlap_duration'].idxmax()]

            dominant_activity = dominant_row['activity']
        else:
            # Aucune activité dans ce segment
            dominant_activity = "No_Activity"

        # Trouver les occurrences des capteurs actifs dans ce segment
        segment_sensors = df[
            (df['timestamp'] >= seg['start_time']) &
            (df['timestamp'] < seg['end_time']) &
            (df['status'].isin(['ON', 'OPEN']))
        ]['sensor_id'].value_counts().to_dict()

        sensor_counts = {sensor: segment_sensors.get(sensor, 0) for sensor in sensor_list}

        segment_entry = {
            'segment_id': seg['segment_id'],
            'start_time': seg['start_time'],
            'end_time': seg['end_time'],
            'num_activities': num_activities,  # Nombre d'activités dans le segment
            'dominant_activity': dominant_activity,
        }
        segment_entry.update(sensor_counts)

        segment_activities.append(segment_entry)

    # Créer le DataFrame final des segments
    final_segments_df = pd.DataFrame(segment_activities)

    # Réorganiser les colonnes pour avoir:
    # - 'num_activities' en avant-dernière position
    # - 'dominant_activity' en dernière position
    cols = [col for col in final_segments_df.columns if col not in ['dominant_activity', 'num_activities']] \
           + ['num_activities', 'dominant_activity']
    final_segments_df = final_segments_df[cols]

    # 5. Résultats
    print("\nSegments temporels avec activité dominante et nombre d'activités:")
    print(final_segments_df.head())

    # Visualisation des segments
    print("\nRésumé par activité dominante:")
    print(final_segments_df['dominant_activity'].value_counts())


Durée moyenne d'une activité (hors Sleep): 203.6581392321938 secondes


In [None]:
final_segments_df.to_csv("finaldata1.csv", index=False, sep=",")
