In [None]:
# prompt: want to download dataset from kaggle and unizip it , api command - kaggle datasets download -d dartweichen/student-life
# , my api credendtials - {"username":"abhaysingla","key":"d25c8dcf58344bbdba3f136ec819c0b3"}

!pip install kaggle
!mkdir -p ~/.kaggle
!echo '{"username":"abhaysingla","key":"d25c8dcf58344bbdba3f136ec819c0b3"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dartweichen/student-life
!unzip student-life.zip


Dataset URL: https://www.kaggle.com/datasets/dartweichen/student-life
License(s): unknown
Downloading student-life.zip to /content
 97% 377M/390M [00:04<00:00, 89.5MB/s]
100% 390M/390M [00:04<00:00, 95.1MB/s]
Archive:  student-life.zip
  inflating: dataset/EMA/EMA_definition.json  
  inflating: dataset/EMA/response/Activity/Activity_u00.json  
  inflating: dataset/EMA/response/Activity/Activity_u01.json  
  inflating: dataset/EMA/response/Activity/Activity_u02.json  
  inflating: dataset/EMA/response/Activity/Activity_u03.json  
  inflating: dataset/EMA/response/Activity/Activity_u04.json  
  inflating: dataset/EMA/response/Activity/Activity_u05.json  
  inflating: dataset/EMA/response/Activity/Activity_u07.json  
  inflating: dataset/EMA/response/Activity/Activity_u08.json  
  inflating: dataset/EMA/response/Activity/Activity_u09.json  
  inflating: dataset/EMA/response/Activity/Activity_u10.json  
  inflating: dataset/EMA/response/Activity/Activity_u12.json  
  inflating: dataset/EMA

In [None]:
!pip install pandas numpy matplotlib



In [None]:
import pandas as pd
import os

# Path to the directory containing user files
data_dir = '/content/dataset/sensing/activity'

# List to store the preprocessed dataframes for each user
preprocessed_dataframes = []

# Loop through all users from u00 to u59
for user_id in range(60):
    # Construct the filename for the user's activity data
    filename = os.path.join(data_dir, f'activity_u{user_id:02d}.csv')

    # Check if the file exists
    if os.path.exists(filename):
        # Load the user's activity data
        activity_data = pd.read_csv(filename)

        # Sort the data by timestamp
        activity_data = activity_data.sort_values(by='timestamp')
        activity_data.columns = activity_data.columns.str.strip()

        # Merge adjacent or overlapping events
        activity_data['activity'] = activity_data['activity inference'].apply(lambda x: 'Unknown' if x == 3 else x)
        activity_data['activity_merged'] = activity_data['activity'].ne(activity_data['activity'].shift()).cumsum()
        activity_data_merged = activity_data.groupby(['activity_merged']).agg(
            start_timestamp=('timestamp', 'first'),
            end_timestamp=('timestamp', 'last'),
            activity=('activity', 'first')
        )

        # Map activity inference IDs to descriptions
        activity_data_merged['activity'] = activity_data_merged['activity'].map({
            0: 'Stationary',
            1: 'Walking',
            2: 'Running',
            3: 'Unknown'
        })

        # Add a column for user ID
        activity_data_merged['user_id'] = user_id

        # Add the preprocessed data to the list
        preprocessed_dataframes.append(activity_data_merged)


# Concatenate all preprocessed dataframes into a single dataframe
if preprocessed_dataframes:
    all_users_data = pd.concat(preprocessed_dataframes)

    # Convert timestamp to datetime format
    all_users_data.loc[:, 'start_timestamp'] = pd.to_datetime(all_users_data['start_timestamp'], unit='s')
    all_users_data.loc[:, 'end_timestamp'] = pd.to_datetime(all_users_data['end_timestamp'], unit='s')

    # Display the preprocessed data for all users
    print(all_users_data.head())

    # Check for missing values
    missing_values = all_users_data.isnull().sum()
    print("Missing Values:\n", missing_values)

    # Check for duplicates
    duplicates = all_users_data.duplicated().sum()
    print("\nDuplicates:", duplicates)

    # Remove NaN values
    all_users_data.dropna(inplace=True)

    # Drop duplicates
    all_users_data.drop_duplicates(inplace=True)

    # Display the preprocessed data for all users
    print("\nCleaned Data:\n", all_users_data.head())
else:
    print("No data found for any user.")

# Calculate duration of each activity per day
all_users_data['duration'] = all_users_data['end_timestamp'] - all_users_data['start_timestamp']

# Group by user_id, activity, and date, then sum the durations
activity_duration_per_day = all_users_data.groupby(['user_id', 'activity', all_users_data['start_timestamp'].dt.date])['duration'].sum().reset_index()

# Pivot the table to have one row per user per day with different activity durations as columns
activity_pivot = activity_duration_per_day.pivot_table(index=['user_id', 'start_timestamp'], columns='activity', values='duration', fill_value=pd.Timedelta(0)).reset_index()

# Ensure there are no multi-level columns
activity_pivot.columns = ['user_id', 'date'] + [f"{col}_duration" for col in activity_pivot.columns[2:]]

# Display the result
print(activity_pivot.head())
activity_pivot.to_csv("activity_data_simplified.csv", index=False)


                    start_timestamp       end_timestamp    activity  user_id
activity_merged                                                             
1               2013-03-27 04:00:01 2013-03-27 06:16:01  Stationary        0
2               2013-03-27 06:16:04 2013-03-27 06:16:06         NaN        0
3               2013-03-27 06:16:09 2013-03-27 06:19:43  Stationary        0
4               2013-03-27 06:19:45 2013-03-27 06:19:53         NaN        0
5               2013-03-27 06:19:55 2013-03-27 06:20:06  Stationary        0
Missing Values:
 start_timestamp        0
end_timestamp          0
activity           97721
user_id                0
dtype: int64

Duplicates: 135

Cleaned Data:
                     start_timestamp       end_timestamp    activity  user_id
activity_merged                                                             
1               2013-03-27 04:00:01 2013-03-27 06:16:01  Stationary        0
3               2013-03-27 06:16:09 2013-03-27 06:19:43  Stationary

In [None]:
import pandas as pd
import os

# Path to the directory containing user files
data_dir = '/content/dataset/sensing/audio'

# List to store the preprocessed dataframes for each user
preprocessed_dataframes = []

# Loop through all users from u00 to u59
for user_id in range(60):
    # Construct the filename for the user's audio data
    filename = os.path.join(data_dir, f'audio_u{user_id:02d}.csv')

    # Check if the file exists
    if os.path.exists(filename):
        # Load the user's audio data
        audio_data = pd.read_csv(filename)

        # Sort the data by timestamp
        audio_data = audio_data.sort_values(by='timestamp')
        audio_data.columns = audio_data.columns.str.strip()

        # Merge adjacent or overlapping events
        audio_data['sound_type'] = audio_data['audio inference'].apply(lambda x: 'Unknown' if x == 3 else x)
        audio_data['sound_type_merged'] = audio_data['sound_type'].ne(audio_data['sound_type'].shift()).cumsum()
        audio_data_merged = audio_data.groupby(['sound_type_merged']).agg(
            start_timestamp=('timestamp', 'first'),
            end_timestamp=('timestamp', 'last'),
            sound_type=('sound_type', 'first')
        )

        # Map audio inference IDs to descriptions
        audio_data_merged['sound_type'] = audio_data_merged['sound_type'].map({
            0: 'Silence',
            1: 'Voice',
            2: 'Noise',
            3: 'Unknown'
        })

        # Add a column for user ID
        audio_data_merged['user_id'] = user_id

        # Add the preprocessed data to the list
        preprocessed_dataframes.append(audio_data_merged)

# Concatenate all preprocessed dataframes into a single dataframe
if preprocessed_dataframes:
    all_users_audio_data = pd.concat(preprocessed_dataframes)

    # Convert timestamp to datetime format
    all_users_audio_data['start_timestamp'] = pd.to_datetime(all_users_audio_data['start_timestamp'], unit='s')
    all_users_audio_data['end_timestamp'] = pd.to_datetime(all_users_audio_data['end_timestamp'], unit='s')

    # Display the preprocessed data for all users
    print(all_users_audio_data.head())

    # Check for missing values
    missing_values = all_users_audio_data.isnull().sum()
    print("Missing Values:\n", missing_values)

    # Check for duplicates
    duplicates = all_users_audio_data.duplicated().sum()
    print("\nDuplicates:", duplicates)

    # Remove NaN values
    all_users_audio_data.dropna(inplace=True)

    # Drop duplicates
    all_users_audio_data.drop_duplicates(inplace=True)

    # Display the preprocessed data for all users
    print("\nCleaned Data:\n", all_users_audio_data.head())
else:
    print("No data found for any user.")

# Calculate duration of each sound type per day
all_users_audio_data['duration'] = all_users_audio_data['end_timestamp'] - all_users_audio_data['start_timestamp']

# Group by user_id, sound_type, and date, then sum the durations
all_users_audio_data['date'] = all_users_audio_data['start_timestamp'].dt.date
audio_duration_per_day = all_users_audio_data.groupby(['user_id', 'sound_type', 'date'])['duration'].sum().reset_index()

# Pivot the table to have one row per user per day with different sound type durations as columns
audio_pivot = audio_duration_per_day.pivot_table(index=['user_id', 'date'], columns='sound_type', values='duration', fill_value=pd.Timedelta(0)).reset_index()

# Ensure there are no multi-level columns
audio_pivot.columns = ['user_id', 'date'] + [f"{col}_duration" for col in audio_pivot.columns[2:]]

# Display the result
print(audio_pivot.head())
audio_pivot.to_csv("audio_data_simplified.csv", index=False)


                      start_timestamp       end_timestamp sound_type  user_id
sound_type_merged                                                            
1                 2013-03-27 04:02:21 2013-03-27 04:03:20    Silence        0
2                 2013-03-27 04:06:21 2013-03-27 04:06:22      Noise        0
3                 2013-03-27 04:06:23 2013-03-27 04:06:33    Silence        0
4                 2013-03-27 04:06:34 2013-03-27 04:06:34      Noise        0
5                 2013-03-27 04:06:35 2013-03-27 04:06:42    Silence        0
Missing Values:
 start_timestamp    0
end_timestamp      0
sound_type         0
user_id            0
dtype: int64

Duplicates: 6509

Cleaned Data:
                       start_timestamp       end_timestamp sound_type  user_id
sound_type_merged                                                            
1                 2013-03-27 04:02:21 2013-03-27 04:03:20    Silence        0
2                 2013-03-27 04:06:21 2013-03-27 04:06:22      Noise    

In [None]:
import pandas as pd
import os

# Paths to the directories containing user files for phone lock and phone charge data
lock_data_dir = '/content/dataset/sensing/phonelock'
charge_data_dir = '/content/dataset/sensing/phonecharge'

# Lists to store the preprocessed dataframes for each user
lock_dataframes = []
charge_dataframes = []

# Process phone lock data
for user_id in range(60):
    lock_filename = os.path.join(lock_data_dir, f'phonelock_u{user_id:02d}.csv')

    if os.path.exists(lock_filename):
        lock_data = pd.read_csv(lock_filename)
        lock_data = lock_data.rename(columns={'start': 'start_timestamp'})
        lock_data['start_timestamp'] = pd.to_datetime(lock_data['start_timestamp'], unit='s')
        lock_data['end'] = pd.to_datetime(lock_data['end'], unit='s')
        lock_data['user_id'] = user_id
        lock_dataframes.append(lock_data)

# Concatenate all phone lock dataframes into a single dataframe
if lock_dataframes:
    all_users_lock_data = pd.concat(lock_dataframes)
    all_users_lock_data['modality'] = 'locked'
    all_users_lock_data['duration'] = all_users_lock_data['end'] - all_users_lock_data['start_timestamp']
    all_users_lock_data['date'] = all_users_lock_data['start_timestamp'].dt.date
    lock_duration_per_day = all_users_lock_data.groupby(['user_id', 'date', 'modality'])['duration'].sum().reset_index()
else:
    print("No phone lock data found for any user.")

# Process phone charge data
for user_id in range(60):
    charge_filename = os.path.join(charge_data_dir, f'phonecharge_u{user_id:02d}.csv')

    if os.path.exists(charge_filename):
        charge_data = pd.read_csv(charge_filename)
        charge_data = charge_data.rename(columns={'start': 'start_timestamp'})
        charge_data['start_timestamp'] = pd.to_datetime(charge_data['start_timestamp'], unit='s')
        charge_data['end'] = pd.to_datetime(charge_data['end'], unit='s')
        charge_data['user_id'] = user_id
        charge_dataframes.append(charge_data)

# Concatenate all phone charge dataframes into a single dataframe
if charge_dataframes:
    all_users_charge_data = pd.concat(charge_dataframes)
    all_users_charge_data['charge_status'] = 'charging'
    all_users_charge_data['duration'] = all_users_charge_data['end'] - all_users_charge_data['start_timestamp']
    all_users_charge_data['date'] = all_users_charge_data['start_timestamp'].dt.date
    charge_duration_per_day = all_users_charge_data.groupby(['user_id', 'date', 'charge_status'])['duration'].sum().reset_index()
else:
    print("No phone charge data found for any user.")

# Display and save phone lock data
if lock_duration_per_day is not None:
    print("Phone Lock Data:\n", lock_duration_per_day.head())
    lock_duration_per_day.to_csv("phone_lock_data.csv", index=False)

# Display and save phone charge data
if charge_duration_per_day is not None:
    print("Phone Charge Data:\n", charge_duration_per_day.head())
    charge_duration_per_day.to_csv("phone_charge_data.csv", index=False)

# Merge phone lock and charge data into a single dataframe
if lock_duration_per_day is not None and charge_duration_per_day is not None:
    combined_data = pd.merge(lock_duration_per_day, charge_duration_per_day, on=['user_id', 'date'], how='outer', suffixes=('_lock', '_charge'))
    print("Combined Data:\n", combined_data.head())
    combined_data.to_csv("combined_modality_charge_data.csv", index=False)
else:
    print("One of the datasets is missing, cannot combine.")


Phone Lock Data:
    user_id        date modality        duration
0        0  2013-03-27   locked 0 days 09:50:42
1        0  2013-03-28   locked 0 days 01:21:24
2        0  2013-03-29   locked 0 days 11:14:43
3        0  2013-03-30   locked 0 days 07:23:12
4        0  2013-03-31   locked 0 days 20:26:39
Phone Charge Data:
    user_id        date charge_status        duration
0        0  2013-03-27      charging 0 days 06:16:11
1        0  2013-03-28      charging 0 days 03:06:48
2        0  2013-03-29      charging 0 days 05:38:52
3        0  2013-03-30      charging 0 days 02:10:04
4        0  2013-03-31      charging 0 days 10:03:30
Combined Data:
    user_id        date modality   duration_lock charge_status duration_charge
0        0  2013-03-27   locked 0 days 09:50:42      charging 0 days 06:16:11
1        0  2013-03-28   locked 0 days 01:21:24      charging 0 days 03:06:48
2        0  2013-03-29   locked 0 days 11:14:43      charging 0 days 05:38:52
3        0  2013-03-30   loc

In [None]:
import pandas as pd
import os

# Path to the directory containing light files
light_data_dir = '/content/dataset/sensing/dark'

# List to store the light dataframes for each user
light_dataframes = []

# Loop through all users from u00 to u59
for user_id in range(60):
    # Construct the filename for the user's light data
    light_filename = os.path.join(light_data_dir, f'dark_u{user_id:02d}.csv')

    # Check if the file exists
    if os.path.exists(light_filename):
        # Load the user's light data
        light_data = pd.read_csv(light_filename)

        # Rename columns to match the expected column names
        light_data = light_data.rename(columns={'start': 'start_timestamp', 'end': 'end_timestamp'})

        # Convert timestamp to datetime format
        light_data['start_timestamp'] = pd.to_datetime(light_data['start_timestamp'], unit='s')
        light_data['end_timestamp'] = pd.to_datetime(light_data['end_timestamp'], unit='s')

        # Add a column for user ID
        light_data['user_id'] = user_id

        # Add the preprocessed data to the list
        light_dataframes.append(light_data)

# Concatenate all light dataframes into a single dataframe
if light_dataframes:
    all_users_light_data = pd.concat(light_dataframes)

    # Assume all records in 'dark' directory indicate dark environment
    all_users_light_data['light_condition'] = 'dark'

    # Calculate duration
    all_users_light_data['duration'] = all_users_light_data['end_timestamp'] - all_users_light_data['start_timestamp']
    all_users_light_data['date'] = all_users_light_data['start_timestamp'].dt.date

    # Group by user_id and date, then sum the durations
    dark_duration_per_day = all_users_light_data.groupby(['user_id', 'date'])['duration'].sum().reset_index()
    dark_duration_per_day['light_condition'] = 'dark'

    # Display the result
    print(dark_duration_per_day.head())
    dark_duration_per_day.to_csv("dark_duration_data.csv", index=False)
else:
    print("No data found for any user.")


   user_id        date        duration light_condition
0        0  2013-03-27 0 days 09:16:32            dark
1        0  2013-03-28 0 days 01:22:23            dark
2        0  2013-03-29 0 days 13:39:12            dark
3        0  2013-03-30 0 days 02:32:21            dark
4        0  2013-03-31 0 days 08:39:45            dark


In [None]:
import pandas as pd
import os

# Path to the directory containing conversation files
conversation_data_dir = '/content/dataset/sensing/conversation'

# List to store the conversation dataframes for each user
conversation_dataframes = []

# Loop through all users from u00 to u59
for user_id in range(60):
    # Construct the filename for the user's conversation data
    conversation_filename = os.path.join(conversation_data_dir, f'conversation_u{user_id:02d}.csv')

    # Check if the file exists
    if os.path.exists(conversation_filename):
        # Load the user's conversation data
        conversation_data = pd.read_csv(conversation_filename)

        # Rename columns to match the expected column names
        conversation_data = conversation_data.rename(columns={' end_timestamp': 'end_timestamp'})

        # Convert timestamp to datetime format
        conversation_data['start_timestamp'] = pd.to_datetime(conversation_data['start_timestamp'], unit='s')
        conversation_data['end_timestamp'] = pd.to_datetime(conversation_data['end_timestamp'], unit='s')

        # Add a column for user ID
        conversation_data['user_id'] = user_id

        # Add the preprocessed data to the list
        conversation_dataframes.append(conversation_data)

# Concatenate all conversation dataframes into a single dataframe
if conversation_dataframes:
    all_users_conversation_data = pd.concat(conversation_dataframes)

    # Calculate duration of each conversation
    all_users_conversation_data['duration'] = all_users_conversation_data['end_timestamp'] - all_users_conversation_data['start_timestamp']
    all_users_conversation_data['date'] = all_users_conversation_data['start_timestamp'].dt.date

    # Group by user_id and date, then sum the durations
    conversation_duration_per_day = all_users_conversation_data.groupby(['user_id', 'date'])['duration'].sum().reset_index()
    conversation_duration_per_day['activity'] = 'conversation'

    # Display the result
    print(conversation_duration_per_day.head())
    conversation_duration_per_day.to_csv("conversation_duration_per_day.csv", index=False)
else:
    print("No data found for any user.")


   user_id        date        duration      activity
0        0  2013-03-27 0 days 06:22:51  conversation
1        0  2013-03-28 0 days 05:48:22  conversation
2        0  2013-03-29 0 days 07:59:07  conversation
3        0  2013-03-30 0 days 08:48:02  conversation
4        0  2013-03-31 0 days 09:57:41  conversation


In [None]:
import json
import os
import csv
from datetime import datetime, timedelta

data_path = '/content/dataset/EMA/response/Sleep/'
output_file = '/content/sleep_data_per_day.csv'

# Create a CSV file and write the header
with open(output_file, 'w', newline='') as csvfile:
    fieldnames = ['user_id', 'date', 'hours_slept', "sleep_quality"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate through each user's sleep data
    for file_name in os.listdir(data_path):
        if file_name.endswith('.json'):
            with open(os.path.join(data_path, file_name), 'r') as file:
                try:
                    user_data = json.load(file)
                    user_id = file_name.split('.')[0]  # Extract the user ID from the file name
                    user_id = user_id.strip("Sleep_u")  # Clean up user ID

                    # Iterate through each entry in the user's sleep data
                    for entry in user_data:
                        if 'hour' in entry and 'rate' in entry and 'resp_time' in entry:
                            try:
                                hours_slept = float(entry['hour'])
                                sleep_quality_rate = int(entry['rate'])
                                timestamp = int(entry['resp_time'])
                                date = datetime.utcfromtimestamp(timestamp).date()  # Convert timestamp to date
                                previous_night = date - timedelta(days=1)  # Get the date of the previous night

                                # Write the data for the previous night
                                writer.writerow({
                                    'user_id': user_id,
                                    'date': previous_night,
                                    'hours_slept': hours_slept,
                                    'sleep_quality': sleep_quality_rate
                                })
                            except (ValueError, KeyError, TypeError) as e:
                                print(f"Skipping invalid entry in {file_name}: {entry}, error: {e}")
                except json.JSONDecodeError as e:
                    print(f"Skipping invalid JSON file: {file_name}, error: {e}")

print(f"Sleep data extraction complete. Output saved to {output_file}")


Sleep data extraction complete. Output saved to /content/sleep_data_per_day.csv


In [None]:
sleep_data_per_day = pd.read_csv("/content/sleep_data_per_day.csv")

In [None]:
# user_id,date,hours_slept,sleep_quality_rate
sleep_data_per_day["user_id"].value_counts()


user_id
59    73
16    65
57    61
58    58
44    58
19    55
0     55
32    47
10    47
33    41
8     41
49    41
17    40
4     34
30    34
51    34
18    33
36    32
53    31
14    30
3     28
1     27
12    26
2     26
35    25
43    25
7     24
23    23
52    23
24    23
46    23
56    21
22    20
15    19
27    16
25    15
41    14
42    14
54    12
20    12
31    11
45    11
39    10
47     9
34     7
50     6
5      6
9      3
13     1
Name: count, dtype: int64

In [None]:
import pandas as pd
import os

# Define the paths for each CSV file
sleep_data_file = '/content/sleep_data_per_day.csv'
activity_data_file = 'activity_data_simplified.csv'
audio_data_file = 'audio_data_simplified.csv'
lock_data_file = 'phone_lock_data.csv'
charge_data_file = 'phone_charge_data.csv'
dark_data_file = 'dark_duration_data.csv'
conversation_data_file = 'conversation_duration_per_day.csv'

# Read each CSV file into a dataframe
sleep_data_per_day = pd.read_csv(sleep_data_file)
activity_pivot = pd.read_csv(activity_data_file)
audio_pivot = pd.read_csv(audio_data_file)
lock_duration_per_day = pd.read_csv(lock_data_file)
charge_duration_per_day = pd.read_csv(charge_data_file)
dark_duration_per_day = pd.read_csv(dark_data_file)
conversation_duration_per_day = pd.read_csv(conversation_data_file)

# Rename 'duration' columns to make them unique before merging
lock_duration_per_day.rename(columns={'duration': 'lock_duration'}, inplace=True)
charge_duration_per_day.rename(columns={'duration': 'charge_duration'}, inplace=True)
dark_duration_per_day.rename(columns={'duration': 'dark_duration'}, inplace=True)
conversation_duration_per_day.rename(columns={'duration': 'conversation_duration'}, inplace=True)
# Merge datasets on user_id and date
merged_df = sleep_data_per_day.merge(conversation_duration_per_day, on=['user_id', 'date'], how='outer')
merged_df = merged_df.merge(dark_duration_per_day, on=['user_id', 'date'], how='outer')
merged_df = merged_df.merge(audio_pivot, on=['user_id', 'date'], how='outer')
merged_df = merged_df.merge(lock_duration_per_day, on=['user_id', 'date'], how='outer')
merged_df = merged_df.merge(charge_duration_per_day, on=['user_id', 'date'], how='outer')
merged_df = merged_df.merge(activity_pivot, on=['user_id', 'date'], how='outer')

# Fill NaN values with 0 for numerical columns and 'unknown' for categorical columns
for col in merged_df.columns:
    if 'duration' in col or 'hours_slept' in col or 'sleep_quality' in col:
        merged_df[col].fillna(0, inplace=True)
    else:
        merged_df[col].fillna('unknown', inplace=True)

# Remove any duplicate columns
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

# Save the merged dataframe to a CSV file
merged_df.to_csv("merged_user_data.csv", index=False)

# Display the result
print(merged_df.head())


   user_id        date  hours_slept  sleep_quality conversation_duration  \
0        1  2013-04-02          6.0            2.0       0 days 03:21:49   
1        1  2013-03-26          7.0            2.0                     0   
2        1  2013-03-26          7.0            2.0                     0   
3        1  2013-03-31          7.0            2.0       0 days 06:29:37   
4        1  2013-03-27         10.0            1.0       0 days 03:23:21   

       activity    dark_duration light_condition   Noise_duration  \
0  conversation                0         unknown  0 days 01:48:04   
1       unknown                0         unknown                0   
2       unknown                0         unknown                0   
3  conversation  0 days 10:56:55            dark  0 days 02:14:33   
4  conversation  0 days 15:30:14            dark  0 days 02:48:31   

  Silence_duration   Voice_duration modality    lock_duration charge_status  \
0  0 days 16:21:13  0 days 02:03:22   locked  0 d

In [None]:
import pandas as pd

# Load the merged user data
data = pd.read_csv('merged_user_data.csv')

# Convert timedelta columns to total seconds
time_columns = ['conversation_duration', 'dark_duration', 'Noise_duration', 'Silence_duration', 'Voice_duration', 'lock_duration', 'charge_duration', 'Running_duration', 'Stationary_duration', 'Walking_duration']
for col in time_columns:
    data[col] = pd.to_timedelta(data[col]).dt.total_seconds()

# Display the processed data
print(data.head())


   user_id        date  hours_slept  sleep_quality  conversation_duration  \
0        1  2013-04-02          6.0            2.0                12109.0   
1        1  2013-03-26          7.0            2.0                    0.0   
2        1  2013-03-26          7.0            2.0                    0.0   
3        1  2013-03-31          7.0            2.0                23377.0   
4        1  2013-03-27         10.0            1.0                12201.0   

       activity  dark_duration light_condition  Noise_duration  \
0  conversation            0.0         unknown          6484.0   
1       unknown            0.0         unknown             0.0   
2       unknown            0.0         unknown             0.0   
3  conversation        39415.0            dark          8073.0   
4  conversation        55814.0            dark         10111.0   

   Silence_duration  Voice_duration modality  lock_duration charge_status  \
0           58873.0          7402.0   locked        33144.0    

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Define features and target
features = ['hours_slept', 'conversation_duration', 'dark_duration', 'Noise_duration', 'Silence_duration', 'Voice_duration', 'lock_duration', 'charge_duration', 'Running_duration', 'Stationary_duration', 'Walking_duration']
target = 'sleep_quality'

# Prepare to store results
results = []

# Unique users
unique_users = data['user_id'].unique()

for user in unique_users:
    # Split the data
    train_data = data[data['user_id'] != user]
    test_data = data[data['user_id'] == user]

    # Train and test sets
    X_train = train_data[features]
    y_train = train_data[target]
    X_test = test_data[features]
    y_test = test_data[target]

    # Train the model
    model = DecisionTreeRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate the mean squared error
    mse = mean_squared_error(y_test, y_pred)

    # Store the results
    results.append({
        'user_id': user,
        'mse': mse,
        'y_test': y_test.values,
        'y_pred': y_pred
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display the results
print(results_df)


    user_id       mse                                             y_test  \
0         1  0.167879  [2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, ...   
1        39  0.075566  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...   
2        42  0.517857  [1.0, 2.0, 2.0, 3.0, 1.0, 1.0, 3.0, 2.0, 1.0, ...   
3        47  0.056604  [1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, ...   
4        51  0.403043  [2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, ...   
5        20  0.269914  [1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 4.0, 2.0, ...   
6         7  0.400323  [2.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 1.0, ...   
7        34  0.156863  [1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 0.0, 0.0, ...   
8         8  1.054054  [2.0, 2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, ...   
9         2  0.195618  [2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, ...   
10       22  0.896684  [1.0, 1.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, 1.0, ...   
11       15  0.520576  [2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 2.0, 2.0, 1.0, ...   
12       31 

In [None]:
from sklearn.tree import export_text

# Function to extract decision path for a single prediction
def get_decision_path(model, X):
    tree_rules = export_text(model, feature_names=features)
    decision_path = []
    for i, feature in enumerate(features):
        if X[feature] > 0:
            decision_path.append(feature)
    return decision_path

# Add decision paths to the results
decision_paths = []
for user in unique_users:
    test_data = data[data['user_id'] == user]
    X_test = test_data[features]

    for i, row in X_test.iterrows():
        decision_path = get_decision_path(model, row)
        decision_paths.append({
            'user_id': user,
            'decision_path': decision_path
        })

# Convert decision paths to a DataFrame
decision_paths_df = pd.DataFrame(decision_paths)

# Merge decision paths with the results
results_df = pd.merge(results_df, decision_paths_df, on='user_id')

# Display the results with decision paths
results_df


Unnamed: 0,user_id,mse,y_test,y_pred,decision_path
0,1,0.167879,"[2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, ...","[2.0, 1.8, 1.8, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, ...","[hours_slept, conversation_duration, Noise_dur..."
1,1,0.167879,"[2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, ...","[2.0, 1.8, 1.8, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, ...",[hours_slept]
2,1,0.167879,"[2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, ...","[2.0, 1.8, 1.8, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, ...",[hours_slept]
3,1,0.167879,"[2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, ...","[2.0, 1.8, 1.8, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, ...","[hours_slept, conversation_duration, dark_dura..."
4,1,0.167879,"[2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, ...","[2.0, 1.8, 1.8, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, ...","[hours_slept, conversation_duration, dark_dura..."
...,...,...,...,...,...
3114,43,0.394019,"[3.0, 3.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, ...","[1.0, 1.0, 1.8125, 2.0, 1.0, 1.0, 2.0, 2.0, 2....","[conversation_duration, dark_duration, Noise_d..."
3115,43,0.394019,"[3.0, 3.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, ...","[1.0, 1.0, 1.8125, 2.0, 1.0, 1.0, 2.0, 2.0, 2....","[conversation_duration, dark_duration, Noise_d..."
3116,43,0.394019,"[3.0, 3.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, ...","[1.0, 1.0, 1.8125, 2.0, 1.0, 1.0, 2.0, 2.0, 2....","[conversation_duration, dark_duration, Noise_d..."
3117,43,0.394019,"[3.0, 3.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, ...","[1.0, 1.0, 1.8125, 2.0, 1.0, 1.0, 2.0, 2.0, 2....","[dark_duration, lock_duration, charge_duration..."


In [None]:
model.score(X_test, y_test)

0.7089828242854307

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import _tree

# Function to extract detailed decision paths
def extract_detailed_decision_paths(tree, feature_names, X):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []

    def recurse(node, path, depth):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            path_left = f"{name} <= {threshold:.2f}"
            path_right = f"{name} > {threshold:.2f}"
            recurse(tree_.children_left[node], path + [path_left], depth + 1)
            recurse(tree_.children_right[node], path + [path_right], depth + 1)
        else:
            paths.append(path)

    for sample_id in range(X.shape[0]):
        node_indicator = tree.decision_path(X[sample_id].reshape(1, -1))
        leaf_id = tree.apply(X[sample_id].reshape(1, -1))
        path = []
        for node_id in node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]:
            if leaf_id[0] == node_id:
                continue
            threshold_sign = "<=" if X[sample_id, tree_.feature[node_id]] <= tree_.threshold[node_id] else ">"
            path.append(f"{feature_names[tree_.feature[node_id]]} {threshold_sign} {tree_.threshold[node_id]:.2f}")
        paths.append(path)

    return paths

# Example usage with a trained model
# Assuming 'model' is your trained DecisionTreeRegressor and 'X_test' is your test set
detailed_paths = extract_detailed_decision_paths(model, features, X_test.values)
print(detailed_paths)

detailed_paths

[['hours_slept > 0.50', 'hours_slept > 6.50', 'hours_slept <= 7.50', 'conversation_duration > 2368.00', 'Voice_duration <= 20515.00', 'Voice_duration <= 14457.50', 'Silence_duration <= 72777.50', 'Silence_duration <= 65816.00', 'Stationary_duration > 26544.00', 'Walking_duration > 1361.50', 'Stationary_duration <= 126680.00', 'Stationary_duration <= 124546.50', 'Silence_duration > 16886.00', 'conversation_duration > 15240.50', 'conversation_duration > 16175.00', 'Walking_duration <= 6571.50', 'Noise_duration > 8774.00', 'Noise_duration > 9364.50', 'Stationary_duration > 37225.50', 'dark_duration > 39469.50', 'dark_duration <= 51074.00', 'Silence_duration > 43458.50'], ['hours_slept > 0.50', 'hours_slept > 6.50', 'hours_slept <= 7.50', 'conversation_duration > 2368.00', 'Voice_duration <= 20515.00', 'Voice_duration <= 14457.50', 'Silence_duration <= 72777.50', 'Silence_duration <= 65816.00', 'Stationary_duration > 26544.00', 'Walking_duration > 1361.50', 'Stationary_duration <= 126680.0



[['hours_slept > 0.50',
  'hours_slept > 6.50',
  'hours_slept <= 7.50',
  'conversation_duration > 2368.00',
  'Voice_duration <= 20515.00',
  'Voice_duration <= 14457.50',
  'Silence_duration <= 72777.50',
  'Silence_duration <= 65816.00',
  'Stationary_duration > 26544.00',
  'Walking_duration > 1361.50',
  'Stationary_duration <= 126680.00',
  'Stationary_duration <= 124546.50',
  'Silence_duration > 16886.00',
  'conversation_duration > 15240.50',
  'conversation_duration > 16175.00',
  'Walking_duration <= 6571.50',
  'Noise_duration > 8774.00',
  'Noise_duration > 9364.50',
  'Stationary_duration > 37225.50',
  'dark_duration > 39469.50',
  'dark_duration <= 51074.00',
  'Silence_duration > 43458.50'],
 ['hours_slept > 0.50',
  'hours_slept > 6.50',
  'hours_slept <= 7.50',
  'conversation_duration > 2368.00',
  'Voice_duration <= 20515.00',
  'Voice_duration <= 14457.50',
  'Silence_duration <= 72777.50',
  'Silence_duration <= 65816.00',
  'Stationary_duration > 26544.00',
  '

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import _tree

# Function to extract detailed decision paths with context
def extract_detailed_decision_paths_with_context(tree, feature_names, X):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []

    def recurse(node, path, depth):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            path_left = f"{name} <= {threshold:.2f}"
            path_right = f"{name} > {threshold:.2f}"
            recurse(tree_.children_left[node], path + [path_left], depth + 1)
            recurse(tree_.children_right[node], path + [path_right], depth + 1)
        else:
            paths.append(path)

    for sample_id in range(X.shape[0]):
        node_indicator = tree.decision_path(X[sample_id].reshape(1, -1))
        leaf_id = tree.apply(X[sample_id].reshape(1, -1))
        path = []
        for node_id in node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]:
            if leaf_id[0] == node_id:
                continue
            threshold_sign = "<=" if X[sample_id, tree_.feature[node_id]] <= tree_.threshold[node_id] else ">"
            context = ""
            if feature_names[tree_.feature[node_id]] == 'hours_slept':
                context = " (Previous day sleep duration)"
            elif feature_names[tree_.feature[node_id]] == 'Noise_duration':
                context = " (Duration of noise)"
            elif feature_names[tree_.feature[node_id]] == 'dark_duration':
                context = " (Time spent in a dark room)"
            path.append(f"{feature_names[tree_.feature[node_id]]} {threshold_sign} {tree_.threshold[node_id]:.2f}{context}")
        paths.append(path)

    return paths

# Example usage with a trained model
detailed_paths_with_context = extract_detailed_decision_paths_with_context(model, features, X_test.values)
print(detailed_paths_with_context)
detailed_paths_with_context

[['hours_slept > 0.50 (Previous day sleep duration)', 'hours_slept > 6.50 (Previous day sleep duration)', 'hours_slept <= 7.50 (Previous day sleep duration)', 'conversation_duration > 2368.00', 'Voice_duration <= 20515.00', 'Voice_duration <= 14457.50', 'Silence_duration <= 72777.50', 'Silence_duration <= 65816.00', 'Stationary_duration > 26544.00', 'Walking_duration > 1361.50', 'Stationary_duration <= 126680.00', 'Stationary_duration <= 124546.50', 'Silence_duration > 16886.00', 'conversation_duration > 15240.50', 'conversation_duration > 16175.00', 'Walking_duration <= 6571.50', 'Noise_duration > 8774.00 (Duration of noise)', 'Noise_duration > 9364.50 (Duration of noise)', 'Stationary_duration > 37225.50', 'dark_duration > 39469.50 (Time spent in a dark room)', 'dark_duration <= 51074.00 (Time spent in a dark room)', 'Silence_duration > 43458.50'], ['hours_slept > 0.50 (Previous day sleep duration)', 'hours_slept > 6.50 (Previous day sleep duration)', 'hours_slept <= 7.50 (Previous d



[['hours_slept > 0.50 (Previous day sleep duration)',
  'hours_slept > 6.50 (Previous day sleep duration)',
  'hours_slept <= 7.50 (Previous day sleep duration)',
  'conversation_duration > 2368.00',
  'Voice_duration <= 20515.00',
  'Voice_duration <= 14457.50',
  'Silence_duration <= 72777.50',
  'Silence_duration <= 65816.00',
  'Stationary_duration > 26544.00',
  'Walking_duration > 1361.50',
  'Stationary_duration <= 126680.00',
  'Stationary_duration <= 124546.50',
  'Silence_duration > 16886.00',
  'conversation_duration > 15240.50',
  'conversation_duration > 16175.00',
  'Walking_duration <= 6571.50',
  'Noise_duration > 8774.00 (Duration of noise)',
  'Noise_duration > 9364.50 (Duration of noise)',
  'Stationary_duration > 37225.50',
  'dark_duration > 39469.50 (Time spent in a dark room)',
  'dark_duration <= 51074.00 (Time spent in a dark room)',
  'Silence_duration > 43458.50'],
 ['hours_slept > 0.50 (Previous day sleep duration)',
  'hours_slept > 6.50 (Previous day sleep

In [None]:
# Function to map model predictions to qualitative sleep quality levels
def map_to_qualitative_level(predictions):
    qualitative_levels = []
    for pred in predictions:
        if pred < 1.5:
            qualitative_levels.append(1)  # Very good sleep quality
        elif pred < 2.5:
            qualitative_levels.append(2)  # Fairly good sleep quality
        elif pred < 3.5:
            qualitative_levels.append(3)  # Fairly bad sleep quality
        else:
            qualitative_levels.append(4)  # Very bad sleep quality
    return qualitative_levels

# Apply the mapping to model predictions
y_pred_qualitative = map_to_qualitative_level(model.predict(X_test))



In [None]:
pip install google-generativeai



In [None]:
# AIzaSyCNE4SQ5JtJ6q3Nue4-gcdGH8U1ksAbFgE

import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

genai.configure(api_key="AIzaSyCNE4SQ5JtJ6q3Nue4-gcdGH8U1ksAbFgE")
gemini = genai.GenerativeModel('gemini-pro')


# Function to convert decision path to human-readable format
def format_decision_path(decision_path):
    formatted_path = []
    for condition in decision_path:
        # Convert conditions to human-readable format
        formatted_condition = condition.replace('_', ' ').capitalize()
        formatted_path.append(formatted_condition)
    return ' -> '.join(formatted_path)

# Function to generate personalized suggestions using GPT
def generate_suggestions(formatted_path, forecast):
    forecast_str = ["Very good sleep quality", "Fairly good sleep quality", "Fairly bad sleep quality", "Very bad sleep quality"][forecast - 1]
    prompt = f"""
    We used a decision tree to forecast the sleep quality of a person based on their past behavior. The resulting decision path is the following:
    Decision path from the root to the decision leaf: {" ".join(formatted_path)}
    The resulting sleep forecast is: {forecast_str}
    Considering the decision path and the decision tree prediction, please provide personalized natural language suggestions for the person to improve sleep quality.
    """

    response = gemini.generate_content(prompt)
    return response.text

path = ['hours_slept <= 0.50 (Previous day sleep duration)',
            'Silence_duration > 0.50',
            'Running_duration > 995.00',
            'Running_duration > 996.50',
            'charge_duration <= 33509.50',
            'Walking_duration > 1758.50']
to_markdown(generate_suggestions(path, int(model.predict([X_test.iloc[0, :]])[0])))

# # Generate suggestions for each user
# results_df['formatted_decision_path'] = results_df['decision_path'].apply(format_decision_path)
# results_df['suggestions'] = results_df['formatted_decision_path'].apply(generate_suggestions)

# # Display the results with suggestions
# print(results_df[['user_id', 'mse', 'suggestions']])



> **Personalized suggestions for improving sleep quality based on the decision tree prediction and decision path:**
> 
> * Aim for at least 7-9 hours of sleep each night. Your decision path indicates that you may be getting less than 5 hours of sleep. Aiming for more sleep could improve its quality.
> * **Ensure a quiet sleeping environment**. Your decision tree suggests that having a quiet environment may contribute to a better sleep quality. Try using earplugs or a white noise machine to block out noise.
> * **Incorporate more running into your daily routine**. Your decision tree indicates that running may enhance sleep quality. Aim for at least 30 minutes of moderate-intensity running most days of the week.
> * **Consider reducing your screen time before bed**. Your decision tree suggests that using electronic devices close to bedtime may interfere with sleep quality. Try to avoid using screens for at least an hour before you go to bed.
> * **Maintain a consistent sleep-wake cycle**. Go to bed and wake up around the same time every day, even on weekends. This helps to regulate your body's natural sleep-wake cycle.

In [None]:
X_test.iloc[0, :]

hours_slept                  7.0
conversation_duration    25753.0
dark_duration            46941.0
Noise_duration           13788.0
Silence_duration         51332.0
Voice_duration            6496.0
lock_duration            47984.0
charge_duration          40507.0
Running_duration            24.0
Stationary_duration      77812.0
Walking_duration          4304.0
Name: 1365, dtype: float64

In [None]:
model.predict([X_test.iloc[0, :].values])[0]



1.0

In [None]:
import pickle

with open("model.pkl", 'wb') as m:
  pickle.dump(model, m)

AttributeError: module 'sklearn' has no attribute '_version_'