In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

In [76]:
df = pd.read_csv("dataset_mood_smartphone.csv")
df['time'] = pd.to_datetime(df['time'])
df['date'] = df['time'].dt.date
df.rename(columns={'Unnamed: 0': 'index'}, inplace=True)

df

Unnamed: 0,index,id,time,variable,value,date
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.000,2014-02-26
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.000,2014-02-26
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.000,2014-02-26
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.000,2014-02-26
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.000,2014-02-27
...,...,...,...,...,...,...
376907,2770399,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032,2014-04-11
376908,2772465,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008,2014-04-19
376909,2774026,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026,2014-04-26
376910,2774133,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033,2014-04-27


In [77]:
df_filtered = df[df['variable'].isin(['sms', 'call'])]

# Aggregate counts by individual and event type
counts = df_filtered.groupby(['id', 'variable'])['value'].size().reset_index(name='count')

# Determine the IQR for outlier detection
Q1 = counts['count'].quantile(0.25)
Q3 = counts['count'].quantile(0.75)
IQR = Q3 - Q1
outliers_threshold_low = Q1 - 1.5 * IQR
outliers_threshold_high = Q3 + 1.5 * IQR

# Identify outliers
outliers = counts[(counts['count'] < outliers_threshold_low) | (counts['count'] > outliers_threshold_high)]

In [78]:
# Filter for 'sms' and 'call' events
df_sms_call = df[df['variable'].isin(['sms', 'call'])]

# Group by individual ('id'), variable, and date to count daily events
daily_counts = df_sms_call.groupby(['id', 'variable', df_sms_call['time'].dt.date]).size().reset_index(name='daily_count')


In [79]:
# Calculate IQR for daily counts
Q1 = daily_counts['daily_count'].quantile(0.25)
Q3 = daily_counts['daily_count'].quantile(0.75)
IQR = Q3 - Q1

# Define outliers as those beyond 1.5 times the IQR from the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outlier rows based on daily counts
outliers = daily_counts[(daily_counts['daily_count'] < lower_bound) | (daily_counts['daily_count'] > upper_bound)]


In [80]:
nan_rows_arousal = df[(df['variable'] == 'circumplex.arousal') & (df['value'].isna())].index
nan_rows_valence = df[(df['variable'] == 'circumplex.valence') & (df['value'].isna())].index
nan_rows_activity = df[(df['variable'] == 'activity') & (df['value'].isna())].index


# Combine the indices of rows with NaN values for arousal and valence
nan_rows_combined = nan_rows_arousal.union(nan_rows_valence)

In [81]:
# List of variables/categories to check for negative values, excluding mood, arousal, and valence
variables_to_check = [variable for variable in df['variable'].unique() if variable not in ['mood', 'circumplex.arousal', 'circumplex.valence']]

# Check for negative values in the remaining variables
negative_values_check = {variable: (df[df['variable'] == variable]['value'] < 0).any() for variable in variables_to_check}

negative_values_check

# Identify rows with negative values in appCat.builtin and appCat.entertainment in the original dataset
negative_values_builtin = df[(df['variable'] == 'appCat.builtin') & (df['value'] < 0)].index
negative_values_entertainment = df[(df['variable'] == 'appCat.entertainment') & (df['value'] < 0)].index

#neg combined 
neg = negative_values_builtin.union(negative_values_entertainment)
# Combine the indices of rows with negative values for appCat.builtin and appCat.entertainment
# with previously identified NaN rows for removal
remove_combined = nan_rows_combined.union(negative_values_builtin).union(negative_values_entertainment)

df_negative = df.loc[neg]

In [82]:
combined = df.loc[remove_combined]
combined
df1 = df.drop(combined.index)

In [83]:
# Initialize a DataFrame to store outliers
outliers_df = pd.DataFrame()

variables_for_box_plots_all = [
    'screen', 'appCat.builtin', 'appCat.communication', 'appCat.entertainment',
    'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social',
    'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather'
]
# Iterate over each variable to remove outliers, saving them first
for variable in variables_for_box_plots_all:
    # Isolate the current variable's data
    var_df = df1[df1['variable'] == variable]
    
    # Calculate IQR and determine bounds for outliers
    Q1 = var_df['value'].quantile(0.25)
    Q3 = var_df['value'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Identifying outliers
    outliers = var_df[(var_df['value'] < lower_bound) | (var_df['value'] > upper_bound)]
    
    # Append outliers to the outliers_df DataFrame
    outliers_df = pd.concat([outliers_df, outliers], ignore_index=True)
    
    # Identifying indexes of rows that are not outliers to keep in the original dataframe
    non_outliers_index = var_df[(var_df['value'] >= lower_bound) & (var_df['value'] <= upper_bound)].index
    
    # Update df to only include rows that are not outliers for the current variable
    df1 = df1[(df1.index.isin(non_outliers_index)) | (df1['variable'] != variable)]


In [84]:
score_variables = ["mood", "circumplex.arousal", "circumplex.valence", "activity"]

# Creating a dataset with only the selected variables
df_score = df1[df1['variable'].isin(score_variables)]

# Creating another dataset with the rest of the variables
df_machine = df1[~df1['variable'].isin(score_variables)]

df_score['date'] = df_score['time'].dt.date
df_machine['date'] = df_machine['time'].dt.date


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score['date'] = df_score['time'].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_machine['date'] = df_machine['time'].dt.date


In [85]:
df_scores_daily = df_score.drop('time', axis = 1)
df_machine_daily = df_machine.drop('time', axis = 1)

In [86]:
grouped_scores = df_scores_daily.groupby(['id', 'date', 'variable'])['value'].mean().reset_index()
grouped_scores
grouped_times = df_machine_daily.groupby(['id', 'date', 'variable'])['value'].sum().reset_index()
grouped_times

Unnamed: 0,id,date,variable,value
0,AS14.01,2014-02-17,call,2.000000
1,AS14.01,2014-02-18,call,1.000000
2,AS14.01,2014-02-19,call,7.000000
3,AS14.01,2014-02-19,sms,2.000000
4,AS14.01,2014-02-20,call,2.000000
...,...,...,...,...
10255,AS14.33,2014-05-30,appCat.travel,915.714000
10256,AS14.33,2014-05-30,appCat.unknown,8.072000
10257,AS14.33,2014-05-30,appCat.utilities,155.922000
10258,AS14.33,2014-05-30,call,4.000000


In [87]:
scores_pivot_df = df_scores_daily.pivot_table(index=['id','date'], columns='variable', values='value', aggfunc='mean').reset_index()
scores_pivot_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood
0,AS14.01,2014-02-26,,-0.250000,0.750000,6.250000
1,AS14.01,2014-02-27,,0.000000,0.333333,6.333333
2,AS14.01,2014-03-20,0.081548,,,
3,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000
4,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000
...,...,...,...,...,...,...
1301,AS14.33,2014-05-27,0.012704,-0.600000,0.400000,6.200000
1302,AS14.33,2014-05-28,0.103301,0.000000,1.200000,8.200000
1303,AS14.33,2014-05-29,0.169354,-1.333333,1.000000,7.000000
1304,AS14.33,2014-05-30,0.192901,-0.800000,-0.400000,6.800000


In [88]:
time_pivot_df = df_machine_daily.pivot_table(index=['id','date'], columns='variable', values='value', aggfunc='sum').reset_index()

In [89]:
app_categories_columns = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather"
]

# Ensure the DataFrame has these columns; this prevents KeyError if some columns don't exist
existing_app_columns = [col for col in app_categories_columns if col in time_pivot_df.columns]

# Replace NaN values with 0 for the specified app category columns
time_pivot_df[existing_app_columns] = time_pivot_df[existing_app_columns].fillna(0)
time_pivot_df



variable,id,date,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-17,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,2.0,,
1,AS14.01,2014-02-18,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,1.0,,
2,AS14.01,2014-02-19,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,7.0,,2.0
3,AS14.01,2014-02-20,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,2.0,,3.0
4,AS14.01,2014-02-21,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910,AS14.33,2014-05-26,571.902,750.571,79.963,0.0,0.0,0.000,140.446,1508.191,0.000,0.000,0.000,0.0,10.0,1118.782999,3.0
1911,AS14.33,2014-05-27,269.550,771.182,32.137,0.0,0.0,0.000,57.535,1252.079,0.000,0.000,56.173,0.0,1.0,1508.516001,2.0
1912,AS14.33,2014-05-28,1093.741,1057.568,153.121,0.0,0.0,38.262,208.962,1817.695,0.000,0.000,30.666,0.0,10.0,3414.742998,1.0
1913,AS14.33,2014-05-29,248.063,198.112,5.018,0.0,0.0,0.000,29.202,377.179,0.939,0.000,3.199,0.0,5.0,395.046000,1.0


In [90]:
merged_df = pd.merge(scores_pivot_df, time_pivot_df, on=['id','date'], how='outer')
merged_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-17,,,,,0.000,0.000,0.000,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.0,2.0,,
1,AS14.01,2014-02-18,,,,,0.000,0.000,0.000,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.0,1.0,,
2,AS14.01,2014-02-19,,,,,0.000,0.000,0.000,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.0,7.0,,2.0
3,AS14.01,2014-02-20,,,,,0.000,0.000,0.000,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.0,2.0,,3.0
4,AS14.01,2014-02-21,,,,,0.000,0.000,0.000,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.0,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,AS14.33,2014-05-27,0.012704,-0.600000,0.4,6.2,269.550,771.182,32.137,0.0,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.0,1.0,1508.516001,2.0
1969,AS14.33,2014-05-28,0.103301,0.000000,1.2,8.2,1093.741,1057.568,153.121,0.0,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.0,10.0,3414.742998,1.0
1970,AS14.33,2014-05-29,0.169354,-1.333333,1.0,7.0,248.063,198.112,5.018,0.0,...,0.000,29.202,377.179,0.939,0.000,3.199,0.0,5.0,395.046000,1.0
1971,AS14.33,2014-05-30,0.192901,-0.800000,-0.4,6.8,930.885,992.321,21.178,0.0,...,0.000,52.610,1706.765,915.714,8.072,155.922,0.0,4.0,3525.688999,


NOW SPLIT INTO TRAIN TEST SETS, THEN ADD ALL INTERPOLATION STUFF TO BOTH

In [91]:
tscv = TimeSeriesSplit(n_splits=2) #ONLY 1 SPLIT = change to 5 for kfoldcross
merged_df = merged_df.sort_values('date') 
for train_index, test_index in tscv.split(merged_df):
    train_df = merged_df.iloc[train_index]
    test_df = merged_df.iloc[test_index]
test_df = test_df.sort_values(by=['id', 'date'])
train_df = train_df.sort_values(by=['id', 'date'])

In [92]:
columns_to_interpolate = ['call',  'sms']


train_df[columns_to_interpolate] = train_df[columns_to_interpolate].interpolate(method='linear', limit_direction='forward', axis=0)
train_df[columns_to_interpolate] = train_df[columns_to_interpolate].interpolate(method='linear', limit_direction='backward', axis=0)
test_df[columns_to_interpolate] = test_df[columns_to_interpolate].interpolate(method='linear', limit_direction='forward', axis=0)
test_df[columns_to_interpolate] = test_df[columns_to_interpolate].interpolate(method='linear', limit_direction='backward', axis=0)
train_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-17,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,2.0
1,AS14.01,2014-02-18,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,2.0
2,AS14.01,2014-02-19,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,,2.0
3,AS14.01,2014-02-20,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,3.0
4,AS14.01,2014-02-21,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1922,AS14.33,2014-04-11,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,1.0
1923,AS14.33,2014-04-12,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,1.0
1924,AS14.33,2014-04-13,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,1.0
1925,AS14.33,2014-04-14,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,1.0


In [93]:
columns_to_interpolate = ['call', 'sms', 'activity']
train_df[columns_to_interpolate] = train_df.groupby('id')[columns_to_interpolate].transform(lambda group: group.interpolate(method='linear', limit_direction='both', axis=0))
test_df[columns_to_interpolate] = test_df.groupby('id')[columns_to_interpolate].transform(lambda group: group.interpolate(method='linear', limit_direction='both', axis=0))

train_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-17,0.081548,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,2.0
1,AS14.01,2014-02-18,0.081548,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,2.0
2,AS14.01,2014-02-19,0.081548,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,,2.0
3,AS14.01,2014-02-20,0.081548,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,3.0
4,AS14.01,2014-02-21,0.081548,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1922,AS14.33,2014-04-11,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,1.0
1923,AS14.33,2014-04-12,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,1.0
1924,AS14.33,2014-04-13,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,1.0
1925,AS14.33,2014-04-14,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,1.0


In [94]:
columns_to_check = ['mood']


train_df_cleaned = train_df.dropna(subset=columns_to_check)
test_df = test_df.dropna(subset=columns_to_check)
train_df_cleaned

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
7,AS14.01,2014-02-26,0.081548,-0.250000,0.750000,6.250000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,1.0,,2.000000
8,AS14.01,2014-02-27,0.081548,0.000000,0.333333,6.333333,,,,,...,,,,,,,,2.5,,1.666667
26,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.0,229.138,0.000,6.0,2867.963000,1.250000
27,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.0,53.188,0.000,3.0,3038.464000,1.000000
28,AS14.01,2014-03-23,0.142741,0.200000,0.800000,6.800000,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.0,30.086,30.386,6.5,2780.277001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,AS14.32,2014-04-11,0.133644,0.333333,0.333333,7.666667,281.537,1247.846,423.783,0.000,...,0.000,58.868,0.000,0.000,0.0,13.118,0.000,6.0,2786.956000,1.000000
1839,AS14.32,2014-04-12,0.092561,0.200000,0.800000,8.600000,169.231,649.200,463.088,0.000,...,0.000,49.503,92.404,0.000,0.0,0.000,0.000,5.0,1274.172000,2.000000
1840,AS14.32,2014-04-13,0.114868,-1.800000,0.600000,8.000000,155.240,765.842,327.590,0.000,...,0.000,82.591,36.799,32.605,0.0,0.000,0.000,6.0,1311.709999,2.000000
1841,AS14.32,2014-04-14,0.092765,0.500000,0.500000,7.500000,317.907,1107.541,535.738,0.000,...,0.000,54.104,0.000,32.210,0.0,0.000,0.000,3.0,1886.964999,2.000000


In [95]:
not_null_columns = [
    "circumplex.valence"
]

# Ensure the DataFrame has these columns; this prevents KeyError if some columns don't exist
existing_app_columns = [col for col in not_null_columns if col in train_df_cleaned.columns]

# Replace NaN values with 0 for the specified app category columns
train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)
test_df[existing_app_columns] = test_df[existing_app_columns].fillna(0)
train_df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)


variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
7,AS14.01,2014-02-26,0.081548,-0.250000,0.750000,6.250000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,1.0,,2.000000
8,AS14.01,2014-02-27,0.081548,0.000000,0.333333,6.333333,,,,,...,,,,,,,,2.5,,1.666667
26,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.0,229.138,0.000,6.0,2867.963000,1.250000
27,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.0,53.188,0.000,3.0,3038.464000,1.000000
28,AS14.01,2014-03-23,0.142741,0.200000,0.800000,6.800000,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.0,30.086,30.386,6.5,2780.277001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,AS14.32,2014-04-11,0.133644,0.333333,0.333333,7.666667,281.537,1247.846,423.783,0.000,...,0.000,58.868,0.000,0.000,0.0,13.118,0.000,6.0,2786.956000,1.000000
1839,AS14.32,2014-04-12,0.092561,0.200000,0.800000,8.600000,169.231,649.200,463.088,0.000,...,0.000,49.503,92.404,0.000,0.0,0.000,0.000,5.0,1274.172000,2.000000
1840,AS14.32,2014-04-13,0.114868,-1.800000,0.600000,8.000000,155.240,765.842,327.590,0.000,...,0.000,82.591,36.799,32.605,0.0,0.000,0.000,6.0,1311.709999,2.000000
1841,AS14.32,2014-04-14,0.092765,0.500000,0.500000,7.500000,317.907,1107.541,535.738,0.000,...,0.000,54.104,0.000,32.210,0.0,0.000,0.000,3.0,1886.964999,2.000000


In [96]:
app_categories_columns = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather",
]

# Ensure the DataFrame has these columns; this prevents KeyError if some columns don't exist
existing_app_columns = [col for col in app_categories_columns if col in train_df_cleaned.columns]

# Replace NaN values with 0 for the specified app category columns
train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)
test_df[existing_app_columns] = test_df[existing_app_columns].fillna(0)
train_df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)


variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
7,AS14.01,2014-02-26,0.081548,-0.250000,0.750000,6.250000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,1.0,,2.000000
8,AS14.01,2014-02-27,0.081548,0.000000,0.333333,6.333333,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,2.5,,1.666667
26,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.0,229.138,0.000,6.0,2867.963000,1.250000
27,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.0,53.188,0.000,3.0,3038.464000,1.000000
28,AS14.01,2014-03-23,0.142741,0.200000,0.800000,6.800000,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.0,30.086,30.386,6.5,2780.277001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,AS14.32,2014-04-11,0.133644,0.333333,0.333333,7.666667,281.537,1247.846,423.783,0.000,...,0.000,58.868,0.000,0.000,0.0,13.118,0.000,6.0,2786.956000,1.000000
1839,AS14.32,2014-04-12,0.092561,0.200000,0.800000,8.600000,169.231,649.200,463.088,0.000,...,0.000,49.503,92.404,0.000,0.0,0.000,0.000,5.0,1274.172000,2.000000
1840,AS14.32,2014-04-13,0.114868,-1.800000,0.600000,8.000000,155.240,765.842,327.590,0.000,...,0.000,82.591,36.799,32.605,0.0,0.000,0.000,6.0,1311.709999,2.000000
1841,AS14.32,2014-04-14,0.092765,0.500000,0.500000,7.500000,317.907,1107.541,535.738,0.000,...,0.000,54.104,0.000,32.210,0.0,0.000,0.000,3.0,1886.964999,2.000000


In [97]:
columns_to_sum = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather"
]

# Use apply to iterate over rows for rows where 'screen' is NaN
train_df_cleaned.loc[train_df_cleaned['screen'].isna(), 'screen'] = train_df_cleaned[train_df_cleaned['screen'].isna()].apply(
    lambda row: row[columns_to_sum].sum(), axis=1)
test_df.loc[test_df['screen'].isna(), 'screen'] = test_df[test_df['screen'].isna()].apply(
    lambda row: row[columns_to_sum].sum(), axis=1)

train_df_cleaned

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
7,AS14.01,2014-02-26,0.081548,-0.250000,0.750000,6.250000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,1.0,0.000000,2.000000
8,AS14.01,2014-02-27,0.081548,0.000000,0.333333,6.333333,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,2.5,0.000000,1.666667
26,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.0,229.138,0.000,6.0,2867.963000,1.250000
27,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.0,53.188,0.000,3.0,3038.464000,1.000000
28,AS14.01,2014-03-23,0.142741,0.200000,0.800000,6.800000,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.0,30.086,30.386,6.5,2780.277001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,AS14.32,2014-04-11,0.133644,0.333333,0.333333,7.666667,281.537,1247.846,423.783,0.000,...,0.000,58.868,0.000,0.000,0.0,13.118,0.000,6.0,2786.956000,1.000000
1839,AS14.32,2014-04-12,0.092561,0.200000,0.800000,8.600000,169.231,649.200,463.088,0.000,...,0.000,49.503,92.404,0.000,0.0,0.000,0.000,5.0,1274.172000,2.000000
1840,AS14.32,2014-04-13,0.114868,-1.800000,0.600000,8.000000,155.240,765.842,327.590,0.000,...,0.000,82.591,36.799,32.605,0.0,0.000,0.000,6.0,1311.709999,2.000000
1841,AS14.32,2014-04-14,0.092765,0.500000,0.500000,7.500000,317.907,1107.541,535.738,0.000,...,0.000,54.104,0.000,32.210,0.0,0.000,0.000,3.0,1886.964999,2.000000


In [98]:
def custom_3day_avg(series):

    result = np.empty_like(series, dtype=float)
    result[:] = np.nan  # Initialize with NaN
    
    # Iterate over the series using index for access
    for i in range(len(series)):
        if i == 0:
            result[i] = np.nan  # First day is NaN
        elif i == 1:
            result[i] = series.iloc[i-1]  # Second day is the value of the first day
        elif i == 2:
            result[i] = (series.iloc[i-2] + series.iloc[i-1]) / 2  # Third day is the average of the first two days
        else:
            result[i] = (series.iloc[i-3] + series.iloc[i-2] + series.iloc[i-1]) / 3  # From the fourth day, use a regular 3-day average
    
    return pd.Series(result, index=series.index)
    
train_df_cleaned['3_day_avg_mood'] = train_df_cleaned.groupby('id')['mood'].transform(custom_3day_avg)
train_df_cleaned['3_day_avg_activity'] = train_df_cleaned.groupby('id')['activity'].transform(custom_3day_avg)
train_df_cleaned['3_day_avg_mood'] = train_df_cleaned.groupby('id')['3_day_avg_mood'].fillna(method='ffill')
train_df_cleaned['3_day_avg_activity'] = train_df_cleaned.groupby('id')['3_day_avg_activity'].fillna(method='ffill')
train_df_cleaned['3_day_avg_mood'] = train_df_cleaned.groupby('id')['3_day_avg_mood'].fillna(method='bfill')
train_df_cleaned['3_day_avg_activity'] = train_df_cleaned.groupby('id')['3_day_avg_activity'].fillna(method='bfill')
test_df['3_day_avg_mood'] = test_df.groupby('id')['mood'].transform(custom_3day_avg)
test_df['3_day_avg_activity'] = test_df.groupby('id')['activity'].transform(custom_3day_avg)
test_df['3_day_avg_mood'] = test_df.groupby('id')['3_day_avg_mood'].fillna(method='ffill')
test_df['3_day_avg_activity'] = test_df.groupby('id')['3_day_avg_activity'].fillna(method='ffill')
test_df['3_day_avg_mood'] = test_df.groupby('id')['3_day_avg_mood'].fillna(method='bfill')
test_df['3_day_avg_activity'] = test_df.groupby('id')['3_day_avg_activity'].fillna(method='bfill')
train_df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['3_day_avg_mood'] = train_df_cleaned.groupby('id')['mood'].transform(custom_3day_avg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['3_day_avg_activity'] = train_df_cleaned.groupby('id')['activity'].transform(custom_3day_avg)
  train_df_cleaned['3_day_avg_mood'] = train_df_cleaned.groupby('id')['3_day_avg_mood'].fillna(method='ffill')
  train_df_cleaned['3_day_avg_mood'] = train_df_cleaned.groupby('id')['3_day_avg_mood'].fillna(method='ffill')
A value is trying to be set on a c

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms,3_day_avg_mood,3_day_avg_activity
7,AS14.01,2014-02-26,0.081548,-0.250000,0.750000,6.250000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.0,0.000,0.000,1.0,0.000000,2.000000,6.250000,0.081548
8,AS14.01,2014-02-27,0.081548,0.000000,0.333333,6.333333,0.000,0.000,0.000,0.000,...,0.000,0.000,0.0,0.000,0.000,2.5,0.000000,1.666667,6.250000,0.081548
26,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000,1090.617,2850.042,100.898,49.544,...,3121.747,167.373,0.0,229.138,0.000,6.0,2867.963000,1.250000,6.291667,0.081548
27,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000,476.737,1946.683,4.010,21.076,...,439.632,37.305,0.0,53.188,0.000,3.0,3038.464000,1.000000,6.261111,0.099048
28,AS14.01,2014-03-23,0.142741,0.200000,0.800000,6.800000,540.915,2680.535,4.028,43.403,...,649.414,0.000,0.0,30.086,30.386,6.5,2780.277001,1.000000,6.311111,0.150826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,AS14.32,2014-04-11,0.133644,0.333333,0.333333,7.666667,281.537,1247.846,423.783,0.000,...,0.000,0.000,0.0,13.118,0.000,6.0,2786.956000,1.000000,7.794444,0.079512
1839,AS14.32,2014-04-12,0.092561,0.200000,0.800000,8.600000,169.231,649.200,463.088,0.000,...,92.404,0.000,0.0,0.000,0.000,5.0,1274.172000,2.000000,7.750000,0.100185
1840,AS14.32,2014-04-13,0.114868,-1.800000,0.600000,8.000000,155.240,765.842,327.590,0.000,...,36.799,32.605,0.0,0.000,0.000,6.0,1311.709999,2.000000,8.172222,0.103862
1841,AS14.32,2014-04-14,0.092765,0.500000,0.500000,7.500000,317.907,1107.541,535.738,0.000,...,0.000,32.210,0.0,0.000,0.000,3.0,1886.964999,2.000000,8.088889,0.113691


In [99]:
train_df_cleaned['productivity_to_social_ratio'] = (train_df_cleaned['appCat.finance'] + train_df_cleaned['appCat.office']) +1/ (train_df_cleaned['appCat.entertainment'] + train_df_cleaned['appCat.social']+1)
test_df['productivity_to_social_ratio'] = (test_df['appCat.finance'] + test_df['appCat.office']) +1/ (test_df['appCat.entertainment'] + test_df['appCat.social']+1)
train_df_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['productivity_to_social_ratio'] = (train_df_cleaned['appCat.finance'] + train_df_cleaned['appCat.office']) +1/ (train_df_cleaned['appCat.entertainment'] + train_df_cleaned['appCat.social']+1)


variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms,3_day_avg_mood,3_day_avg_activity,productivity_to_social_ratio
7,AS14.01,2014-02-26,0.081548,-0.250000,0.750000,6.250000,0.000,0.000,0.000,0.000,...,0.000,0.0,0.000,0.000,1.0,0.000000,2.000000,6.250000,0.081548,1.000000
8,AS14.01,2014-02-27,0.081548,0.000000,0.333333,6.333333,0.000,0.000,0.000,0.000,...,0.000,0.0,0.000,0.000,2.5,0.000000,1.666667,6.250000,0.081548,1.000000
26,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000,1090.617,2850.042,100.898,49.544,...,167.373,0.0,229.138,0.000,6.0,2867.963000,1.250000,6.291667,0.081548,55.023310
27,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000,476.737,1946.683,4.010,21.076,...,37.305,0.0,53.188,0.000,3.0,3038.464000,1.000000,6.261111,0.099048,21.078249
28,AS14.01,2014-03-23,0.142741,0.200000,0.800000,6.800000,540.915,2680.535,4.028,43.403,...,0.000,0.0,30.086,30.386,6.5,2780.277001,1.000000,6.311111,0.150826,43.404528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,AS14.32,2014-04-11,0.133644,0.333333,0.333333,7.666667,281.537,1247.846,423.783,0.000,...,0.000,0.0,13.118,0.000,6.0,2786.956000,1.000000,7.794444,0.079512,0.002354
1839,AS14.32,2014-04-12,0.092561,0.200000,0.800000,8.600000,169.231,649.200,463.088,0.000,...,0.000,0.0,0.000,0.000,5.0,1274.172000,2.000000,7.750000,0.100185,0.001797
1840,AS14.32,2014-04-13,0.114868,-1.800000,0.600000,8.000000,155.240,765.842,327.590,0.000,...,32.605,0.0,0.000,0.000,6.0,1311.709999,2.000000,8.172222,0.103862,0.002737
1841,AS14.32,2014-04-14,0.092765,0.500000,0.500000,7.500000,317.907,1107.541,535.738,0.000,...,32.210,0.0,0.000,0.000,3.0,1886.964999,2.000000,8.088889,0.113691,0.001863


In [100]:
# Assuming 'df' is your DataFrame and 'grade' is your continuous variable
train_df_cleaned['mood_quantiles'] = pd.qcut(train_df_cleaned['mood'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
test_df['mood_quantiles'] = pd.qcut(test_df['mood'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
train_df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['mood_quantiles'] = pd.qcut(train_df_cleaned['mood'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])


variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms,3_day_avg_mood,3_day_avg_activity,productivity_to_social_ratio,mood_quantiles
7,AS14.01,2014-02-26,0.081548,-0.250000,0.750000,6.250000,0.000,0.000,0.000,0.000,...,0.0,0.000,0.000,1.0,0.000000,2.000000,6.250000,0.081548,1.000000,Q1
8,AS14.01,2014-02-27,0.081548,0.000000,0.333333,6.333333,0.000,0.000,0.000,0.000,...,0.0,0.000,0.000,2.5,0.000000,1.666667,6.250000,0.081548,1.000000,Q1
26,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000,1090.617,2850.042,100.898,49.544,...,0.0,229.138,0.000,6.0,2867.963000,1.250000,6.291667,0.081548,55.023310,Q1
27,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000,476.737,1946.683,4.010,21.076,...,0.0,53.188,0.000,3.0,3038.464000,1.000000,6.261111,0.099048,21.078249,Q1
28,AS14.01,2014-03-23,0.142741,0.200000,0.800000,6.800000,540.915,2680.535,4.028,43.403,...,0.0,30.086,30.386,6.5,2780.277001,1.000000,6.311111,0.150826,43.404528,Q2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,AS14.32,2014-04-11,0.133644,0.333333,0.333333,7.666667,281.537,1247.846,423.783,0.000,...,0.0,13.118,0.000,6.0,2786.956000,1.000000,7.794444,0.079512,0.002354,Q4
1839,AS14.32,2014-04-12,0.092561,0.200000,0.800000,8.600000,169.231,649.200,463.088,0.000,...,0.0,0.000,0.000,5.0,1274.172000,2.000000,7.750000,0.100185,0.001797,Q4
1840,AS14.32,2014-04-13,0.114868,-1.800000,0.600000,8.000000,155.240,765.842,327.590,0.000,...,0.0,0.000,0.000,6.0,1311.709999,2.000000,8.172222,0.103862,0.002737,Q4
1841,AS14.32,2014-04-14,0.092765,0.500000,0.500000,7.500000,317.907,1107.541,535.738,0.000,...,0.0,0.000,0.000,3.0,1886.964999,2.000000,8.088889,0.113691,0.001863,Q3


In [101]:
train_df_cleaned.to_csv('train.csv') 
test_df.to_csv('test.csv')