In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("dataset_mood_smartphone.csv")
df['time'] = pd.to_datetime(df['time'])
df['date'] = df['time'].dt.date
df.rename(columns={'Unnamed: 0': 'index'}, inplace=True)

df

Unnamed: 0,index,id,time,variable,value,date
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.000,2014-02-26
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.000,2014-02-26
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.000,2014-02-26
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.000,2014-02-26
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.000,2014-02-27
...,...,...,...,...,...,...
376907,2770399,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032,2014-04-11
376908,2772465,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008,2014-04-19
376909,2774026,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026,2014-04-26
376910,2774133,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033,2014-04-27


In [3]:
df_filtered = df[df['variable'].isin(['sms', 'call'])]

# Aggregate counts by individual and event type
counts = df_filtered.groupby(['id', 'variable'])['value'].size().reset_index(name='count')

# Determine the IQR for outlier detection
Q1 = counts['count'].quantile(0.25)
Q3 = counts['count'].quantile(0.75)
IQR = Q3 - Q1
outliers_threshold_low = Q1 - 1.5 * IQR
outliers_threshold_high = Q3 + 1.5 * IQR

# Identify outliers
outliers = counts[(counts['count'] < outliers_threshold_low) | (counts['count'] > outliers_threshold_high)]

In [4]:
# Filter for 'sms' and 'call' events
df_sms_call = df[df['variable'].isin(['sms', 'call'])]

# Group by individual ('id'), variable, and date to count daily events
daily_counts = df_sms_call.groupby(['id', 'variable', df_sms_call['time'].dt.date]).size().reset_index(name='daily_count')


In [5]:
# Calculate IQR for daily counts
Q1 = daily_counts['daily_count'].quantile(0.25)
Q3 = daily_counts['daily_count'].quantile(0.75)
IQR = Q3 - Q1

# Define outliers as those beyond 1.5 times the IQR from the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outlier rows based on daily counts
outliers = daily_counts[(daily_counts['daily_count'] < lower_bound) | (daily_counts['daily_count'] > upper_bound)]


In [6]:
nan_rows_arousal = df[(df['variable'] == 'circumplex.arousal') & (df['value'].isna())].index
nan_rows_valence = df[(df['variable'] == 'circumplex.valence') & (df['value'].isna())].index
nan_rows_activity = df[(df['variable'] == 'activity') & (df['value'].isna())].index


# Combine the indices of rows with NaN values for arousal and valence
nan_rows_combined = nan_rows_arousal.union(nan_rows_valence)

In [7]:
# List of variables/categories to check for negative values, excluding mood, arousal, and valence
variables_to_check = [variable for variable in df['variable'].unique() if variable not in ['mood', 'circumplex.arousal', 'circumplex.valence']]

# Check for negative values in the remaining variables
negative_values_check = {variable: (df[df['variable'] == variable]['value'] < 0).any() for variable in variables_to_check}

negative_values_check

# Identify rows with negative values in appCat.builtin and appCat.entertainment in the original dataset
negative_values_builtin = df[(df['variable'] == 'appCat.builtin') & (df['value'] < 0)].index
negative_values_entertainment = df[(df['variable'] == 'appCat.entertainment') & (df['value'] < 0)].index

#neg combined 
neg = negative_values_builtin.union(negative_values_entertainment)
# Combine the indices of rows with negative values for appCat.builtin and appCat.entertainment
# with previously identified NaN rows for removal
remove_combined = nan_rows_combined.union(negative_values_builtin).union(negative_values_entertainment)

df_negative = df.loc[neg]

In [8]:
combined = df.loc[remove_combined]
combined
df1 = df.drop(combined.index)

In [9]:
# Initialize a DataFrame to store outliers
outliers_df = pd.DataFrame()

variables_for_box_plots_all = [
    'screen', 'appCat.builtin', 'appCat.communication', 'appCat.entertainment',
    'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social',
    'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather'
]
# Iterate over each variable to remove outliers, saving them first
for variable in variables_for_box_plots_all:
    # Isolate the current variable's data
    var_df = df1[df1['variable'] == variable]
    
    # Calculate IQR and determine bounds for outliers
    Q1 = var_df['value'].quantile(0.25)
    Q3 = var_df['value'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Identifying outliers
    outliers = var_df[(var_df['value'] < lower_bound) | (var_df['value'] > upper_bound)]
    
    # Append outliers to the outliers_df DataFrame
    outliers_df = pd.concat([outliers_df, outliers], ignore_index=True)
    
    # Identifying indexes of rows that are not outliers to keep in the original dataframe
    non_outliers_index = var_df[(var_df['value'] >= lower_bound) & (var_df['value'] <= upper_bound)].index
    
    # Update df to only include rows that are not outliers for the current variable
    df1 = df1[(df1.index.isin(non_outliers_index)) | (df1['variable'] != variable)]


In [10]:
score_variables = ["mood", "circumplex.arousal", "circumplex.valence", "activity"]

# Creating a dataset with only the selected variables
df_score = df1[df1['variable'].isin(score_variables)]

# Creating another dataset with the rest of the variables
df_machine = df1[~df1['variable'].isin(score_variables)]

df_score['date'] = df_score['time'].dt.date
df_machine['date'] = df_machine['time'].dt.date


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score['date'] = df_score['time'].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_machine['date'] = df_machine['time'].dt.date


In [11]:
df_scores_daily = df_score.drop('time', axis = 1)
df_machine_daily = df_machine.drop('time', axis = 1)

In [12]:
grouped_scores = df_scores_daily.groupby(['id', 'date', 'variable'])['value'].mean().reset_index()
grouped_scores
grouped_times = df_machine_daily.groupby(['id', 'date', 'variable'])['value'].sum().reset_index()
grouped_times

Unnamed: 0,id,date,variable,value
0,AS14.01,2014-02-17,call,2.000000
1,AS14.01,2014-02-18,call,1.000000
2,AS14.01,2014-02-19,call,7.000000
3,AS14.01,2014-02-19,sms,2.000000
4,AS14.01,2014-02-20,call,2.000000
...,...,...,...,...
10255,AS14.33,2014-05-30,appCat.travel,915.714000
10256,AS14.33,2014-05-30,appCat.unknown,8.072000
10257,AS14.33,2014-05-30,appCat.utilities,155.922000
10258,AS14.33,2014-05-30,call,4.000000


In [13]:
scores_pivot_df = df_scores_daily.pivot_table(index=['id','date'], columns='variable', values='value', aggfunc='mean').reset_index()
scores_pivot_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood
0,AS14.01,2014-02-26,,-0.250000,0.750000,6.250000
1,AS14.01,2014-02-27,,0.000000,0.333333,6.333333
2,AS14.01,2014-03-20,0.081548,,,
3,AS14.01,2014-03-21,0.134050,0.200000,0.200000,6.200000
4,AS14.01,2014-03-22,0.236880,0.600000,0.500000,6.400000
...,...,...,...,...,...,...
1301,AS14.33,2014-05-27,0.012704,-0.600000,0.400000,6.200000
1302,AS14.33,2014-05-28,0.103301,0.000000,1.200000,8.200000
1303,AS14.33,2014-05-29,0.169354,-1.333333,1.000000,7.000000
1304,AS14.33,2014-05-30,0.192901,-0.800000,-0.400000,6.800000


In [14]:
time_pivot_df = df_machine_daily.pivot_table(index=['id','date'], columns='variable', values='value', aggfunc='sum').reset_index()

In [15]:
app_categories_columns = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather"
]

# Ensure the DataFrame has these columns; this prevents KeyError if some columns don't exist
existing_app_columns = [col for col in app_categories_columns if col in time_pivot_df.columns]

# Replace NaN values with 0 for the specified app category columns
time_pivot_df[existing_app_columns] = time_pivot_df[existing_app_columns].fillna(0)
time_pivot_df



variable,id,date,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-17,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,2.0,,
1,AS14.01,2014-02-18,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,1.0,,
2,AS14.01,2014-02-19,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,7.0,,2.0
3,AS14.01,2014-02-20,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,2.0,,3.0
4,AS14.01,2014-02-21,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.0,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910,AS14.33,2014-05-26,571.902,750.571,79.963,0.0,0.0,0.000,140.446,1508.191,0.000,0.000,0.000,0.0,10.0,1118.782999,3.0
1911,AS14.33,2014-05-27,269.550,771.182,32.137,0.0,0.0,0.000,57.535,1252.079,0.000,0.000,56.173,0.0,1.0,1508.516001,2.0
1912,AS14.33,2014-05-28,1093.741,1057.568,153.121,0.0,0.0,38.262,208.962,1817.695,0.000,0.000,30.666,0.0,10.0,3414.742998,1.0
1913,AS14.33,2014-05-29,248.063,198.112,5.018,0.0,0.0,0.000,29.202,377.179,0.939,0.000,3.199,0.0,5.0,395.046000,1.0


In [16]:
merged_df = pd.merge(scores_pivot_df, time_pivot_df, on=['id','date'], how='inner')
merged_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-26,,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.0,,2.0
1,AS14.01,2014-03-20,0.081548,,,,121.466,980.911,42.593,0.000,...,0.000,11.345,621.094,0.000,45.173,21.074,0.000,1.0,128.843000,
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.000,229.138,0.000,6.0,2867.963000,
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.000,53.188,0.000,3.0,3038.464000,1.0
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.000,30.086,30.386,,2780.277001,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,140.446,1508.191,0.000,0.000,0.000,0.000,10.0,1118.782999,3.0
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.000,1.0,1508.516001,2.0
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.000,10.0,3414.742998,1.0
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,29.202,377.179,0.939,0.000,3.199,0.000,5.0,395.046000,1.0


NOW SPLIT INTO TRAIN TEST SETS, THEN ADD ALL INTERPOLATION STUFF TO BOTH

In [17]:
#tscv = TimeSeriesSplit(n_splits=2) #ONLY 1 SPLIT = change to 5 for kfoldcross
train_df = merged_df

In [18]:
columns_to_interpolate = ['call',  'sms']
train_df[columns_to_interpolate] = train_df[columns_to_interpolate].interpolate(method='linear', limit_direction='forward', axis=0)
mean_activity_per_id = train_df.groupby('id')['activity'].mean()
train_df['activity'] = train_df.apply(
    lambda row: mean_activity_per_id[row['id']] if pd.isna(row['activity']) else row['activity'],
    axis=1
)
#test_df[columns_to_interpolate] = test_df[columns_to_interpolate].interpolate(method='linear', limit_direction='forward', axis=0)
train_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-26,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.0,,2.000000
1,AS14.01,2014-03-20,0.081548,,,,121.466,980.911,42.593,0.000,...,0.000,11.345,621.094,0.000,45.173,21.074,0.000,1.0,128.843000,1.666667
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.000,229.138,0.000,6.0,2867.963000,1.333333
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.000,53.188,0.000,3.0,3038.464000,1.000000
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.000,30.086,30.386,6.5,2780.277001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,140.446,1508.191,0.000,0.000,0.000,0.000,10.0,1118.782999,3.000000
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.000,1.0,1508.516001,2.000000
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.000,10.0,3414.742998,1.000000
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,29.202,377.179,0.939,0.000,3.199,0.000,5.0,395.046000,1.000000


In [19]:
columns_to_interpolate = ['call', 'sms', 'activity']
train_df[columns_to_interpolate] = train_df.groupby('id')[columns_to_interpolate].transform(lambda group: group.interpolate(method='linear', limit_direction='forward', axis=0))
#test_df[columns_to_interpolate] = test_df.groupby('id')[columns_to_interpolate].transform(lambda group: group.interpolate(method='linear', limit_direction='forward', axis=0))

train_df

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-26,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.0,,2.000000
1,AS14.01,2014-03-20,0.081548,,,,121.466,980.911,42.593,0.000,...,0.000,11.345,621.094,0.000,45.173,21.074,0.000,1.0,128.843000,1.666667
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.000,229.138,0.000,6.0,2867.963000,1.333333
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.000,53.188,0.000,3.0,3038.464000,1.000000
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.000,30.086,30.386,6.5,2780.277001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,140.446,1508.191,0.000,0.000,0.000,0.000,10.0,1118.782999,3.000000
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.000,1.0,1508.516001,2.000000
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.000,10.0,3414.742998,1.000000
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,29.202,377.179,0.939,0.000,3.199,0.000,5.0,395.046000,1.000000


In [20]:
columns_to_check = ['mood']


train_df_cleaned = train_df.dropna(subset=columns_to_check)
#test_df = test_df.dropna(subset=columns_to_check)
train_df_cleaned

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-26,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.0,,2.000000
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.000,229.138,0.000,6.0,2867.963000,1.333333
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.000,53.188,0.000,3.0,3038.464000,1.000000
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.000,30.086,30.386,6.5,2780.277001,1.000000
5,AS14.01,2014-03-24,0.078961,0.800000,0.00,6.00,731.456,5070.628,70.289,34.106,...,3.010,66.558,1362.785,35.005,0.000,10.064,0.000,10.0,5953.697001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,140.446,1508.191,0.000,0.000,0.000,0.000,10.0,1118.782999,3.000000
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.000,1.0,1508.516001,2.000000
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.000,10.0,3414.742998,1.000000
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,29.202,377.179,0.939,0.000,3.199,0.000,5.0,395.046000,1.000000


In [21]:
not_null_columns = [
    "circumplex.valence"
]

# Ensure the DataFrame has these columns; this prevents KeyError if some columns don't exist
existing_app_columns = [col for col in not_null_columns if col in train_df_cleaned.columns]

# Replace NaN values with 0 for the specified app category columns
train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)
#test_df[existing_app_columns] = test_df[existing_app_columns].fillna(0)
train_df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)


variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-26,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.0,,2.000000
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.000,229.138,0.000,6.0,2867.963000,1.333333
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.000,53.188,0.000,3.0,3038.464000,1.000000
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.000,30.086,30.386,6.5,2780.277001,1.000000
5,AS14.01,2014-03-24,0.078961,0.800000,0.00,6.00,731.456,5070.628,70.289,34.106,...,3.010,66.558,1362.785,35.005,0.000,10.064,0.000,10.0,5953.697001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,140.446,1508.191,0.000,0.000,0.000,0.000,10.0,1118.782999,3.000000
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.000,1.0,1508.516001,2.000000
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.000,10.0,3414.742998,1.000000
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,29.202,377.179,0.939,0.000,3.199,0.000,5.0,395.046000,1.000000


In [22]:
app_categories_columns = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather",
]

# Ensure the DataFrame has these columns; this prevents KeyError if some columns don't exist
existing_app_columns = [col for col in app_categories_columns if col in train_df_cleaned.columns]

# Replace NaN values with 0 for the specified app category columns
train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)
#test_df[existing_app_columns] = test_df[existing_app_columns].fillna(0)
train_df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(0)


variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-26,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.0,,2.000000
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.000,229.138,0.000,6.0,2867.963000,1.333333
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.000,53.188,0.000,3.0,3038.464000,1.000000
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.000,30.086,30.386,6.5,2780.277001,1.000000
5,AS14.01,2014-03-24,0.078961,0.800000,0.00,6.00,731.456,5070.628,70.289,34.106,...,3.010,66.558,1362.785,35.005,0.000,10.064,0.000,10.0,5953.697001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,140.446,1508.191,0.000,0.000,0.000,0.000,10.0,1118.782999,3.000000
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.000,1.0,1508.516001,2.000000
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.000,10.0,3414.742998,1.000000
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,29.202,377.179,0.939,0.000,3.199,0.000,5.0,395.046000,1.000000


In [23]:
columns_to_sum = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather"
]

# Use apply to iterate over rows for rows where 'screen' is NaN
train_df_cleaned.loc[train_df_cleaned['screen'].isna(), 'screen'] = train_df_cleaned[train_df_cleaned['screen'].isna()].apply(
    lambda row: row[columns_to_sum].sum(), axis=1)
#test_df.loc[test_df['screen'].isna(), 'screen'] = test_df[test_df['screen'].isna()].apply(
    #lambda row: row[columns_to_sum].sum(), axis=1)

train_df_cleaned

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
0,AS14.01,2014-02-26,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.0,0.000000,2.000000
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,5.479,50.465,3121.747,167.373,0.000,229.138,0.000,6.0,2867.963000,1.333333
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,60.565,439.632,37.305,0.000,53.188,0.000,3.0,3038.464000,1.000000
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,0.000,40.604,649.414,0.000,0.000,30.086,30.386,6.5,2780.277001,1.000000
5,AS14.01,2014-03-24,0.078961,0.800000,0.00,6.00,731.456,5070.628,70.289,34.106,...,3.010,66.558,1362.785,35.005,0.000,10.064,0.000,10.0,5953.697001,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,140.446,1508.191,0.000,0.000,0.000,0.000,10.0,1118.782999,3.000000
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,57.535,1252.079,0.000,0.000,56.173,0.000,1.0,1508.516001,2.000000
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,38.262,208.962,1817.695,0.000,0.000,30.666,0.000,10.0,3414.742998,1.000000
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,29.202,377.179,0.939,0.000,3.199,0.000,5.0,395.046000,1.000000


# Feature Engineering

In [24]:
app_columns = [col for col in train_df_cleaned.columns if 'appCat' in col] 
train_df_cleaned['total_app_usage'] = train_df_cleaned[app_columns].sum(axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['total_app_usage'] = train_df_cleaned[app_columns].sum(axis=1)


In [25]:
import pandas as pd

# Assuming train_df_cleaned is already loaded and prepared
# Define weights for each category - adjust these based on your specific requirements
weights = {
    'appCat.finance': 0.5,      # Weight for finance apps
    'appCat.office': 0.5,       # Weight for office apps
    'appCat.communication': 0.5,# Weight for communication apps
    'appCat.social': 0.5,       # Weight for social apps
    'appCat.entertainment': 0.5 # Weight for entertainment apps
}

# Calculate weighted sums
train_df_cleaned['productivity_attribute'] = (
    train_df_cleaned['appCat.finance'] * weights['appCat.finance'] +
    train_df_cleaned['appCat.office'] * weights['appCat.office']
)

train_df_cleaned['social_app'] = (
    train_df_cleaned['appCat.communication'] * weights['appCat.communication'] +
    train_df_cleaned['appCat.social'] * weights['appCat.social']
)

# Calculate weighted ratio for productivity to social apps
# Adding 1 to avoid division by zero in case totals are zero
train_df_cleaned['productivity_to_social_app_ratio'] = (
    (train_df_cleaned['appCat.finance'] * weights['appCat.finance'] +
    train_df_cleaned['appCat.office'] * weights['appCat.office'] + 1) /
    (train_df_cleaned['appCat.entertainment'] * weights['appCat.entertainment'] +
    train_df_cleaned['appCat.social'] * weights['appCat.social'] + 1)
)

# Assuming sms and call are equally important for 'social_phone'
train_df_cleaned['social_phone'] = (
    train_df_cleaned['sms'] * 0.5 +  # Assuming equal weight for sms
    train_df_cleaned['call'] * 0.5   # Assuming equal weight for calls
)

# Display the modified DataFrame
train_df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['productivity_attribute'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['social_app'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['productivity_to_social_app_ratio'] = (
A value is trying to be set on a copy of a slice from a DataFrame.

variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.utilities,appCat.weather,call,screen,sms,total_app_usage,productivity_attribute,social_app,productivity_to_social_app_ratio,social_phone
0,AS14.01,2014-02-26,0.090464,-0.25,0.75,6.25,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,1.5
2,AS14.01,2014-03-21,0.13405,0.2,0.2,6.2,1090.617,2850.042,100.898,49.544,...,229.138,0.0,6.0,2867.963,1.333333,7665.303,27.5115,2985.8945,0.017683,3.666667
3,AS14.01,2014-03-22,0.23688,0.6,0.5,6.4,476.737,1946.683,4.01,21.076,...,53.188,0.0,3.0,3038.464,1.0,3039.196,10.538,1193.1575,0.051781,2.0
4,AS14.01,2014-03-23,0.142741,0.2,0.8,6.8,540.915,2680.535,4.028,43.403,...,30.086,30.386,6.5,2780.277001,1.0,4019.371,21.7015,1664.9745,0.069271,3.75
5,AS14.01,2014-03-24,0.078961,0.8,0.0,6.0,731.456,5070.628,70.289,34.106,...,10.064,0.0,10.0,5953.697001,1.0,7383.901,18.558,3216.7065,0.027257,5.5


In [26]:
#stop

In [27]:
# Extract the list of unique IDs
#unique_ids = train_df_cleaned['id'].unique()

# Create a DataFrame for each unique ID and store in a dictionary
#dataframes_by_id = {id_: train_df_cleaned[train_df_cleaned['id'] == id_] for id_ in unique_ids}

#len(unique_ids)

#id1 = dataframes_by_id[unique_ids[0]].sort_values(by='date',ascending=False)
#id1['date'] = pd.to_datetime(id1['date'])

# Sort data by date
#data_sorted = id1.sort_values('date')

# Calculate and shift both the 3-day and 7-day moving averages by one day
#moving_averages_3days_shifted = data_sorted.set_index('date').rolling(window=3).mean().shift(1)
#moving_averages_5days_shifted = data_sorted.set_index('date').rolling(window=5).mean().shift(1)
#moving_averages_7days_shifted = data_sorted.set_index('date').rolling(window=7).mean().shift(1)

# Add suffixes to identify the columns correctly
#moving_averages_3days_shifted.columns = [f"{col}_3day_avg" for col in moving_averages_3days_shifted.columns]
#moving_averages_5days_shifted.columns = [f"{col}_5day_avg" for col in moving_averages_7days_shifted.columns]
#moving_averages_7days_shifted.columns = [f"{col}_7day_avg" for col in moving_averages_7days_shifted.columns]

# Concatenate both shifted averages with the original dataset
#data_with_corrected_averages = pd.concat(
#    [data_sorted.set_index('date'), moving_averages_3days_shifted,moving_averages_5days_shifted, moving_averages_7days_shifted],
 #   axis=1
#).reset_index()
#moving_averages.reset_index()

#v1 = data_with_corrected_averages[['activity', 'activity_3day_avg', 'activity_5day_avg', 'activity_7day_avg']].head(20)
#v2 = corrected_grouped_data[['activity', 'activity_3day_avg', 'activity_5day_avg', 'activity_7day_avg']].head(20)
#print(v1.equals(v2))

## Agg Variables

In [28]:
# Assuming 'df' is your DataFrame and 'grade' is your continuous variable
train_df_cleaned['mood_quantiles'] = pd.qcut(train_df_cleaned['mood'], q=3, labels=['Q1', 'Q2', 'Q3'])
#test_df['mood_quantiles'] = pd.qcut(test_df['mood'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
train_df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['mood_quantiles'] = pd.qcut(train_df_cleaned['mood'], q=3, labels=['Q1', 'Q2', 'Q3'])


variable,id,date,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.weather,call,screen,sms,total_app_usage,productivity_attribute,social_app,productivity_to_social_app_ratio,social_phone,mood_quantiles
0,AS14.01,2014-02-26,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,0.000,1.0,0.000000,2.000000,0.000,0.0000,0.0000,1.000000,1.500000,Q1
2,AS14.01,2014-03-21,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,0.000,6.0,2867.963000,1.333333,7665.303,27.5115,2985.8945,0.017683,3.666667,Q1
3,AS14.01,2014-03-22,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,0.000,3.0,3038.464000,1.000000,3039.196,10.5380,1193.1575,0.051781,2.000000,Q1
4,AS14.01,2014-03-23,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,30.386,6.5,2780.277001,1.000000,4019.371,21.7015,1664.9745,0.069271,3.750000,Q1
5,AS14.01,2014-03-24,0.078961,0.800000,0.00,6.00,731.456,5070.628,70.289,34.106,...,0.000,10.0,5953.697001,1.000000,7383.901,18.5580,3216.7065,0.027257,5.500000,Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,AS14.33,2014-05-26,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000,10.0,1118.782999,3.000000,3051.073,0.0000,1129.3810,0.001258,6.500000,Q1
1244,AS14.33,2014-05-27,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000,1.0,1508.516001,2.000000,2438.656,0.0000,1011.6305,0.001555,1.500000,Q1
1245,AS14.33,2014-05-28,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,0.000,10.0,3414.742998,1.000000,4400.015,19.1310,1437.6315,0.020408,5.500000,Q3
1246,AS14.33,2014-05-29,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000,5.0,395.046000,1.000000,861.712,0.0000,287.6455,0.005206,3.000000,Q2


In [29]:
 # Convert string labels to numeric labels
label_encoder = LabelEncoder()
train_df_cleaned['mood_quantiles'] = label_encoder.fit_transform(train_df_cleaned['mood_quantiles'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['mood_quantiles'] = label_encoder.fit_transform(train_df_cleaned['mood_quantiles'])


In [30]:
import pandas as pd

def calculate_moving_averages(df):
    # Set 'date' as the index for rolling calculations
    df = df.set_index('date')
    
    # Calculate moving averages with shifting to avoid data leakage
    ma_3days = df.rolling(window=3).mean().shift(1)
    #ma_5days = df.rolling(window=5).mean().shift(1)
    ma_7days = df.rolling(window=7).mean().shift(1)
    
    # Add suffixes to identify the columns for each moving average
    ma_3days.columns = [f"{col}_3day_avg" for col in ma_3days.columns]
    #ma_5days.columns = [f"{col}_5day_avg" for col in ma_5days.columns]
    ma_7days.columns = [f"{col}_7day_avg" for col in ma_7days.columns]
    
    # Concatenate the original data with the moving averages
    result = pd.concat([df, ma_3days, ma_7days], axis=1)
    
    # Handle NaN values: forward fill first, then backward fill
    result.ffill(inplace=True)
    result.bfill(inplace=True)
    
    return result.reset_index()  # Reset the index to bring 'date' back to a column

# Assuming 'train_df_cleaned' is your DataFrame and 'id' is the group identifier
corrected_grouped_data = train_df_cleaned.groupby('id').apply(calculate_moving_averages).reset_index(drop=True)


  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)


  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)


  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)
  ma_3days = df.rolling(window=3).mean().shift(1)
  ma_7days = df.rolling(window=7).mean().shift(1)


In [31]:
corrected_grouped_data

Unnamed: 0,date,id,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,...,appCat.weather_7day_avg,call_7day_avg,screen_7day_avg,sms_7day_avg,total_app_usage_7day_avg,productivity_attribute_7day_avg,social_app_7day_avg,productivity_to_social_app_ratio_7day_avg,social_phone_7day_avg,mood_quantiles_7day_avg
0,2014-02-26,AS14.01,0.090464,-0.250000,0.75,6.25,0.000,0.000,0.000,0.000,...,4.340857,5.500000,3298.692857,1.190476,4946.587286,18.000214,2072.657286,0.176490,3.345238,0.000000
1,2014-03-21,AS14.01,0.134050,0.200000,0.20,6.20,1090.617,2850.042,100.898,49.544,...,4.340857,5.500000,3298.692857,1.190476,4946.587286,18.000214,2072.657286,0.176490,3.345238,0.000000
2,2014-03-22,AS14.01,0.236880,0.600000,0.50,6.40,476.737,1946.683,4.010,21.076,...,4.340857,5.500000,3298.692857,1.190476,4946.587286,18.000214,2072.657286,0.176490,3.345238,0.000000
3,2014-03-23,AS14.01,0.142741,0.200000,0.80,6.80,540.915,2680.535,4.028,43.403,...,4.340857,5.500000,3298.692857,1.190476,4946.587286,18.000214,2072.657286,0.176490,3.345238,0.000000
4,2014-03-24,AS14.01,0.078961,0.800000,0.00,6.00,731.456,5070.628,70.289,34.106,...,4.340857,5.500000,3298.692857,1.190476,4946.587286,18.000214,2072.657286,0.176490,3.345238,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1207,2014-05-26,AS14.33,0.115201,-0.400000,0.00,5.40,571.902,750.571,79.963,0.000,...,0.000000,3.385714,1288.826286,1.000000,2567.658857,0.000000,1066.217214,0.001597,2.192857,0.000000
1208,2014-05-27,AS14.33,0.012704,-0.600000,0.40,6.20,269.550,771.182,32.137,0.000,...,0.000000,4.300000,1342.314428,1.285714,2797.069857,0.000000,1144.606214,0.001498,2.792857,0.000000
1209,2014-05-28,AS14.33,0.103301,0.000000,1.20,8.20,1093.741,1057.568,153.121,0.000,...,0.000000,3.814286,1487.639000,1.428571,2943.002857,0.000000,1209.835786,0.001414,2.621429,0.000000
1210,2014-05-29,AS14.33,0.169354,-1.333333,1.00,7.00,248.063,198.112,5.018,0.000,...,0.000000,4.500000,1823.659142,1.428571,3266.978143,2.733000,1291.393500,0.004150,2.964286,0.285714


## Class Bins

In [32]:
corrected_grouped_data.to_csv('t1.csv')

In [33]:
stop

NameError: name 'stop' is not defined

# Scale

# Tried something with the splits with my last two brain cells, but i think it get's too messy and might get more noise. I used normal cv for feature selection and then tscv with the final model, the documentation says the function should insure there is no leakage :/ - Checl th Classification notebook

# TimeSplit && Try interpolation during splitting

In [None]:
 # Convert string labels to numeric labels
label_encoder = LabelEncoder()
corrected_grouped_data['mood_quantiles'] = label_encoder.fit_transform(corrected_grouped_data['mood_quantiles'])

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assume data is loaded into corrected_grouped_data
corrected_grouped_data['date'] = pd.to_datetime(corrected_grouped_data['date'])
corrected_grouped_data_sorted = corrected_grouped_data.sort_values('date')

# Setup TimeSeriesSplit with 5 splits
tscv = TimeSeriesSplit(n_splits=5)
target_variable = 'mood_quantiles'  # Ensure this is just the column name

# Perform processing
for train_index, test_index in tscv.split(corrected_grouped_data_sorted):
    train_data = corrected_grouped_data_sorted.iloc[train_index]
    test_data = corrected_grouped_data_sorted.iloc[test_index]

    # Fill NaNs using ffill and bfill
    train_data_filled = train_data.groupby('id').apply(lambda group: group.ffill().bfill()).reset_index(drop=True)
    test_data_filled = test_data.groupby('id').apply(lambda group: group.ffill().bfill()).reset_index(drop=True)

    # Fill remaining NaNs with median values from train data
    median_values = train_data_filled.mean()
    train_data_filled.fillna(median_values, inplace=True)
    test_data_filled.fillna(median_values, inplace=True)

    # Replace inf values with NaN, then fill them
    train_data_filled.replace([np.inf, -np.inf], np.nan, inplace=True)
    test_data_filled.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Final NaN fill if necessary
    train_data_filled.fillna(median_values, inplace=True)
    test_data_filled.fillna(median_values, inplace=True)

    # Ensure no NaNs or infs before moving to VIF calculation
    assert not train_data_filled.isnull().values.any()
    assert not test_data_filled.isnull().values.any()
    assert np.isfinite(train_data_filled.select_dtypes(include=[np.number])).all().all()
    assert np.isfinite(test_data_filled.select_dtypes(include=[np.number])).all().all()

    # Proceed to split into X and y, and further processing


In [None]:
#for i in range(len(unique_ids)):
    #print("\nExample DataFrame for ID",dataframes_by_id[unique_ids[i]].shape)

In [None]:
X_train.isnull().sum().sum(), X_test.isnull().sum().sum(), y_train.isnull().sum().sum(), y_test.isnull().sum().sum()

In [None]:
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

# Feature Selection

## RF feature importance



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Initializing and fitting a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Extracting feature importance
feature_importances = rf.feature_importances_

# Combining feature names and their importances into a DataFrame
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the top 10 most important features
importance_df.head(20)

## Boruta - tree based FS

In [None]:
from boruta import BorutaPy
rf =  RandomForestClassifier(n_estimators=100, max_depth=5)

feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

feat_selector.fit(X_train.values, y_train)

print("\n------Support and Ranking for each feature------")
for i in range(len(feat_selector.support_)):
    if feat_selector.support_[i]:
        print("Passes the test: ", X_train.columns[i],
              " - Ranking: ", feat_selector.ranking_[i])
    else:
        print("Doesn't pass the test: ",
              X_train.columns[i], " - Ranking: ", feat_selector.ranking_[i])

In [None]:
selected_rfe_features = pd.DataFrame({'Feature':list(X_train.columns),
'Ranking':feat_selector.ranking_})
selected_rfe_features.sort_values(by='Ranking')

In [None]:
import pandas as pd

# Assuming 'X' is your DataFrame of features and 'feat_selector' is your fitted BorutaPy object
confirmed_features = X.columns[feat_selector.support_].tolist()
tentative_features = X.columns[feat_selector.support_weak_].tolist()

# Combine confirmed and tentative features
boruta_features = confirmed_features 
#+ tentative_features

print("Confirmed Features:", confirmed_features)
print("Tentative Features:", tentative_features)
print("All Relevant Features (Confirmed + Tentative):", boruta_features)


In [None]:
X_train_b = X_train[boruta_features]
X_test_b = X_test[boruta_features]

In [None]:
X_l = X[boruta_features]

## BS

In [None]:
import statsmodels.api as sm
def backward_elimination(data, target, significance_level=0.05):
    features = data.columns.tolist()
    while len(features) > 0:
        features_with_constant = sm.add_constant(data[features])
        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]  # Exclude intercept
        max_p_value = p_values.max()
        if max_p_value >= significance_level:
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
            #print("We remove "+ excluded_feature)
        else:
            break
    return features

selected_features = backward_elimination(X, y)
print(selected_features)

In [None]:
X_bs = X[['activity', 'circumplex.valence', 'appCat.social_3day_avg', 'sms_3day_avg', 'social_app_5day_avg', 'social_phone_5day_avg', 'activity_7day_avg', 'circumplex.arousal_7day_avg', 'circumplex.valence_7day_avg', 'mood_7day_avg', 'appCat.office_7day_avg', 'call_7day_avg', 'screen_7day_avg']]

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, precision_score, recall_score, auc, roc_curve, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from statsmodels.stats.outliers_influence import variance_inflation_factor

best_score = float('-inf')
best_model = None

tscv = TimeSeriesSplit(n_splits=5)

grid_models = [
    (DecisionTreeClassifier(), [{'criterion': ['gini', 'entropy'], 'random_state': [0]}]),
    (RandomForestClassifier(), [{'n_estimators': [100, 150, 200], 'criterion': ['gini', 'entropy'], 'random_state': [0]}]),
    (AdaBoostClassifier(), [{'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.5, 0.8, 1], 'random_state': [0]}]),
    (GradientBoostingClassifier(), [{'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.5, 0.8, 1], 'random_state': [0]}]),
    (XGBClassifier(), [{'learning_rate': [0.01, 0.05, 0.1], 'n_estimators': [100, 150, 200], 'random_state': [0], 'eval_metric': ['logloss', 'error']}])
]


best_score = float('-inf')
best_model = None

for model, params in grid_models:
    grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=tscv, n_jobs=-1, verbose=1)
    grid.fit(X_bs, y)  # Make sure to use the full dataset X and y_encoded
    current_score = grid.best_score_
    print('{}:\nBest Score : {:.2f}%'.format(model.__class__.__name__, current_score * 100))
    print('Best Parameters : ', grid.best_params_)
    print('----------------\n')
    
    if current_score > best_score:
        best_score = current_score
        best_model = grid.best_estimator_

print('Best model overall:', best_model)
