In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw log, give the unnamed leading column a proper name, parse the
# timestamps and derive a calendar-date column from them.
df = (
    pd.read_csv("dataset_mood_smartphone.csv")
    .rename(columns={'Unnamed: 0': 'index'})
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .assign(date=lambda frame: frame['time'].dt.date)
)

df

In [3]:
# Keep only the 'sms' and 'call' rows.
df_filtered = df.loc[df['variable'].isin(['sms', 'call'])]

# Number of rows per individual and event type.
counts = (
    df_filtered
    .groupby(['id', 'variable'])['value']
    .size()
    .reset_index(name='count')
)

# Tukey fences (1.5 * IQR beyond the quartiles) on the per-individual totals.
Q1 = counts['count'].quantile(0.25)
Q3 = counts['count'].quantile(0.75)
IQR = Q3 - Q1
outliers_threshold_low = Q1 - 1.5 * IQR
outliers_threshold_high = Q3 + 1.5 * IQR

# Rows whose total falls outside the fences.
count_too_low = counts['count'] < outliers_threshold_low
count_too_high = counts['count'] > outliers_threshold_high
outliers = counts[count_too_low | count_too_high]

In [4]:
# Restrict to 'sms' and 'call' events.
df_sms_call = df.loc[df['variable'].isin(['sms', 'call'])]

# Count events per individual, event type and calendar day. The day key is the
# date part of 'time', so the resulting key column keeps the name 'time'.
event_day = df_sms_call['time'].dt.date
daily_counts = (
    df_sms_call
    .groupby(['id', 'variable', event_day])
    .size()
    .reset_index(name='daily_count')
)


In [5]:
# Quartiles and interquartile range of the daily sms/call counts.
per_day = daily_counts['daily_count']
Q1 = per_day.quantile(0.25)
Q3 = per_day.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: anything further than 1.5 * IQR from the quartiles is an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Daily-count rows that fall outside the fences.
below_fence = per_day < lower_bound
above_fence = per_day > upper_bound
outliers = daily_counts[below_fence | above_fence]


In [6]:
# Index labels of rows whose value is missing, per variable of interest.
value_is_missing = df['value'].isna()
nan_rows_arousal = df.index[(df['variable'] == 'circumplex.arousal') & value_is_missing]
nan_rows_valence = df.index[(df['variable'] == 'circumplex.valence') & value_is_missing]
nan_rows_activity = df.index[(df['variable'] == 'activity') & value_is_missing]

# Only the arousal and valence indices are combined here;
# nan_rows_activity is computed above but is not part of this union.
nan_rows_combined = nan_rows_arousal.union(nan_rows_valence)

In [7]:
# Every variable except mood, arousal and valence is checked for negative values.
excluded_from_check = ('mood', 'circumplex.arousal', 'circumplex.valence')
variables_to_check = [name for name in df['variable'].unique() if name not in excluded_from_check]

# Map each checked variable to whether any of its values is below zero.
value_is_negative = df['value'] < 0
negative_values_check = {
    name: value_is_negative[df['variable'] == name].any()
    for name in variables_to_check
}

# Bare expression kept from the original; it is not the cell's last statement.
negative_values_check

# Index labels of negative values in the two app categories handled here.
negative_values_builtin = df.index[(df['variable'] == 'appCat.builtin') & value_is_negative]
negative_values_entertainment = df.index[(df['variable'] == 'appCat.entertainment') & value_is_negative]

# All negative-value rows of those two categories.
neg = negative_values_builtin.union(negative_values_entertainment)

# Rows scheduled for removal: the previously collected NaN rows plus the negative rows.
remove_combined = (
    nan_rows_combined
    .union(negative_values_builtin)
    .union(negative_values_entertainment)
)

# The negative rows themselves, kept for inspection.
df_negative = df.loc[neg]

In [8]:
# Rows flagged for removal (NaN scores and negative app durations) ...
combined = df.loc[remove_combined]
# ... and the dataset without them.
df1 = df.drop(index=combined.index)

In [9]:
# Variables whose values are screened for IQR outliers.
variables_for_box_plots_all = [
    'screen', 'appCat.builtin', 'appCat.communication', 'appCat.entertainment',
    'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social',
    'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather'
]

# Outlier rows are collected per variable and concatenated once after the loop.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration and starts from an empty frame.
outlier_frames = []

# Remove each variable's outliers from df1, saving them first.
for variable in variables_for_box_plots_all:
    # Isolate the current variable's data
    var_df = df1[df1['variable'] == variable]

    # Tukey fences: 1.5 * IQR beyond the first and third quartiles
    Q1 = var_df['value'].quantile(0.25)
    Q3 = var_df['value'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows of this variable outside the fences
    outliers = var_df[(var_df['value'] < lower_bound) | (var_df['value'] > upper_bound)]
    outlier_frames.append(outliers)

    # Index labels of this variable's rows that lie within the fences.
    # Rows with a NaN value satisfy neither comparison, so they are dropped
    # from df1 below without being recorded as outliers.
    non_outliers_index = var_df[(var_df['value'] >= lower_bound) & (var_df['value'] <= upper_bound)].index

    # Keep all rows of other variables plus the in-fence rows of this one
    df1 = df1[(df1.index.isin(non_outliers_index)) | (df1['variable'] != variable)]

# All removed outliers in one frame with a fresh 0..n-1 index
outliers_df = pd.concat(outlier_frames, ignore_index=True)


In [10]:
# Variables that are aggregated as daily means (scores) rather than daily sums.
score_variables = ["mood", "circumplex.arousal", "circumplex.valence", "activity"]

is_score_row = df1['variable'].isin(score_variables)

# Explicit copies: a column is assigned on each subset below, and assigning on a
# boolean-filtered slice of df1 triggers pandas' SettingWithCopyWarning.
df_score = df1[is_score_row].copy()      # only the score variables
df_machine = df1[~is_score_row].copy()   # all remaining variables

# Calendar date of every observation, used as the daily grouping key.
df_score['date'] = df_score['time'].dt.date
df_machine['date'] = df_machine['time'].dt.date


In [11]:
# The daily tables are keyed by 'date', so the timestamp column is dropped.
df_scores_daily = df_score.drop(columns='time')
df_machine_daily = df_machine.drop(columns='time')

In [12]:
# Long-format daily aggregates: scores are averaged per day, the rest are summed.
grouped_scores = (
    df_scores_daily
    .groupby(['id', 'date', 'variable'])['value']
    .mean()
    .reset_index()
)
grouped_times = (
    df_machine_daily
    .groupby(['id', 'date', 'variable'])['value']
    .sum()
    .reset_index()
)
grouped_times

In [13]:
# Wide table of daily mean scores: one row per (id, date), one column per score variable.
scores_pivot_df = df_scores_daily.pivot_table(
    values='value',
    index=['id', 'date'],
    columns='variable',
    aggfunc='mean',
).reset_index()
scores_pivot_df

In [14]:
# Wide table of daily totals: one row per (id, date), one column per non-score variable.
time_pivot_df = df_machine_daily.pivot_table(
    values='value',
    index=['id', 'date'],
    columns='variable',
    aggfunc='sum',
).reset_index()

In [15]:
# App-category duration columns.
app_categories_columns = [
    "appCat.builtin",
    "appCat.communication",
    "appCat.entertainment",
    "appCat.finance",
    "appCat.game",
    "appCat.office",
    "appCat.other",
    "appCat.social",
    "appCat.travel",
    "appCat.unknown",
    "appCat.utilities",
    "appCat.weather",
]

# Only touch categories that actually appear in the pivot (avoids a KeyError).
existing_app_columns = [name for name in app_categories_columns if name in time_pivot_df.columns]

# A missing app-category value on a day is replaced with 0.
time_pivot_df[existing_app_columns] = time_pivot_df[existing_app_columns].fillna(0)
time_pivot_df



In [16]:
# Keep only the (id, date) pairs present in both the score table and the usage table.
merged_df = scores_pivot_df.merge(time_pivot_df, how='inner', on=['id', 'date'])
merged_df

Next: split into train and test sets, then apply all of the interpolation steps to both.

In [17]:
# TODO: no train/test split is performed yet (a TimeSeriesSplit with n_splits=2,
# or 5 for k-fold style validation, was sketched here); for now the full merged
# frame is used as the training set.
# Work on a copy: the following cells assign columns on train_df, and a plain
# alias would silently modify merged_df as well.
train_df = merged_df.copy()

In [18]:
columns_to_interpolate = ['call', 'sms']

# Interpolate within each individual. Interpolating over the whole frame would
# fill gaps at the boundary between two ids from values belonging to a
# different person.
train_df[columns_to_interpolate] = train_df.groupby('id')[columns_to_interpolate].transform(
    lambda group: group.interpolate(method='linear', limit_direction='forward', axis=0)
)

# Fill missing activity with that individual's mean activity (vectorized
# equivalent of a row-by-row lookup; ids whose activity is entirely missing
# stay NaN).
mean_activity_per_id = train_df.groupby('id')['activity'].mean()
train_df['activity'] = train_df['activity'].fillna(train_df['id'].map(mean_activity_per_id))

# TODO: apply the same imputation to test_df once a train/test split exists.
train_df

In [19]:
# Per-individual forward linear interpolation of the remaining gaps.
columns_to_interpolate = ['call', 'sms', 'activity']
per_individual = train_df.groupby('id')[columns_to_interpolate]
train_df[columns_to_interpolate] = per_individual.transform(
    lambda block: block.interpolate(method='linear', limit_direction='forward', axis=0)
)
# TODO: repeat for test_df once a train/test split exists.

train_df

In [20]:
# Rows without a mood value are discarded.
columns_to_check = ['mood']

# Explicit copy: later cells assign columns on train_df_cleaned, and assigning on
# the slice returned by dropna triggers pandas' SettingWithCopyWarning.
train_df_cleaned = train_df.dropna(subset=columns_to_check).copy()
# TODO: apply the same filter to test_df once a train/test split exists.
train_df_cleaned

In [21]:
# Columns whose remaining missing values are replaced with 0.
not_null_columns = ["circumplex.valence"]

# Restrict to columns that exist in the frame (avoids a KeyError).
existing_app_columns = [name for name in not_null_columns if name in train_df_cleaned.columns]

# Missing values in those columns become 0.
# TODO: repeat for test_df once a train/test split exists.
train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(value=0)
train_df_cleaned


In [22]:
# App-category duration columns.
app_categories_columns = [
    "appCat.builtin",
    "appCat.communication",
    "appCat.entertainment",
    "appCat.finance",
    "appCat.game",
    "appCat.office",
    "appCat.other",
    "appCat.social",
    "appCat.travel",
    "appCat.unknown",
    "appCat.utilities",
    "appCat.weather",
]

# Restrict to categories that exist in the frame (avoids a KeyError).
existing_app_columns = [name for name in app_categories_columns if name in train_df_cleaned.columns]

# Missing app-category values become 0.
# TODO: repeat for test_df once a train/test split exists.
train_df_cleaned[existing_app_columns] = train_df_cleaned[existing_app_columns].fillna(value=0)
train_df_cleaned


In [23]:
# App-category columns whose sum stands in for a missing 'screen' value.
columns_to_sum = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather"
]

# Where 'screen' is missing, use the row's total over the app-category columns.
# The row-wise sum is vectorized (no Python-level apply per row) and is a plain
# no-op when no 'screen' value is missing.
screen_missing = train_df_cleaned['screen'].isna()
train_df_cleaned.loc[screen_missing, 'screen'] = (
    train_df_cleaned.loc[screen_missing, columns_to_sum].sum(axis=1)
)
# TODO: repeat for test_df once a train/test split exists.

train_df_cleaned

# Feature Engineering

In [24]:
# Total usage per row: the sum over every column whose name contains 'appCat'.
app_columns = [name for name in train_df_cleaned.columns if 'appCat' in name]
train_df_cleaned['total_app_usage'] = train_df_cleaned.loc[:, app_columns].sum(axis=1)


In [25]:
# Per-category weights used in the composite features below (all equal for now;
# adjust as required).
weights = {
    'appCat.finance': 0.5,
    'appCat.office': 0.5,
    'appCat.communication': 0.5,
    'appCat.social': 0.5,
    'appCat.entertainment': 0.5,
}

# Total time across every 'appCat.*' column.
app_columns = [name for name in train_df_cleaned.columns if 'appCat.' in name]
train_df_cleaned['total_app_time'] = train_df_cleaned[app_columns].sum(axis=1)

# Weighted sum of finance and office usage.
train_df_cleaned['productivity_attribute'] = (
    weights['appCat.finance'] * train_df_cleaned['appCat.finance']
    + weights['appCat.office'] * train_df_cleaned['appCat.office']
)

# Weighted sum of communication and social usage.
train_df_cleaned['social_app'] = (
    weights['appCat.communication'] * train_df_cleaned['appCat.communication']
    + weights['appCat.social'] * train_df_cleaned['appCat.social']
)

# Ratio of weighted productivity usage to weighted entertainment + social usage.
# 1 is added to numerator and denominator so the division never hits zero.
# NOTE(review): the denominator uses entertainment + social, whereas 'social_app'
# above uses communication + social -- confirm this difference is intended.
weighted_leisure = (
    weights['appCat.entertainment'] * train_df_cleaned['appCat.entertainment']
    + weights['appCat.social'] * train_df_cleaned['appCat.social']
)
train_df_cleaned['productivity_to_social_app_ratio'] = (
    (train_df_cleaned['productivity_attribute'] + 1) / (weighted_leisure + 1)
)

# sms and call contribute equally to 'social_phone'.
train_df_cleaned['social_phone'] = (
    0.5 * train_df_cleaned['sms'] + 0.5 * train_df_cleaned['call']
)

train_df_cleaned.head()


## Agg Variables

In [28]:
# Bin mood into three equal-frequency classes labelled 'Q1', 'Q2', 'Q3'.
mood_bin_labels = ['Q1', 'Q2', 'Q3']
train_df_cleaned['mood_quantiles'] = pd.qcut(
    train_df_cleaned['mood'],
    q=len(mood_bin_labels),
    labels=mood_bin_labels,
)
# TODO: bin test_df as well once a train/test split exists (the earlier sketch
# used four bins there).
train_df_cleaned


In [29]:
 # Convert string labels to numeric labels
# Fit the encoder on the bin labels, then replace the labels with its integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(train_df_cleaned['mood_quantiles'])
train_df_cleaned['mood_quantiles'] = label_encoder.transform(train_df_cleaned['mood_quantiles'])

In [30]:
import pandas as pd

def calculate_moving_averages(df, windows=(3, 7)):
    """Append lagged moving averages of the numeric columns to one group's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group, with a 'date' column. Rows are used in the
        order given; windows count rows, not calendar days.
    windows : iterable of int, default (3, 7)
        Window lengths. Each window ``w`` adds one ``<column>_<w>day_avg``
        column per numeric column.

    Returns
    -------
    pandas.DataFrame
        The input columns ('date' first) followed by the moving-average
        columns, with remaining gaps forward- then backward-filled.
    """
    # Set 'date' as the index for rolling calculations
    df = df.set_index('date')

    # Average only numeric/bool columns: rolling().mean() on a frame that still
    # holds string columns (e.g. the group id) raises on pandas >= 2.0.
    numeric = df.select_dtypes(include=['number', 'bool'])

    averaged = []
    for window in windows:
        # shift(1) so that each row only sees the rows before it
        lagged = numeric.rolling(window=window).mean().shift(1)
        lagged.columns = [f"{col}_{window}day_avg" for col in lagged.columns]
        averaged.append(lagged)

    # Original data followed by the moving-average columns
    result = pd.concat([df, *averaged], axis=1)

    # Handle NaN values: forward fill first, then backward fill.
    # NOTE(review): the backward fill copies later averages into the first
    # rows of the group, which works against the shift above -- confirm this
    # is acceptable for the leading rows.
    result = result.ffill().bfill()

    return result.reset_index()  # bring 'date' back as a column


# Drop the raw mood score before building the moving averages.
# errors='ignore' keeps this cell re-runnable: on a second run 'mood' is already
# gone and a plain drop would raise a KeyError.
train_df_cleaned = train_df_cleaned.drop(columns='mood', errors='ignore')

# Compute the moving averages separately for every individual ('id') and
# discard the group index added by apply.
corrected_grouped_data = (
    train_df_cleaned
    .groupby('id')
    .apply(calculate_moving_averages)
    .reset_index(drop=True)
)


In [31]:
# Display the per-id frame returned by the moving-average step.
corrected_grouped_data

In [32]:
# Parse 'date' into datetimes; values that cannot be parsed become NaT.
corrected_grouped_data['date'] = pd.to_datetime(corrected_grouped_data['date'], errors='coerce')

# Day of the week (0=Monday, 6=Sunday)
day_index = corrected_grouped_data['date'].dt.dayofweek

# Weekend flag as an integer (0 for weekdays, 1 for Saturday/Sunday)
corrected_grouped_data['is_weekend'] = (day_index >= 5).astype(int)
corrected_grouped_data['day_of_week'] = day_index

# Month of the date (1=January, 12=December)
corrected_grouped_data['month'] = corrected_grouped_data['date'].dt.month

# One-hot encode 'day_of_week' and 'month'. dtype=int pins the indicator columns
# to 0/1 like 'is_weekend'; the default dtype depends on the pandas version
# (bool on pandas >= 2.0, which would be written to CSV as True/False).
day_of_week_dummies = pd.get_dummies(corrected_grouped_data['day_of_week'], prefix='day', dtype=int)
month_dummies = pd.get_dummies(corrected_grouped_data['month'], prefix='month', dtype=int)

# Original columns followed by the one-hot encoded day and month columns
final_data = pd.concat([corrected_grouped_data, day_of_week_dummies, month_dummies], axis=1)


In [33]:
# Display the final feature table, including the calendar features.
final_data

## Class Bins

In [34]:
# Persist the final feature table; the row index is written as the first,
# unnamed column (pandas' default, made explicit here).
final_data.to_csv('data_class.csv', index=True)