In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw export; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer="dataset_mood_smartphone.csv")

In [None]:
# Show the frame exactly as loaded (last expression of the cell is displayed).
df

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Parse the timestamps, derive a calendar-date column from them, and give the
# 'Unnamed: 0' column an explicit name.
df = (
    df
    .assign(time=pd.to_datetime(df['time']))
    .assign(date=lambda frame: frame['time'].dt.date)
    .rename(columns={'Unnamed: 0': 'index'})
)

df

In [None]:
# Distinct entries of the 'variable' column, in order of first appearance.
variable_values_list = df['variable'].drop_duplicates().tolist()

print(variable_values_list)


# Scores

In [None]:
# Plot one histogram of the recorded values for every variable.
variables = [
    'mood', 'circumplex.arousal', 'circumplex.valence', 'activity', 'screen', 'call', 'sms',
    'appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game',
    'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown',
    'appCat.utilities', 'appCat.weather'
]

# A 7 x 3 grid provides 21 axes; flatten it so it can be zipped with the variables.
fig, axs = plt.subplots(7, 3, figsize=(20, 40))
axs = axs.flatten()

for ax, variable in zip(axs, variables):
    # Drop missing values explicitly so the binning never sees NaN.
    values = df.loc[df['variable'] == variable, 'value'].dropna()
    ax.hist(values, color='skyblue', edgecolor='black')
    ax.set_title(variable)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# The grid has more cells than there are variables; hide the leftover axes so
# they do not render as empty frames.
for unused_ax in axs[len(variables):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
def _value_range(frame):
    """Return (min, max, count of missing entries) for a frame's 'value' column."""
    values = frame['value']
    return (values.min(), values.max(), values.isnull().sum(axis=0))


# One slice of the long-format frame per variable of interest.
mood_data = df[df['variable'] == 'mood']
arousal_data = df[df['variable'] == 'circumplex.arousal']
valence_data = df[df['variable'] == 'circumplex.valence']
activity_data = df[df['variable'] == 'activity']

# Value range and missing-value count for each slice.
mood_range = _value_range(mood_data)
arousal_range = _value_range(arousal_data)
valence_range = _value_range(valence_data)
activity_range = _value_range(activity_data)

mood_range, arousal_range, valence_range, activity_range


In [None]:
# Distinct timestamps recorded for mood, arousal, valence and activity (in that order).
tuple(
    frame['time'].nunique()
    for frame in (mood_data, arousal_data, valence_data, activity_data)
)

In [None]:
# Distinct calendar dates recorded for mood, arousal, valence and activity (in that order).
tuple(
    frame['date'].nunique()
    for frame in (mood_data, arousal_data, valence_data, activity_data)
)

In [None]:
# Coverage statistics per variable, computed on the full frame 'df'.
variables = [
    'mood', 'circumplex.arousal', 'circumplex.valence', 'activity', 'screen', 'call', 'sms',
    'appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game',
    'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown',
    'appCat.utilities', 'appCat.weather'
]

# Slice the frame once per variable and reuse the slices for every statistic.
rows_by_variable = {name: df[df['variable'] == name] for name in variables}

# Number of distinct dates / timestamps per variable.
unique_dates = {name: rows['date'].nunique() for name, rows in rows_by_variable.items()}
unique_times = {name: rows['time'].nunique() for name, rows in rows_by_variable.items()}

# First and last timestamp per variable.
date_ranges = {
    name: (rows['time'].min(), rows['time'].max())
    for name, rows in rows_by_variable.items()
}

print("Unique Times:", unique_times)
print("Unique Dates:", unique_dates)
print("Date Ranges:", date_ranges)

In [None]:
# Tabulate, per variable, the number of distinct timestamps and dates and the
# first/last timestamp observed in 'df'.
variable_rows = [df[df['variable'] == var] for var in variables]

# Column-oriented data: every list has one entry per variable, in 'variables' order.
data_for_df = {
    'Variable': list(variables),
    'Unique Times': [rows['time'].nunique() for rows in variable_rows],
    'Unique Dates': [rows['date'].nunique() for rows in variable_rows],
    'Date Range Start': [rows['time'].min() for rows in variable_rows],
    'Date Range End': [rows['time'].max() for rows in variable_rows],
}

variables_df = pd.DataFrame(data_for_df)

variables_df

## NaN values in the score variables

In [None]:
# Row labels whose 'value' is missing, collected separately per variable.
value_is_missing = df['value'].isna()

nan_rows_arousal = df.loc[(df['variable'] == 'circumplex.arousal') & value_is_missing].index
nan_rows_valence = df.loc[(df['variable'] == 'circumplex.valence') & value_is_missing].index
nan_rows_activity = df.loc[(df['variable'] == 'activity') & value_is_missing].index

# Only the arousal and valence labels are merged; the activity labels stay separate.
nan_rows_combined = nan_rows_arousal.union(nan_rows_valence)

In [None]:
# Show the merged index of rows with a missing arousal or valence value.
nan_rows_combined

In [None]:
# Select the rows with a missing arousal/valence value and report (rows, columns).
df_nan = df.loc[nan_rows_combined]
df_nan.shape

## Negative values in the time variables

In [None]:
# Every variable except mood, arousal and valence is checked for negative values.
variables_to_check = [
    variable
    for variable in df['variable'].unique()
    if variable not in ['mood', 'circumplex.arousal', 'circumplex.valence']
]

# For each remaining variable: does at least one negative value occur?
negative_values_check = {
    variable: (df[df['variable'] == variable]['value'] < 0).any()
    for variable in variables_to_check
}

# A bare expression in the middle of a cell is never displayed, so print it explicitly.
print(negative_values_check)

# Row labels of the negative values in appCat.builtin and appCat.entertainment.
negative_values_builtin = df[(df['variable'] == 'appCat.builtin') & (df['value'] < 0)].index
negative_values_entertainment = df[(df['variable'] == 'appCat.entertainment') & (df['value'] < 0)].index

# All negative-value rows of those two variables.
neg = negative_values_builtin.union(negative_values_entertainment)

# Rows to remove: the previously identified NaN rows plus the negative-value rows.
remove_combined = nan_rows_combined.union(neg)

df_negative = df.loc[neg]

In [None]:
# Show the rows holding a negative value for appCat.builtin or appCat.entertainment.
df_negative

## NaN scores plus negative times

In [None]:
# All rows flagged for removal (labels collected in 'remove_combined').
combined = df.loc[remove_combined]

combined

# Clean datasets

In [None]:
# Remove every flagged row; 'df' itself is left untouched.
df1 = df.drop(index=combined.index)

In [None]:
# Coverage statistics per variable, recomputed on the cleaned frame 'df1'.
variables = [
    'mood', 'circumplex.arousal', 'circumplex.valence', 'activity', 'screen', 'call', 'sms',
    'appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game',
    'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown',
    'appCat.utilities', 'appCat.weather'
]

# Slice the cleaned frame once per variable so every statistic reads the same rows.
cleaned_rows = {var: df1[df1['variable'] == var] for var in variables}

# Number of distinct dates / timestamps per variable.
unique_dates = {var: rows['date'].nunique() for var, rows in cleaned_rows.items()}
unique_times = {var: rows['time'].nunique() for var, rows in cleaned_rows.items()}

# First and last timestamp per variable. Both ends come from 'df1'; the end of
# the range used to be read from the uncleaned 'df'.
date_ranges = {var: (rows['time'].min(), rows['time'].max()) for var, rows in cleaned_rows.items()}

print("Unique Times:", unique_times)
print("Unique Dates:", unique_dates)
print("Date Ranges:", date_ranges)

In [None]:
# Same per-variable summary table as 'variables_df', but for the cleaned frame 'df1'.
data_for_df1 = {
    'Variable': [],
    'Unique Times': [],
    'Unique Dates': [],
    'Date Range Start': [],
    'Date Range End': []
}

# One entry per variable; every statistic is read from 'df1' (the loop used to
# read the uncleaned 'df', which made this table a copy of the raw summary).
for var in variables:
    var_rows = df1[df1['variable'] == var]
    data_for_df1['Variable'].append(var)
    data_for_df1['Unique Times'].append(var_rows['time'].nunique())
    data_for_df1['Unique Dates'].append(var_rows['date'].nunique())
    data_for_df1['Date Range Start'].append(var_rows['time'].min())
    data_for_df1['Date Range End'].append(var_rows['time'].max())

# Build the frame from the cleaned-data dictionary, not from 'data_for_df'.
variables_df1 = pd.DataFrame(data_for_df1)

variables_df1

### scores and time

In [None]:
score_variables = ["mood", "circumplex.arousal", "circumplex.valence", "activity"]

# Split the cleaned frame into the score variables and everything else.
# .copy() makes each part an independent frame, so the column assignments below
# do not write into a slice of 'df1' (which raises SettingWithCopyWarning).
is_score = df1['variable'].isin(score_variables)
df_score = df1[is_score].copy()
df_machine = df1[~is_score].copy()

# Calendar date of each timestamp.
df_score['date'] = df_score['time'].dt.date
df_machine['date'] = df_machine['time'].dt.date


In [None]:
# Show the rows belonging to the score variables.
df_score

In [None]:
# The daily tables no longer need the timestamp; 'date' carries the day.
df_scores_daily = df_score.drop(columns='time')
df_machine_daily = df_machine.drop(columns='time')

In [None]:
# Show the score rows after the 'time' column has been dropped.
df_scores_daily
#df_scores_daily.to_csv('scores_daily.csv')

In [None]:
# Mean value per (id, date, variable), returned as a flat table.
grouped_scores = (
    df_scores_daily
    .groupby(['id', 'date', 'variable'])['value']
    .mean()
    .reset_index()
)
grouped_scores

In [None]:
# Mean value per (id, date), spread into one column per variable.
piv_scores = (
    df_scores_daily
    .groupby(['id', 'date', 'variable'])['value']
    .mean()
    .unstack(level='variable')
)

In [None]:
# Summed value per (id, date, variable), returned as a flat table.
grouped_times = (
    df_machine_daily
    .groupby(['id', 'date', 'variable'])['value']
    .sum()
    .reset_index()
)

In [None]:
# Summed value per (id, date), spread into one column per variable.
piv_times = (
    df_machine_daily
    .groupby(['id', 'date', 'variable'])['value']
    .sum()
    .unstack(level='variable')
)

In [None]:
# Missing (id, date) cells per variable column after unstacking.
piv_times.isna().sum()

In [None]:
#df_machine_daily.to_csv('time_daily.csv')

In [None]:
# Distinct ids and distinct dates among the score rows.
tuple(df_scores_daily[column].nunique() for column in ('id', 'date'))

In [None]:
# Distinct ids and distinct dates among the non-score rows.
tuple(df_machine_daily[column].nunique() for column in ('id', 'date'))

### pivot scores and time

In [None]:
# Wide table of scores: one row per (id, date), one column per variable, values averaged.
scores_pivot_df = pd.pivot_table(
    df_scores_daily,
    values='value',
    index=['id', 'date'],
    columns='variable',
    aggfunc='mean',
).reset_index()

### time

In [None]:
# Wide table of the non-score variables: one row per (id, date), values summed.
time_pivot_df = pd.pivot_table(
    df_machine_daily,
    values='value',
    index=['id', 'date'],
    columns='variable',
    aggfunc='sum',
).reset_index()

In [None]:
# Missing cells per column of the wide table.
time_pivot_df.isna().sum()

In [None]:
app_categories_columns = [
    "appCat.builtin", "appCat.communication", "appCat.entertainment",
    "appCat.finance", "appCat.game", "appCat.office", "appCat.other",
    "appCat.social", "appCat.travel", "appCat.unknown", "appCat.utilities",
    "appCat.weather"
]

# Keep only the categories that exist as columns, so a missing one cannot raise a KeyError.
existing_app_columns = [name for name in app_categories_columns if name in time_pivot_df.columns]

# Missing entries in the app-category columns become 0.
app_columns_filled = time_pivot_df[existing_app_columns].fillna(0)
time_pivot_df[existing_app_columns] = app_columns_filled



In [None]:
columns_to_interpolate = ['call', 'sms']

# Linear interpolation down the rows. limit_direction='both' fills interior gaps
# and pads leading and trailing gaps in one pass, matching a forward pass
# followed by a backward pass.
# NOTE(review): the frame holds rows for several ids one after another, so gaps
# are bridged across id boundaries as well; confirm that is intended.
time_pivot_df[columns_to_interpolate] = time_pivot_df[columns_to_interpolate].interpolate(
    method='linear', limit_direction='both', axis=0
)

# Export the wide table.
time_pivot_df.to_csv('out.csv')



In [None]:
# Keep only the (id, date) pairs present in both wide tables.
merged_df = scores_pivot_df.merge(time_pivot_df, how='inner', on=['id', 'date'])
merged_df

In [None]:
columns_to_interpolate = ['call', 'sms']

# Linear interpolation down the rows; 'both' covers interior, leading and
# trailing gaps in one pass (same result as forward followed by backward).
merged_df[columns_to_interpolate] = merged_df[columns_to_interpolate].interpolate(
    method='linear', limit_direction='both', axis=0
)

# Written to 'out.csv', the same path used for the time_pivot_df export.
merged_df.to_csv('out.csv')
merged_df


In [None]:
columns_to_check = ['mood']

# Keep only the rows that have a mood value. .copy() detaches the result from
# 'merged_df', so later column assignments on 'df_cleaned' operate on an
# independent frame and do not raise SettingWithCopyWarning.
df_cleaned = merged_df.dropna(subset=columns_to_check).copy()
df_cleaned

In [None]:
#columns_to_interpolate = ['activity']

# Perform linear interpolation on the specified columns
#df_cleaned[columns_to_interpolate] = df_cleaned[columns_to_interpolate].interpolate(method='linear', limit_direction='forward', axis=0)
#df_cleaned[columns_to_interpolate] = df_cleaned[columns_to_interpolate].interpolate(method='linear', limit_direction='backward', axis=0)
#df_cleaned

In [None]:
not_null_columns = ["circumplex.valence"]

# Restrict to the columns actually present, so a missing one cannot raise a KeyError.
existing_app_columns = [name for name in not_null_columns if name in df_cleaned.columns]

# Missing entries in those columns become 0; every other column is left as is.
df_cleaned = df_cleaned.fillna({name: 0 for name in existing_app_columns})
df_cleaned


In [None]:
# The 'activity' column is not carried into the cleaned table.
df_cleaned = df_cleaned.drop(columns="activity")

In [None]:
# Remaining missing entries per column.
df_cleaned.isna().sum()

In [None]:
# Persist the cleaned wide table (the index is written as the first column).
df_cleaned.to_csv(path_or_buf='cleaned_data.csv')

In [None]:
# Columns whose name contains 'appCat.'.
app_cat_columns = [name for name in df_cleaned.columns if 'appCat.' in name]

# Row-wise total over all app-category columns.
app_categories_total = df_cleaned[app_cat_columns].sum(axis='columns')
df_cleaned['sum_app_categories'] = app_categories_total

# Independent three-column summary frame.
df_summary = df_cleaned.loc[:, ['sum_app_categories', 'screen', 'call']].copy()
df_summary