In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# from google.colab import files
# uploaded = files.upload()

# Directory that holds the four CSV exports, relative to this notebook.
path = './../data/'

# Read the tables in one pass; the file order matches the names on the left.
users_df, suggestions_df, jbsteps_df, gfsteps_df = (
    pd.read_csv(path + csv_name)
    for csv_name in ('users.csv', 'suggestions.csv', 'jbsteps.csv', 'gfsteps.csv')
)

# print(users_df.head())
# print(suggestions_df.head())
# print(jbsteps_df.head())
# print(gfsteps_df.head())


In [None]:
# 1. Count the missing entries per column in each dataset
(
    missing_values_suggestions,
    missing_values_users,
    missing_values_jbsteps,
    missing_values_gfsteps,
) = (
    frame.isna().sum()
    for frame in (suggestions_df, users_df, jbsteps_df, gfsteps_df)
)

In [None]:
## Print missing values summary
# One print call; sep="\n" puts every header and every table on its own line.
print(
    "Missing Values in Suggestions Dataset:", missing_values_suggestions,
    "\nMissing Values in Users Dataset:", missing_values_users,
    "\nMissing Values in JBSteps Dataset:", missing_values_jbsteps,
    "\nMissing Values in GFSteps Dataset:", missing_values_gfsteps,
    sep="\n",
)

In [None]:
# 2. Collect the column index of each dataset for inspection
columns_suggestions, columns_users, columns_jbsteps, columns_gfsteps = (
    frame.columns
    for frame in (suggestions_df, users_df, jbsteps_df, gfsteps_df)
)

In [None]:
# Print column names
# One print call; sep="\n" puts every header and every column index on its own line.
print(
    "\nColumn Names in Suggestions Dataset:", columns_suggestions,
    "\nColumn Names in Users Dataset:", columns_users,
    "\nColumn Names in JBSteps Dataset:", columns_jbsteps,
    "\nColumn Names in GFSteps Dataset:", columns_gfsteps,
    sep="\n",
)

In [None]:
# 3. Percentage of missing values per column in each dataset.
# Despite the "completeness" name, a higher number means MORE missing data.
(
    data_completeness_suggestions,
    data_completeness_users,
    data_completeness_jbsteps,
    data_completeness_gfsteps,
) = (
    frame.isna().mean() * 100
    for frame in (suggestions_df, users_df, jbsteps_df, gfsteps_df)
)

In [None]:
# Print data completeness
# One print call; sep="\n" puts every header and every table on its own line.
print(
    "\nData Completeness in Suggestions Dataset (% of missing values):", data_completeness_suggestions,
    "\nData Completeness in Users Dataset (% of missing values):", data_completeness_users,
    "\nData Completeness in JBSteps Dataset (% of missing values):", data_completeness_jbsteps,
    "\nData Completeness in GFSteps Dataset (% of missing values):", data_completeness_gfsteps,
    sep="\n",
)

In [None]:
# Step 1: Handling Missing Values
# Columns that are mostly empty are removed here; the gaps that remain in the
# surviving columns are imputed in the next step.

# Keep only columns whose share of missing values is strictly below the
# threshold (a column at exactly 40% missing is dropped).
threshold = 0.4

# .copy() makes every cleaned frame an independent object. Without it the
# frames are label-based selections of the raw data, and the column
# assignments performed by later cells can raise SettingWithCopyWarning.
suggestions_cleaned = suggestions_df.loc[:, suggestions_df.isnull().mean() < threshold].copy()
users_cleaned = users_df.loc[:, users_df.isnull().mean() < threshold].copy()
jbsteps_cleaned = jbsteps_df.loc[:, jbsteps_df.isnull().mean() < threshold].copy()
gfsteps_cleaned = gfsteps_df.loc[:, gfsteps_df.isnull().mean() < threshold].copy()

In [None]:
# Preview the suggestions table after the sparse columns were removed.
print("\n Cleaned suggestions data ", suggestions_cleaned.head(), sep="\n")

In [None]:
# For columns with less missing data, we impute the missing values with the
# median (numeric columns) and then the mode (whatever is still missing).
def _impute_missing(frame):
    """Return a copy of ``frame`` with missing values filled.

    Numeric columns are filled with their median first. Any value that is
    still missing afterwards is filled with the column's most frequent value
    (first row of ``DataFrame.mode()``), computed on the median-filled frame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as ``frame``.
    """
    filled = frame.fillna(frame.median(numeric_only=True))
    modes = filled.mode()
    # mode() has no rows when the frame has no rows; nothing to fill then,
    # and indexing .iloc[0] would raise IndexError.
    if modes.empty:
        return filled
    return filled.fillna(modes.iloc[0])


# Rebind each name to the imputed result instead of writing through
# `.loc[:, :]`, so the frames are never modified as slices of another frame.
suggestions_cleaned = _impute_missing(suggestions_cleaned)
users_cleaned = _impute_missing(users_cleaned)
jbsteps_cleaned = _impute_missing(jbsteps_cleaned)
gfsteps_cleaned = _impute_missing(gfsteps_cleaned)


In [None]:
# Step 2: Data Type Validation
# Convert date/time columns to datetime format if not already.
# Values that cannot be parsed become NaT (errors='coerce').
#
# Plain column assignment replaces the column, so it takes the datetime64
# dtype. `df.loc[:, col] = ...` writes into the existing column instead and,
# on recent pandas versions, can leave it as object dtype.

# Convert in suggestions dataset
if 'sugg.select.utime' in suggestions_cleaned.columns:
    suggestions_cleaned['sugg.select.utime'] = pd.to_datetime(suggestions_cleaned['sugg.select.utime'], errors='coerce')

# Convert in jbsteps dataset
if 'steps.utime' in jbsteps_cleaned.columns:
    jbsteps_cleaned['steps.utime'] = pd.to_datetime(jbsteps_cleaned['steps.utime'], errors='coerce')

# Convert in gfsteps dataset
if 'steps.utime' in gfsteps_cleaned.columns:
    gfsteps_cleaned['steps.utime'] = pd.to_datetime(gfsteps_cleaned['steps.utime'], errors='coerce')

In [None]:

# Collect the (rows, columns) shape of every cleaned table so the effect of the
# column drop and the type corrections can be checked at a glance.
cleaned_data_shapes = {
    f"{label} Cleaned Shape": frame.shape
    for label, frame in (
        ("Suggestions", suggestions_cleaned),
        ("Users", users_cleaned),
        ("JBSteps", jbsteps_cleaned),
        ("GFSteps", gfsteps_cleaned),
    )
}

cleaned_data_shapes

***Feature Engineering:***
We’ll focus on aggregating step data and calculating the impact of suggestions on user activity. Here's how we can approach it:

Aggregating Step Data: We’ll aggregate step data by user, time intervals (e.g., daily or hourly), and then calculate step counts before and after suggestions.

Impact of Suggestions: We’ll compute the difference in steps before and after each suggestion to assess how much the suggestions affect user activity.

Plan:
Aggregate the step data from JBSteps and GFSteps by user and time intervals.

Merge the step data with the Suggestions dataset to analyze the steps before and after the suggestions.

Calculate the difference in steps before and after the suggestion as a feature representing the impact of suggestions.

In [None]:
# Aggregating JBSteps data by user and hour
# Make sure 'steps.utime' is a datetime column so the .dt accessor is available
jbsteps_cleaned['steps.utime'] = pd.to_datetime(jbsteps_cleaned['steps.utime'], errors='coerce')

# Sum the steps per user and hour of day.
# The hour key is renamed to 'hour'; otherwise the result column would still be
# called 'steps.utime' although it holds hours, and could not be joined on 'hour'.
# NOTE(review): .dt.hour is the hour of day, so rows from different dates that
# share an hour are pooled into the same group - confirm this is intended.
jbsteps_agg = (
    jbsteps_cleaned
    .groupby(['user.index', jbsteps_cleaned['steps.utime'].dt.hour.rename('hour')])
    .agg(total_steps_jb=('steps', 'sum'))
    .reset_index()
)


In [None]:
# Aggregating GFSteps data by user and hour
# Make sure 'steps.utime' is a datetime column so the .dt accessor is available
gfsteps_cleaned['steps.utime'] = pd.to_datetime(gfsteps_cleaned['steps.utime'], errors='coerce')

# Sum the steps per user and hour of day.
# The hour key is renamed to 'hour'; otherwise the result column would still be
# called 'steps.utime' although it holds hours, and could not be joined on 'hour'.
# NOTE(review): .dt.hour is the hour of day, so rows from different dates that
# share an hour are pooled into the same group - confirm this is intended.
gfsteps_agg = (
    gfsteps_cleaned
    .groupby(['user.index', gfsteps_cleaned['steps.utime'].dt.hour.rename('hour')])
    .agg(total_steps_gf=('steps', 'sum'))
    .reset_index()
)

In [None]:
# The step tables are joined on user.index and hour, so derive the hour of day
# of each suggestion (assumes suggestion times are aligned with hours).
suggestion_times = pd.to_datetime(suggestions_cleaned['sugg.select.utime'])
suggestions_cleaned.loc[:, 'hour'] = suggestion_times.dt.hour

In [None]:
# Store the hour of day of every Jawbone record as its own column
jbsteps_cleaned['hour'] = jbsteps_cleaned['steps.utime'].dt.hour

# Total steps per (user, hour of day); the summed column is named total_steps_jb
jbsteps_agg = (
    jbsteps_cleaned
    .groupby(['user.index', 'hour'])['steps']
    .sum()
    .reset_index(name='total_steps_jb')
)

# Preview the aggregate, now keyed by a proper 'hour' column
jbsteps_agg.head()

In [None]:
# Store the hour of day of every Google Fit record as its own column
gfsteps_cleaned['hour'] = gfsteps_cleaned['steps.utime'].dt.hour

# Total steps per (user, hour of day); the summed column is named total_steps_gf
gfsteps_agg = (
    gfsteps_cleaned
    .groupby(['user.index', 'hour'])['steps']
    .sum()
    .reset_index(name='total_steps_gf')
)

# Preview the aggregate, now keyed by a proper 'hour' column
gfsteps_agg.head()

In [None]:
# Attach the Jawbone totals to every suggestion (left join keeps all suggestions).
# NOTE(review): the join keys carry no date, so each suggestion receives whatever
# total jbsteps_agg holds for that user/hour pair - confirm this is intended.
merged_data_jb = suggestions_cleaned.merge(jbsteps_agg, how='left', on=['user.index', 'hour'])

In [None]:
# Attach the Google Fit totals as well (left join keeps every suggestion row).
# NOTE(review): the join keys carry no date, so each suggestion receives whatever
# total gfsteps_agg holds for that user/hour pair - confirm this is intended.
merged_data = merged_data_jb.merge(gfsteps_agg, how='left', on=['user.index', 'hour'])

In [None]:
# Calculate the difference in steps using available step data
# jbsteps60pre / gfsteps60pre are used as the step counts before the suggestion
# (if they exist) and total_steps_jb / total_steps_gf as the counts after it.
# NOTE(review): the totals come from the per-user/hour aggregates merged above -
# confirm they really represent activity after the suggestion.

# Calculating step difference for Jawbone
if 'jbsteps60pre' in merged_data.columns:
    merged_data['step_diff_jb'] = merged_data['total_steps_jb'] - merged_data['jbsteps60pre']
else:
    # np.nan keeps the column numeric; None would create an object-dtype column
    # that later fillna / scaling steps have to coerce.
    merged_data['step_diff_jb'] = np.nan

# Calculating step difference for Google Fit
if 'gfsteps60pre' in merged_data.columns:
    merged_data['step_diff_gf'] = merged_data['total_steps_gf'] - merged_data['gfsteps60pre']
else:
    merged_data['step_diff_gf'] = np.nan

# Display the first few rows of the dataset with step differences
merged_data[['user.index', 'step_diff_jb', 'step_diff_gf']].head()

In [None]:
# Merge the suggestions + JBSteps table (merged_data_jb) with the Users dataset
final_merged_data = pd.merge(merged_data_jb, users_cleaned, how='left', on='user.index')

# Now, select relevant columns from Suggestions and Users datasets
# Relevant columns from Users dataset (e.g., demographic and survey data)
users_features_updated = [
    'user.index', 'intake.survey.utime', 'exit.survey.utime',
    'modact.metmins.exit', 'walk.time.exit', 'vigact.time.exit',
    'sit.time.exit', 'ipaq.hepa.exit'
]

# Selecting relevant columns from Suggestions dataset (e.g., suggestion type, time slots, user context)
suggestions_features = ['user.index', 'send.active', 'send.sedentary', 'sugg.select.slot', 'dec.temperature', 'dec.windspeed']

# 'user.index' appears in both lists. Selecting the plain concatenation would
# yield two columns with that name, so de-duplicate while keeping first-seen order.
selected_columns = list(dict.fromkeys(suggestions_features + users_features_updated))

# Extract these relevant columns for the final dataset
final_selected_data = final_merged_data[selected_columns]

# Display the first few rows of the final dataset with selected features
final_selected_data.head()

**Handling Missing Data:** We should handle the missing values in both step_diff_jb and step_diff_gf.

**Fill missing values with a default value (e.g., 0 for no step change)**.

**Drop rows with missing values.**

**Normalization:**

In [None]:
# Handle missing data
# A missing step difference is replaced by 0, i.e. treated as "no change".
for diff_col in ('step_diff_jb', 'step_diff_gf'):
    merged_data.loc[:, diff_col] = merged_data[diff_col].fillna(0)


In [None]:
# Normalize the step differences for further analysis
from sklearn.preprocessing import MinMaxScaler

# Source columns and the names of their normalized counterparts
diff_columns = ['step_diff_jb', 'step_diff_gf']
norm_columns = [f'{name}_norm' for name in diff_columns]

# One scaler is fitted on both columns (Jawbone and Google Fit) together;
# each column is scaled by its own minimum and maximum.
scaler = MinMaxScaler()
merged_data[norm_columns] = scaler.fit_transform(merged_data[diff_columns])

# Display the normalized columns
merged_data[['user.index'] + norm_columns].head()

**Statistical Analysis**.
The goal here is to assess whether different types of suggestions (e.g., active vs. sedentary) lead to significant changes in user activity.

Plan:

**Hypothesis:** We test whether active suggestions lead to a significant increase in steps compared to sedentary suggestions.

Statistical Tests:
T-tests: Compare the mean step differences between groups (e.g., active vs. sedentary suggestions).

ANOVA: If we want to test across multiple groups or suggestion types, we can use ANOVA.
Implementation:
Split the data based on the type of suggestion (active vs. sedentary).
Run statistical tests (t-test) to check if the step difference after active suggestions is significantly higher than after sedentary suggestions.

In [None]:
# Visualize a t-statistic and its two-tailed p-value on a standard normal curve
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# NOTE(review): these two values are hard-coded; the test that produced them is
# not part of the visible notebook - confirm they are still current.
t_stat = 1.15
p_value = 0.250

# Standard normal density evaluated on a grid from -4 to 4
mean = 0
std_dev = 1
x = np.linspace(-4, 4, 1000)
y = norm(loc=mean, scale=std_dev).pdf(x)

fig, ax = plt.subplots(figsize=(10, 6))

# The density itself
ax.plot(x, y, label='Normal Distribution (t-distribution approximation)', color='blue')

# Vertical marker at the observed statistic
ax.axvline(x=t_stat, color='red', linestyle='--', label=f't-statistic: {t_stat:.2f}')

# Shade both tails beyond +/-|t| (two-tailed rejection region)
in_tails = np.abs(x) >= abs(t_stat)
ax.fill_between(x, 0, y, where=in_tails, color='red', alpha=0.3, label=f'p-value: {p_value:.3f}')

# Titles and axis labels
ax.set_title('T-Test Visualization: P-value on the Bell Curve')
ax.set_xlabel('t-statistic using JBSteps & Suggestions Data')
ax.set_ylabel('Probability Density')
ax.legend()

plt.show()


Interpretation:
The p-value of 0.250 (the value used in the plot above) suggests that the difference in step counts between active and sedentary suggestions is not statistically significant at typical significance levels (e.g., 0.05).
This means that, based on the current data, we do not have strong evidence to conclude that active suggestions lead to significantly more steps than sedentary suggestions.

In [None]:
# Check the column names in the merged_data
# (the time-series cells below select 'sugg.select.utime' and 'total_steps_jb' from it)
print(merged_data.columns)

In [None]:
# Build the time series used for forecasting: one row per suggestion, indexed
# by the suggestion timestamp, holding the Jawbone step total.
# You can also use 'total_steps_gf' if needed.
#
# The chain produces a new, independent frame, so no column is assigned onto a
# slice of merged_data (which raises SettingWithCopyWarning).
time_series_data = (
    merged_data[['sugg.select.utime', 'total_steps_jb']]
    # Convert 'sugg.select.utime' to datetime; unparseable values become NaT
    .assign(**{
        'sugg.select.utime': lambda frame: pd.to_datetime(frame['sugg.select.utime'], errors='coerce')
    })
    # Drop rows without a step total, and rows whose timestamp could not be
    # parsed (they cannot be placed on the time axis)
    .dropna()
    # Set 'sugg.select.utime' as the index and sort the data by time;
    # mergesort is stable, so rows with equal timestamps keep their original order
    .set_index('sugg.select.utime')
    .sort_index(kind='mergesort')
)

# Display the first few rows to check
print(time_series_data.head())


In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the step series; the first two entries of the
# returned tuple are the test statistic and its p-value.
result = adfuller(time_series_data['total_steps_jb'])
adf_statistic, adf_p_value = result[:2]
print('ADF Statistic:', adf_statistic)
print('p-value:', adf_p_value)

# Interpretation: If p-value > 0.05, the series is non-stationary, and differencing may be required

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# ARIMA(1, 0, 1) on the original series: d=0 means no differencing is applied.
# Adjust p, d, q as needed.
model = ARIMA(endog=time_series_data['total_steps_jb'], order=(1, 0, 1))
model_fit = model.fit()

# Show coefficients and fit statistics
print(model_fit.summary())

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# In-sample one-step-ahead predictions over the whole series
observed_steps = time_series_data['total_steps_jb']
last_position = len(time_series_data) - 1
predictions = model_fit.predict(start=0, end=last_position, dynamic=False)

# Error metrics of the predictions against the observed series
mse = mean_squared_error(observed_steps, predictions)
mae = mean_absolute_error(observed_steps, predictions)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}, MAE: {mae}')

In [None]:
import itertools

# Candidate orders: every (p, d, q) with each term in {0, 1, 2}
p = d = q = range(0, 3)
pdq = list(itertools.product(p, d, q))

best_mse, best_params = float("inf"), None

# Orders whose fit failed, kept so that failures are reported instead of hidden
failed_params = []

# Every candidate is fitted and scored on the same unscaled series. Min-max
# scaling only rescales the error linearly, so it would not change which order
# wins, and this frame has no 'total_steps_jb_scaled' column to fit on.
actual_steps = time_series_data['total_steps_jb']

for param in pdq:
    try:
        candidate_fit = ARIMA(actual_steps, order=param).fit()

        # In-sample predictions over the whole series
        candidate_predictions = candidate_fit.predict(
            start=0, end=len(actual_steps) - 1, dynamic=False
        )
        candidate_mse = mean_squared_error(actual_steps, candidate_predictions)
    except (ValueError, np.linalg.LinAlgError) as exc:
        # Only skip orders the estimator rejects or cannot solve numerically;
        # any other error (e.g. a missing column) should surface.
        failed_params.append((param, repr(exc)))
        continue

    if candidate_mse < best_mse:
        best_mse, best_params = candidate_mse, param

if failed_params:
    print(f'{len(failed_params)} of {len(pdq)} orders could not be fitted: {failed_params}')

print(f'Best ARIMA parameters: {best_params} with MSE: {best_mse}')


ARIMA failed severely, so we try to improve the forecast with another model: XGBoost.

In [None]:
# When several rows share a timestamp, keep only the first one
# (Index.duplicated marks every later occurrence by default).
time_series_data = time_series_data[~time_series_data.index.duplicated()]

# Now create the lag features
def create_lag_features(data, lag=5, column='total_steps_jb'):
    """Build lagged copies of one column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the series to lag; its row order defines "previous".
    lag : int, default 5
        Number of lag columns to create (``lag_1`` ... ``lag_<lag>``).
    column : str, default 'total_steps_jb'
        Name of the column in ``data`` to shift.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``data`` and one column per lag, where
        ``lag_i`` holds the value of ``column`` from ``i`` rows earlier. The
        first ``i`` rows of ``lag_i`` are missing. No columns when ``lag < 1``.
    """
    series = data[column]
    # Start from the input index so the result is aligned with `data` even
    # when no lag column is requested.
    lagged_data = pd.DataFrame(index=data.index)
    for i in range(1, lag + 1):
        lagged_data[f'lag_{i}'] = series.shift(i)
    return lagged_data

# Build the five lag columns for total_steps_jb, discard the leading rows that
# lack a full history, then attach the value to predict for each remaining row.
lagged_features = (
    create_lag_features(time_series_data, lag=5)
    .dropna()
    .assign(target=lambda lags: time_series_data.loc[lags.index, 'total_steps_jb'])
)

# Preview the lagged dataset
print(lagged_features.head())


In [None]:
# Chronological split: the first 80% of the rows train the model,
# the remaining 20% are held out for testing.
split_index = int(len(lagged_features) * 0.8)

train, test = lagged_features.iloc[:split_index], lagged_features.iloc[split_index:]

# Separate the predictors (X) from the target column (y)
X_train, y_train = train.drop(columns='target'), train['target']
X_test, y_test = test.drop(columns='target'), test['target']


In [None]:
import xgboost as xgb

# Wrap the train and test sets in XGBoost's DMatrix container
train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
test_dmatrix = xgb.DMatrix(data=X_test, label=y_test)

# Booster settings: squared-error regression, depth-5 trees, learning rate 0.1
params = dict(objective='reg:squarederror', max_depth=5, eta=0.1)

# Fit 100 boosting rounds on the training set
xgboost_model = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=100)

# Predict the held-out rows
y_pred = xgboost_model.predict(test_dmatrix)

# Show the first few predictions
print(y_pred[:5])


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Score the held-out predictions: squared and absolute error, then the root of the former
mse, mae = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error)
)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}, MAE: {mae}')


In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import xgboost as xgb

# Set up the parameter grid
# NOTE: this grid has 2,304 combinations; with 3 splits that is 6,912 fits.
param_grid = {
    'max_depth': [3, 5, 7, 10],          # Controls the depth of each tree
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'n_estimators': [100, 200, 300],     # Number of trees (boosting rounds)
    'min_child_weight': [1, 3, 5],       # Minimum sum of instance weight needed in a child
    'gamma': [0, 0.1, 0.2, 0.5],         # Minimum loss reduction required to make a further partition on a leaf node
    'subsample': [0.8, 1.0],             # Subsampling ratio of training instances
    'colsample_bytree': [0.8, 1.0],      # Subsampling ratio of features for each tree
}

# Initialize the model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# The rows are time-ordered lag features. Plain 3-fold CV would validate some
# folds on rows that precede the rows used for training; TimeSeriesSplit always
# trains on earlier rows and validates on later ones.
time_ordered_cv = TimeSeriesSplit(n_splits=3)

# Perform Grid Search with 3 forward-chaining splits
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=time_ordered_cv,
    scoring='neg_mean_squared_error',
    verbose=1,
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
print(f"Best parameters found: {grid_search.best_params_}")

# best_estimator_ is the best configuration refitted on the full training set
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate error metrics
mse_best = mean_squared_error(y_test, y_pred)
rmse_best = np.sqrt(mse_best)
mae_best = mean_absolute_error(y_test, y_pred)

print(f"Best Model MSE: {mse_best}, RMSE: {rmse_best}, MAE: {mae_best}")


In [None]:
# Create new time-based features from the timestamp index ('sugg.select.utime').
# assign() returns a new frame, so the columns are not written onto the
# filtered frame produced by the de-duplication step (which raises
# SettingWithCopyWarning), and re-running the cell gives the same result.
time_series_data = time_series_data.assign(
    hour=time_series_data.index.hour,            # Hour of the day
    day=time_series_data.index.day,              # Day of the month
    dayofweek=time_series_data.index.dayofweek,  # Day of the week (0=Monday, 6=Sunday)
    month=time_series_data.index.month,          # Month of the year
)

# Display the first few rows to see the new features
print(time_series_data[['total_steps_jb', 'hour', 'day', 'dayofweek', 'month']].head())


In [None]:
# Create lag features (e.g., steps from previous time steps)
def create_lag_features(data, lag=5, column='total_steps_jb'):
    """Build lagged copies of one column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the series to lag; its row order defines "previous".
    lag : int, default 5
        Number of lag columns to create (``lag_1`` ... ``lag_<lag>``).
    column : str, default 'total_steps_jb'
        Name of the column in ``data`` to shift.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``data`` and one column per lag, where
        ``lag_i`` holds the value of ``column`` from ``i`` rows earlier. The
        first ``i`` rows of ``lag_i`` are missing. No columns when ``lag < 1``.
    """
    series = data[column]
    # Start from the input index so the result is aligned with `data` even
    # when no lag column is requested.
    lagged_data = pd.DataFrame(index=data.index)
    for i in range(1, lag + 1):
        lagged_data[f'lag_{i}'] = series.shift(i)
    return lagged_data

# Assemble the modelling table side by side: the five lag columns, the
# calendar features, and the current step total as 'target'. Rows that lack a
# complete lag history are then dropped.
lagged_features = pd.concat(
    [
        create_lag_features(time_series_data, lag=5),
        time_series_data[['hour', 'day', 'dayofweek', 'month']],
        time_series_data['total_steps_jb'].rename('target'),
    ],
    axis=1,
).dropna()

# Preview the dataset
print(lagged_features.head())


In [None]:
# Chronological split: the first 80% of the rows train the model,
# the remaining 20% are held out for testing.
split_index = int(len(lagged_features) * 0.8)

train, test = lagged_features.iloc[:split_index], lagged_features.iloc[split_index:]

# Separate the predictors (X) from the target column (y)
X_train, y_train = train.drop(columns='target'), train['target']
X_test, y_test = test.drop(columns='target'), test['target']


In [None]:
import xgboost as xgb

# Wrap the train and test sets in XGBoost's DMatrix container
train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
test_dmatrix = xgb.DMatrix(data=X_test, label=y_test)

# Booster settings: squared-error regression, depth-5 trees, learning rate 0.1
params = dict(objective='reg:squarederror', max_depth=5, eta=0.1)

# Fit 100 boosting rounds on the training set
xgboost_model = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=100)

# Predict the held-out rows
y_pred = xgboost_model.predict(test_dmatrix)

# Show the first few predictions
print(y_pred[:5])


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Score the held-out predictions: squared and absolute error, then the root of the former
mse, mae = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error)
)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, RMSE: {rmse}, MAE: {mae}')