In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline


In [18]:
base_dir = "data/"

## **Features**

In [19]:
# load users data
users = pd.read_csv(base_dir + "users.csv")
users.head()

Unnamed: 0,userid,Name,Age,Motivation,Segment
0,0,Karen Jones,28,Feeling Healthy,1
1,1,Lin Wei,43,Maintain Independence,3
2,2,Peter Tan,27,Manage Stress,2
3,3,Nisha Nair,28,Maintain Independence,3
4,4,Elizabeth Tan,46,Maintain Independence,1


In [20]:
# load task data
tasks = pd.read_csv(base_dir + "tasks.csv")
tasks.head()

Unnamed: 0,taskid,Activity,Type,Duration (minutes),Intensity
0,0,Walking 10000 steps,Steps Tracking,60,Low
1,1,Jogging 5000 steps,Steps Tracking,30,Medium
2,2,Running 7000 steps,Steps Tracking,40,High
3,3,Starting daily step count,Start Steps,5,Low
4,4,Walking in place for 5 minutes,Start Steps,5,Low


In [21]:
# load user_task data
userTasks = pd.read_csv(base_dir + "usertasks.csv")
userTasks

Unnamed: 0,interaction_id,taskid,userid,completion,like
0,0,0,0,1,False
1,1,1,0,0,False
2,2,2,0,0,False
3,3,3,0,0,False
4,4,4,0,0,False
...,...,...,...,...,...
5884,5884,61,99,1,True
5885,5885,62,99,0,False
5886,5886,63,99,1,True
5887,5887,64,99,0,False


### Join users and tasks

In [22]:
# join users and tasks
userTasksFeature = pd.merge(userTasks, users, on=["userid"], how='left')
userTasksFeature = pd.merge(userTasksFeature, tasks, on=["taskid"], 
                            how='left')

userTasksFeature

Unnamed: 0,interaction_id,taskid,userid,completion,like,Name,Age,Motivation,Segment,Activity,Type,Duration (minutes),Intensity
0,0,0,0,1,False,Karen Jones,28,Feeling Healthy,1,Walking 10000 steps,Steps Tracking,60,Low
1,1,1,0,0,False,Karen Jones,28,Feeling Healthy,1,Jogging 5000 steps,Steps Tracking,30,Medium
2,2,2,0,0,False,Karen Jones,28,Feeling Healthy,1,Running 7000 steps,Steps Tracking,40,High
3,3,3,0,0,False,Karen Jones,28,Feeling Healthy,1,Starting daily step count,Start Steps,5,Low
4,4,4,0,0,False,Karen Jones,28,Feeling Healthy,1,Walking in place for 5 minutes,Start Steps,5,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5884,5884,61,99,1,True,Ananda Krishna,52,Feeling Healthy,1,Breathing exercises for relaxation,Sleep Exercise,15,Low
5885,5885,62,99,0,False,Ananda Krishna,52,Feeling Healthy,1,Yoga for better sleep,Sleep Exercise,20,Low
5886,5886,63,99,1,True,Ananda Krishna,52,Feeling Healthy,1,Power nap during the day,Sleep Recharge,20,Low
5887,5887,64,99,0,False,Ananda Krishna,52,Feeling Healthy,1,Relaxation time in the afternoon,Sleep Recharge,30,Low


## Split Data

In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Preprocessing function for training data
def preprocess_for_decision_tree(df):
    # Define label encoders for categorical variables
    encoders = {
        'Motivation': LabelEncoder(),
        'Segment': LabelEncoder(),
        'Activity': LabelEncoder(),
        'Type': LabelEncoder(),
        'Intensity': LabelEncoder()
    }
    
    # Encode each categorical column
    for column, encoder in encoders.items():
        df[column] = encoder.fit_transform(df[column])
    
    # Rename columns for consistency and remove spaces
    df.columns = [col.replace(' ', '_') for col in df.columns]
    
    return df, encoders

# Function to save label encoders
def save_label_encoders(encoders, file_path):
    with open(file_path, 'wb') as file:
        pickle.dump(encoders, file)

# Function to load label encoders
def load_label_encoders(file_path):
    with open(file_path, 'rb') as file:
        encoders = pickle.load(file)
    return encoders

# Function to preprocess new data using loaded encoders
def preprocess_new_data(input_df, encoders):
    df = input_df.copy()
    for column, encoder in encoders.items():
        try:
            df[column] = encoder.transform(df[column])
        except:
            pass

    # Rename columns for consistency and remove spaces
    df.columns = [col.replace(' ', '_') for col in df.columns]
    
    return df

X = userTasksFeature.drop(['completion', 'like', 'interaction_id', 'taskid', 'userid', 'Name'] , axis=1)
y = userTasksFeature[["completion", "like"]].values


X, encoders = preprocess_for_decision_tree(X)
save_label_encoders(encoders, 'label_encoders.pkl')

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=1)

## Build Model

In [25]:
X_train

Unnamed: 0,Age,Motivation,Segment,Activity,Type,Duration_(minutes),Intensity
1344,25,2,2,28,5,15,1
5420,38,2,2,6,12,20,1
3644,47,1,0,42,5,30,1
4388,37,1,1,38,2,10,1
2440,38,1,2,7,11,20,2
...,...,...,...,...,...,...,...
905,33,1,2,11,21,25,0
5192,47,1,1,16,6,30,2
3980,63,0,2,38,2,10,1
235,46,1,0,29,20,30,2


In [26]:
y_train

array([[0, False],
       [0, False],
       [0, False],
       ...,
       [0, True],
       [0, False],
       [0, True]], dtype=object)

In [27]:
import lightgbmmt as lgb
import numpy as np
from sklearn.metrics import roc_auc_score


num_labels = 2

def self_metric(preds, train_data):
    labels = train_data.get_label()
    labels2 = labels.reshape((num_labels, -1)).transpose()[:,0]
    preds2 = preds.reshape((num_labels, -1)).transpose()[:,0]
    preds2 = 1. / (1. + np.exp(-preds2))
    score = roc_auc_score(labels2, preds2)

    return 'self_metric', score, False


def mymse2(preds, train_data, ep = 0):
    labels = train_data.get_label()
    labels2 = labels.reshape((num_labels, -1)).transpose()
    preds2 = preds.reshape((num_labels, -1)).transpose()
    grad2 = (preds2 - labels2)
    grad = grad2 * np.array([1.5, 0.001])
    grad = np.sum(grad, axis = 1)
    grad2 = grad2.transpose().reshape((-1))
    hess = grad * 0. + 1
    hess2 = grad2 * 0. + 1
    return grad, hess, grad2, hess2

param = {
    'num_leaves': 48, 
    'max_depth': 6,
    'learning_rate': .03,
    'max_bin': 200,
    'lambda_l1': 0.1,
    'lambda_l2': 0.2,
    'verbose': 5,

    # multitask
    'objective': 'custom',
    'num_labels': num_labels, 
    'tree_learner': 'serial2',
    'num_threads': 4}    


evals_result_mt = {}
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)
model = lgb.train(param,
                train_data,
                verbose_eval=10,
                fobj=mymse2, 
                feval=self_metric,
                num_boost_round=200,
                valid_sets=[test_data],
                evals_result=evals_result_mt)
model.set_num_labels(2)

[LightGBM] [Info] Length of label is not same with #data
[LightGBM] [Info] Total Bins 148
[LightGBM] [Info] Number of data points in the train set: 4711, number of used features: 7
[LightGBM] [Info] Length of label is not same with #data
[LightGBM] [Debug] Trained a tree with leaves = 47 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 47 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 48 and max_depth = 6
[Li

In [47]:
import pickle

with open('model/model.pkl', 'wb') as file:
    pickle.dump(model, file)


In [44]:
model.predict(X_test)

inner_predict 2356


array([[0.331868  , 0.42453904],
       [0.36373009, 0.41014847],
       [0.42586556, 0.44912955],
       ...,
       [0.53867309, 0.42624756],
       [0.36376169, 0.44990994],
       [0.43382574, 0.41909875]])

## Getting Predictions new a user

In [41]:
def create_prediction_for_user(user_df):
    loaded_encoders = load_label_encoders('label_encoders.pkl')
    user_inference_df = user_df.merge(tasks, how='cross') # cross join
    # create training data 
    user_processed_inference_df = preprocess_new_data(user_inference_df, loaded_encoders)
    user_processed_inference_df = user_processed_inference_df[["Age", "Motivation", "Segment", 
                                                               "Activity", "Type", "Duration_(minutes)", "Intensity"]]
    user_processed_inference_df = user_processed_inference_df.fillna(0)

    user_prediction = model.predict(user_processed_inference_df)

    user_prediction = pd.DataFrame(data=user_prediction, columns=["Completion Prob", "Like Prob"])
    output_df = pd.concat([user_inference_df, user_prediction], axis=1)
    output_df = output_df.sort_values(by="Completion Prob", ascending=False)

    return output_df

In [42]:
data = {
    'Name': ['Nate Lee'],
    'Age': [35],
    'Motivation': ['Feeling Healthy'],
    'Segment': ['1']
}

nate_df = pd.DataFrame(data)
nate_prediction = create_prediction_for_user(nate_df)
nate_prediction.head(20)

inner_predict 132


Unnamed: 0,Name,Age,Motivation,Segment,taskid,Activity,Type,Duration (minutes),Intensity,Completion Prob,Like Prob
62,Nate Lee,35,Feeling Healthy,1,62,Yoga for better sleep,Sleep Exercise,20,Low,0.758466,0.492997
0,Nate Lee,35,Feeling Healthy,1,0,Walking 10000 steps,Steps Tracking,60,Low,0.468615,0.344021
63,Nate Lee,35,Feeling Healthy,1,63,Power nap during the day,Sleep Recharge,20,Low,0.452985,0.40692
35,Nate Lee,35,Feeling Healthy,1,35,Using a meal logging app,Simple Meal Log,15,Low,0.446268,0.366575
29,Nate Lee,35,Feeling Healthy,1,29,10-minute desk stretches,Sedentary Simple,10,Low,0.442088,0.400216
64,Nate Lee,35,Feeling Healthy,1,64,Relaxation time in the afternoon,Sleep Recharge,30,Low,0.440259,0.410446
34,Nate Lee,35,Feeling Healthy,1,34,Recording food intake after each meal,Simple Meal Log,5,Low,0.421177,0.441658
49,Nate Lee,35,Feeling Healthy,1,49,Using a sleep tracking app,Sleep Tracking,10,Low,0.413413,0.312049
11,Nate Lee,35,Feeling Healthy,1,11,10-minute cycling,Start MVPA,10,Medium,0.411393,0.359703
2,Nate Lee,35,Feeling Healthy,1,2,Running 7000 steps,Steps Tracking,40,High,0.410207,0.376357


In [43]:
data = {
    'Name': ['Akshay Anand'],
    'Age': [40],
    'Motivation': ['Maintain Independence'],
    'Segment': ['2']
}

akshay_df = pd.DataFrame(data)
akshay_prediction = create_prediction_for_user(akshay_df)
akshay_prediction.head(20)

inner_predict 132


Unnamed: 0,Name,Age,Motivation,Segment,taskid,Activity,Type,Duration (minutes),Intensity,Completion Prob,Like Prob
57,Akshay Anand,40,Maintain Independence,2,57,Setting a consistent bedtime,Encourage Consistent Bedtime,5,Low,0.650845,0.415734
58,Akshay Anand,40,Maintain Independence,2,58,Creating a bedtime routine,Encourage Consistent Bedtime,20,Low,0.593886,0.438838
37,Akshay Anand,40,Maintain Independence,2,37,Tracking energy levels post-meal,Meal Log Benefit,10,Low,0.555513,0.404369
39,Akshay Anand,40,Maintain Independence,2,39,Substituting sugary snacks with fruits,Healthier Dietary Substitutes,10,Low,0.55009,0.451139
60,Akshay Anand,40,Maintain Independence,2,60,Light stretching before bed,Sleep Exercise,10,Low,0.535231,0.41916
40,Akshay Anand,40,Maintain Independence,2,40,Replacing white bread with whole grain,Healthier Dietary Substitutes,10,Low,0.534593,0.435379
33,Akshay Anand,40,Maintain Independence,2,33,Logging meals daily,Simple Meal Log,10,Low,0.532077,0.420293
41,Akshay Anand,40,Maintain Independence,2,41,Opting for lean proteins,Healthier Dietary Substitutes,10,Low,0.531521,0.43485
59,Akshay Anand,40,Maintain Independence,2,59,Using an alarm to signal bedtime,Encourage Consistent Bedtime,5,Low,0.525023,0.36615
22,Akshay Anand,40,Maintain Independence,2,22,20-minute sprint intervals,Vigorous Aerobic,20,High,0.518958,0.338998
