In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline


In [51]:
# Directory holding the input CSV files. Later cells build file paths by plain
# string concatenation (base_dir + "<file>.csv"), so the trailing slash matters.
base_dir = "data/"

## **Features**

In [52]:
# Read the user profile table from disk and preview its first rows.
users = pd.read_csv(f"{base_dir}users_new.csv")
users.head()

Unnamed: 0,userid,Name,Age,Motivation,Overall Health Status
0,0,Liu Li,49,Reduce Stress,Poor
1,1,Samantha Jones,50,Build Muscle,Fair
2,2,Siti binti Zainal,18,Increase Energy,Fair
3,3,Michael Smith,55,Build Muscle,Poor
4,4,Tan Ah Kow,46,Build Muscle,Fair


In [53]:
# Read the task catalogue from disk and preview its first rows.
tasks = pd.read_csv(f"{base_dir}tasks_new.csv")
tasks.head()

Unnamed: 0,taskid,Activity,Type,Duration (minutes),Intensity
0,0,Walking 1000 steps,Physical,10,Low
1,1,Sleep 8 hours,Rest,480,Low
2,2,Jogging for 30 minutes,Cardio,30,Medium
3,3,Yoga for 1 hour,Flexibility,60,Low
4,4,Cycling for 45 minutes,Cardio,45,Medium


In [54]:
# Read the user/task interaction log (one row per interaction, carrying the
# `completion` and `like` outcome columns) and display it.
userTasks = pd.read_csv(f"{base_dir}usertasks_new_multi_tasks.csv")
userTasks

Unnamed: 0,interaction_id,taskid,userid,completion,like
0,0,0,0,0,0
1,1,2,0,1,0
2,2,4,0,0,0
3,3,7,0,0,0
4,4,8,0,1,0
...,...,...,...,...,...
5912,5912,88,99,0,1
5913,5913,89,99,0,0
5914,5914,90,99,0,0
5915,5915,94,99,1,1


### Join users and tasks

In [55]:
# Enrich every interaction with its user's profile and its task's description.
# Both joins are left joins, so interactions without a matching user/task row
# are kept (their joined columns would be NaN).
userTasksFeature = (
    userTasks
    .merge(users, on=["userid"], how="left")
    .merge(tasks, on=["taskid"], how="left")
)

userTasksFeature

Unnamed: 0,interaction_id,taskid,userid,completion,like,Name,Age,Motivation,Overall Health Status,Activity,Type,Duration (minutes),Intensity
0,0,0,0,0,0,Liu Li,49,Reduce Stress,Poor,Walking 1000 steps,Physical,10,Low
1,1,2,0,1,0,Liu Li,49,Reduce Stress,Poor,Jogging for 30 minutes,Cardio,30,Medium
2,2,4,0,0,0,Liu Li,49,Reduce Stress,Poor,Cycling for 45 minutes,Cardio,45,Medium
3,3,7,0,0,0,Liu Li,49,Reduce Stress,Poor,Strength training for 1 hour,Strength,60,High
4,4,8,0,1,0,Liu Li,49,Reduce Stress,Poor,Dancing for 30 minutes,Cardio,30,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5912,5912,88,99,0,1,Gan Wei Ling,20,Improve Sleep,Good,Darts for 1 hour,Leisure,60,Low
5913,5913,89,99,0,0,Gan Wei Ling,20,Improve Sleep,Good,Bowling for 1 hour,Leisure,60,Low
5914,5914,90,99,0,0,Gan Wei Ling,20,Improve Sleep,Good,Mini-golf for 1 hour,Leisure,60,Low
5915,5915,94,99,1,1,Gan Wei Ling,20,Improve Sleep,Good,Trampolining for 1 hour,Fun,60,Medium


## Encode Features and Split Data

In [56]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Preprocessing function for training data
def preprocess_for_decision_tree(df):
    """Label-encode the categorical feature columns of a training frame.

    A fresh ``LabelEncoder`` is fitted for each categorical column, the column
    is replaced by its integer codes, and spaces in all column names are
    replaced by underscores. The caller's frame is NOT modified; a transformed
    copy is returned (the same contract as ``preprocess_new_data``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Motivation', 'Overall Health Status',
        'Activity', 'Type' and 'Intensity'.

    Returns
    -------
    tuple
        ``(encoded_df, encoders)``. ``encoders`` maps each ORIGINAL column
        name (with spaces, e.g. 'Overall Health Status') to its fitted
        ``LabelEncoder``.

    Raises
    ------
    KeyError
        If one of the categorical columns is missing from ``df``.
    """
    # Work on a copy so the caller's DataFrame is not silently rewritten.
    df = df.copy()

    # Define label encoders for categorical variables
    encoders = {
        'Motivation': LabelEncoder(),
        'Overall Health Status': LabelEncoder(),
        'Activity': LabelEncoder(),
        'Type': LabelEncoder(),
        'Intensity': LabelEncoder()
    }

    # Encode each categorical column
    for column, encoder in encoders.items():
        df[column] = encoder.fit_transform(df[column])

    # Rename columns for consistency and remove spaces. This happens after
    # encoding because the encoder keys use the original spaced names.
    df.columns = [col.replace(' ', '_') for col in df.columns]

    return df, encoders

# Function to save label encoders
def save_label_encoders(encoders, file_path):
    """Pickle ``encoders`` to ``file_path``, overwriting any existing file.

    Parameters
    ----------
    encoders : object
        Any picklable object; in this notebook, the dict of fitted encoders.
    file_path : str
        Destination path, opened in binary write mode.
    """
    with open(file_path, mode='wb') as sink:
        pickle.dump(encoders, sink)

# Function to load label encoders
def load_label_encoders(file_path):
    """Return the object unpickled from ``file_path``.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    produced by a trusted source (here: ``save_label_encoders``).
    """
    with open(file_path, mode='rb') as source:
        return pickle.load(source)

# Function to preprocess new data using loaded encoders
def preprocess_new_data(input_df, encoders):
    """Apply already-fitted encoders to a new frame (best effort).

    Each encoder in ``encoders`` is applied to the column of the same name.
    Spaces in all column names are then replaced by underscores. The input
    frame is not modified.

    Encoding stays best-effort: a column that is absent from ``input_df`` is
    skipped, and a column whose ``transform`` fails (for example because it
    contains a value the encoder never saw) is left unchanged. Unlike a silent
    skip, a failed transform emits a ``RuntimeWarning`` so the caller knows
    that the column still holds raw values.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame to encode; column names are expected to be strings.
    encoders : dict
        Maps column name -> object exposing ``transform(values)``.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``input_df`` with underscores instead of spaces in
        the column names.
    """
    import warnings  # local import keeps this cell self-contained

    df = input_df.copy()
    for column, encoder in encoders.items():
        if column not in df.columns:
            # Nothing to encode for this frame; keep going with the rest.
            continue
        try:
            df[column] = encoder.transform(df[column])
        except Exception as exc:
            # Deliberately broad to stay best-effort, but no longer a bare
            # `except` (which would also swallow KeyboardInterrupt/SystemExit)
            # and no longer silent.
            warnings.warn(
                f"Could not encode column {column!r}; leaving it unchanged ({exc})",
                RuntimeWarning,
                stacklevel=2,
            )

    # Rename columns for consistency and remove spaces
    df.columns = [col.replace(' ', '_') for col in df.columns]

    return df

# Feature matrix: everything except the two targets, the id columns and the user's name.
X = userTasksFeature.drop(
    columns=['completion', 'like', 'interaction_id', 'taskid', 'userid', 'Name']
)
# Two-column target array, one row per interaction: [completion, like].
y = userTasksFeature[["completion", "like"]].to_numpy()


# Label-encode the categorical features and persist the fitted encoders so the
# inference code can apply the very same value -> code mapping later.
X, encoders = preprocess_for_decision_tree(X)
save_label_encoders(encoders, 'label_encoders.pkl')

In [57]:
# Hold out 20% of the interactions for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Build Model

In [58]:
# Inspect the encoded training features (categoricals are now integer codes).
X_train

Unnamed: 0,Age,Motivation,Overall_Health_Status,Activity,Type,Duration_(minutes),Intensity
1947,25,4,3,72,11,480,1
2064,54,5,1,43,6,60,1
834,45,2,0,46,1,60,2
4632,38,4,1,44,9,15,2
4826,41,5,0,39,12,60,2
...,...,...,...,...,...,...,...
905,19,3,3,43,6,60,1
5192,21,4,2,60,3,30,0
3980,20,3,0,15,3,45,2
235,55,0,3,85,5,60,2


In [59]:
# Inspect the training targets: two columns per row, [completion, like].
y_train

array([[1, 1],
       [1, 1],
       [0, 0],
       ...,
       [1, 1],
       [1, 1],
       [0, 1]])

In [65]:
import lightgbmmt as lgb
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score


# Number of targets learned jointly (completion and like). The custom metric
# and objective below use it to reshape the flat label/prediction arrays.
num_labels = 2

def self_metric(preds, train_data):
    """Custom evaluation metric: ROC AUC of the first label only.

    Both the predictions and the dataset labels arrive as flat arrays. They
    are viewed as ``num_labels`` rows and only the first row is scored. The
    raw scores are passed through a sigmoid before scoring; this is monotonic,
    so it does not change the AUC value.

    Parameters
    ----------
    preds : numpy.ndarray
        Flat array of raw predictions.
    train_data : object
        Dataset exposing ``get_label()`` that returns the flat label array.

    Returns
    -------
    tuple
        ``('self_metric', auc, True)``. The last element is the
        "higher is better" flag of the LightGBM ``feval`` contract. AUC
        improves as it grows, so the flag must be True; it was False before,
        which would invert early stopping / best-iteration selection.
        NOTE(review): assumes lightgbmmt keeps upstream LightGBM's ``feval``
        contract -- confirm against the fork.
    """
    first_label_truth = train_data.get_label().reshape((num_labels, -1))[0]
    first_label_raw = preds.reshape((num_labels, -1))[0]
    first_label_prob = 1. / (1. + np.exp(-first_label_raw))
    score = roc_auc_score(first_label_truth, first_label_prob)

    return 'self_metric', score, True


def mymse2(preds, train_data, ep = 0):
    """Custom multi-label squared-error objective.

    The flat prediction and label arrays are viewed as ``num_labels`` rows and
    transposed, giving one row per sample and one column per label. The
    residual ``prediction - label`` is the squared-error gradient.

    Parameters
    ----------
    preds : numpy.ndarray
        Flat array of current predictions.
    train_data : object
        Dataset exposing ``get_label()`` that returns the flat label array.
    ep : int, optional
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(grad, hess, grad2, hess2)``:
        ``grad``  - per-sample weighted sum of the residuals, with weights
                    1.5 for the first label and 0.001 for the second;
        ``hess``  - ``grad * 0. + 1`` (constant 1 with grad's shape);
        ``grad2`` - the unweighted residuals flattened label by label;
        ``hess2`` - ``grad2 * 0. + 1`` (constant 1 with grad2's shape).
    """
    targets = train_data.get_label().reshape((num_labels, -1)).T
    scores = preds.reshape((num_labels, -1)).T
    residuals = scores - targets

    # Combined gradient: the first label dominates, the second barely contributes.
    label_weights = np.array([1.5, 0.001])
    grad = np.sum(residuals * label_weights, axis = 1)

    # Per-label gradients, laid out as all samples of label 0, then label 1.
    grad2 = residuals.T.reshape((-1))

    hess = grad * 0. + 1
    hess2 = grad2 * 0. + 1
    return grad, hess, grad2, hess2

# Training parameters handed to lgb.train below.
param = {
    'num_leaves': 48, 
    'max_depth': 6,
    'learning_rate': .03,
    'max_bin': 200,
    'lambda_l1': 0.1,
    'lambda_l2': 0.2,
    # High verbosity: the training run prints per-tree [Debug] lines.
    'verbose': 5,

    # multitask
    # 'custom' because the loss is supplied through `fobj` at train time.
    'objective': 'custom',
    'num_labels': num_labels, 
    # NOTE(review): 'serial2' is presumably lightgbmmt's multi-task tree
    # learner -- confirm against the fork's documentation.
    'tree_learner': 'serial2',
    'num_threads': 4}    


# Wrap the splits as datasets; each label array carries both targets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Passed to lgb.train as `evals_result`; presumably filled with the metric
# history of the validation set during training.
evals_result_mt = {}

model = lgb.train(
    param,
    train_data,
    num_boost_round=200,
    valid_sets=[test_data],
    fobj=mymse2,
    feval=self_metric,
    evals_result=evals_result_mt,
    verbose_eval=10,
)
model.set_num_labels(2)

[LightGBM] [Info] Length of label is not same with #data
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 4733, number of used features: 7
[LightGBM] [Info] Length of label is not same with #data
[LightGBM] [Debug] Trained a tree with leaves = 46 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 46 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 39 and max_depth = 6
[Li

In [62]:
# Score the held-out features; the displayed result has one column per label.
model.predict(X_test)

inner_predict 2368


array([[0.39238771, 0.395005  ],
       [0.36714799, 0.41915397],
       [0.45920607, 0.37625475],
       ...,
       [0.28491869, 0.40050745],
       [0.54406954, 0.44521144],
       [0.38855929, 0.37024036]])

## Getting Predictions for a New User

In [71]:
def create_prediction_for_user(user_df):
    """Score every task for the given user profile(s) and rank the results.

    Each row of ``user_df`` is paired with every row of the module-level
    ``tasks`` table, the pairs are encoded with the encoders stored in
    'label_encoders.pkl', and the module-level ``model`` scores them.

    Parameters
    ----------
    user_df : pandas.DataFrame
        User profile rows. The 'Age', 'Motivation' and 'Overall Health Status'
        columns are needed to build the model features.

    Returns
    -------
    pandas.DataFrame
        The un-encoded user/task pairs with the prediction columns appended
        (labelled by the integers 0 and 1), sorted by column ``1`` in
        descending order.
    """
    feature_columns = ["Age", "Motivation", "Overall_Health_Status",
                       "Activity", "Type", "Duration_(minutes)", "Intensity"]

    fitted_encoders = load_label_encoders('label_encoders.pkl')

    # Cross join: one candidate row per (user, task) combination.
    candidates = pd.merge(user_df, tasks, how='cross')

    # Build the inference features: encode, keep only the model's feature
    # columns in a fixed order, and replace missing values with 0.
    features = preprocess_new_data(candidates, fitted_encoders)
    features = features[feature_columns].fillna(0)

    scores = pd.DataFrame(model.predict(features))

    # Both frames carry a default 0..n-1 index, so the column-wise concat
    # lines each score row up with its candidate row.
    ranked = pd.concat([candidates, scores], axis=1)
    return ranked.sort_values(by=1, ascending=False)

In [72]:
# Example: rank all tasks for a user described only by profile fields.
data = {'Name': ['Nate Lee'], 'Age': [35],
        'Motivation': ['Improve Sleep'], 'Overall Health Status': ['Poor']}

nate_df = pd.DataFrame(data=data)
nate_prediction = create_prediction_for_user(user_df=nate_df)
nate_prediction.head(n=20)

inner_predict 196


Unnamed: 0,Name,Age,Motivation,Overall Health Status,taskid,Activity,Type,Duration (minutes),Intensity,0,1
71,Nate Lee,35,Improve Sleep,Poor,71,Office workout for 15 minutes,Physical,15,Medium,0.412669,0.631158
75,Nate Lee,35,Improve Sleep,Poor,75,Table tennis for 1 hour,Sports,60,Medium,0.359353,0.577442
31,Nate Lee,35,Improve Sleep,Poor,31,Aerobics for 1 hour,Cardio,60,Medium,0.591764,0.543143
86,Nate Lee,35,Improve Sleep,Poor,86,Laser shooting for 1 hour,Skill,60,Medium,0.440499,0.539979
23,Nate Lee,35,Improve Sleep,Poor,23,Snowboarding for 2 hours,Outdoor,120,Medium,0.546289,0.539275
36,Nate Lee,35,Improve Sleep,Poor,36,Playing with children for 1 hour,Social,60,Medium,0.426069,0.538508
28,Nate Lee,35,Improve Sleep,Poor,28,Playing volleyball for 1 hour,Sports,60,Medium,0.431905,0.538235
55,Nate Lee,35,Improve Sleep,Poor,55,Playing baseball for 1 hour,Sports,60,Medium,0.408241,0.537604
24,Nate Lee,35,Improve Sleep,Poor,24,Skiing for 2 hours,Outdoor,120,Medium,0.562334,0.535759
49,Nate Lee,35,Improve Sleep,Poor,49,Walking uphill for 30 minutes,Cardio,30,High,0.685491,0.535478


In [73]:
# Example: same ranking for a second profile with a different motivation and health status.
data = {'Name': ['Akshay Anand'], 'Age': [40],
        'Motivation': ['Increase Energy'], 'Overall Health Status': ['Excellent']}

akshay_df = pd.DataFrame(data=data)
akshay_prediction = create_prediction_for_user(user_df=akshay_df)
akshay_prediction.head(n=20)

inner_predict 196


Unnamed: 0,Name,Age,Motivation,Overall Health Status,taskid,Activity,Type,Duration (minutes),Intensity,0,1
71,Akshay Anand,40,Increase Energy,Excellent,71,Office workout for 15 minutes,Physical,15,Medium,0.556217,0.684249
4,Akshay Anand,40,Increase Energy,Excellent,4,Cycling for 45 minutes,Cardio,45,Medium,0.817026,0.662797
31,Akshay Anand,40,Increase Energy,Excellent,31,Aerobics for 1 hour,Cardio,60,Medium,0.834762,0.643583
52,Akshay Anand,40,Increase Energy,Excellent,52,Elliptical trainer for 30 minutes,Cardio,30,Medium,0.765199,0.638229
8,Akshay Anand,40,Increase Energy,Excellent,8,Dancing for 30 minutes,Cardio,30,Medium,0.767885,0.629755
39,Akshay Anand,40,Increase Energy,Excellent,39,Biking to work for 30 minutes,Cardio,30,Medium,0.768505,0.629294
2,Akshay Anand,40,Increase Energy,Excellent,2,Jogging for 30 minutes,Cardio,30,Medium,0.714301,0.620527
51,Akshay Anand,40,Increase Energy,Excellent,51,Jogging on a treadmill for 30 minutes,Cardio,30,Medium,0.714301,0.620527
75,Akshay Anand,40,Increase Energy,Excellent,75,Table tennis for 1 hour,Sports,60,Medium,0.472253,0.610472
24,Akshay Anand,40,Increase Energy,Excellent,24,Skiing for 2 hours,Outdoor,120,Medium,0.746549,0.608424
