In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline


In [2]:
# Directory prefix (with trailing slash) prepended to every CSV file name below
base_dir = 'data_new/'

## **Features**

In [3]:
# Read the user table from the data directory and preview its first rows
users = pd.read_csv(f"{base_dir}users_new.csv")
users.head(n=5)

Unnamed: 0,userid,Name,Age,Motivation,Overall Health Status
0,0,Liu Li,49,Reduce Stress,Poor
1,1,Samantha Jones,50,Build Muscle,Fair
2,2,Siti binti Zainal,18,Increase Energy,Fair
3,3,Michael Smith,55,Build Muscle,Poor
4,4,Tan Ah Kow,46,Build Muscle,Fair


In [4]:
# Read the task table from the data directory and preview its first rows
tasks = pd.read_csv(f"{base_dir}tasks_new.csv")
tasks.head(n=5)

Unnamed: 0,taskid,Activity,Type,Duration (minutes),Intensity
0,0,Walking 1000 steps,Physical,10,Low
1,1,Sleep 8 hours,Rest,480,Low
2,2,Jogging for 30 minutes,Cardio,30,Medium
3,3,Yoga for 1 hour,Flexibility,60,Low
4,4,Cycling for 45 minutes,Cardio,45,Medium


In [5]:
# Read the user/task interaction table (one row per interaction, with a
# `completion` flag) and display it
userTasks = pd.read_csv(f"{base_dir}usertasks_new.csv")
userTasks

Unnamed: 0,interaction_id,taskid,userid,completion
0,0,0,0,1
1,1,1,0,1
2,2,2,0,0
3,3,6,0,0
4,4,9,0,1
...,...,...,...,...
5919,5919,93,99,0
5920,5920,94,99,1
5921,5921,95,99,0
5922,5922,96,99,1


### Join users and tasks

In [6]:
# Attach the user attributes (via `userid`) and the task attributes (via
# `taskid`) to every interaction row; left joins keep every interaction.
userTasksFeature = (
    userTasks
    .merge(users, on=["userid"], how='left')
    .merge(tasks, on=["taskid"], how='left')
)

userTasksFeature

Unnamed: 0,interaction_id,taskid,userid,completion,Name,Age,Motivation,Overall Health Status,Activity,Type,Duration (minutes),Intensity
0,0,0,0,1,Liu Li,49,Reduce Stress,Poor,Walking 1000 steps,Physical,10,Low
1,1,1,0,1,Liu Li,49,Reduce Stress,Poor,Sleep 8 hours,Rest,480,Low
2,2,2,0,0,Liu Li,49,Reduce Stress,Poor,Jogging for 30 minutes,Cardio,30,Medium
3,3,6,0,0,Liu Li,49,Reduce Stress,Poor,Meditation for 20 minutes,Mental,20,Low
4,4,9,0,1,Liu Li,49,Reduce Stress,Poor,Hiking for 2 hours,Outdoor,120,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...
5919,5919,93,99,0,Gan Wei Ling,20,Improve Sleep,Good,Shuffleboard for 1 hour,Leisure,60,Low
5920,5920,94,99,1,Gan Wei Ling,20,Improve Sleep,Good,Trampolining for 1 hour,Fun,60,Medium
5921,5921,95,99,0,Gan Wei Ling,20,Improve Sleep,Good,Parkour for 1 hour,Outdoor,60,High
5922,5922,96,99,1,Gan Wei Ling,20,Improve Sleep,Good,Slacklining for 1 hour,Balance,60,Medium


## Split Data

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Preprocessing function for training data
def preprocess_for_decision_tree(df):
    """Label-encode the categorical feature columns of a training frame.

    A fresh ``LabelEncoder`` is fitted on each of the columns
    ``Motivation``, ``Overall Health Status``, ``Activity``, ``Type`` and
    ``Intensity``; the column is replaced by its integer codes. Afterwards
    every space in a column name is replaced by an underscore.

    The input frame is left untouched: all changes are applied to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing (at least) the five categorical columns above.

    Returns
    -------
    tuple
        ``(encoded_df, encoders)`` where ``encoded_df`` is the transformed
        copy and ``encoders`` maps each original column name to its fitted
        ``LabelEncoder``.

    Raises
    ------
    KeyError
        If one of the categorical columns is missing from ``df``.
    """
    categorical_columns = [
        'Motivation',
        'Overall Health Status',
        'Activity',
        'Type',
        'Intensity',
    ]

    # Work on a copy so the caller's DataFrame is not silently overwritten
    # (the original implementation encoded and renamed columns in place).
    df = df.copy()

    # One independent encoder per categorical column
    encoders = {column: LabelEncoder() for column in categorical_columns}

    # Fit each encoder on its column and replace the labels by integer codes
    for column, encoder in encoders.items():
        df[column] = encoder.fit_transform(df[column])

    # Rename columns for consistency and remove spaces
    df.columns = [col.replace(' ', '_') for col in df.columns]

    return df, encoders

# Function to save label encoders
def save_label_encoders(encoders, file_path):
    """Serialize ``encoders`` with pickle and write the result to ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(file_path, mode='wb') as handle:
        pickle.dump(encoders, handle)

# Function to load label encoders
def load_label_encoders(file_path):
    """Read ``file_path`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source (e.g. ``save_label_encoders``).
    """
    with open(file_path, mode='rb') as handle:
        return pickle.load(handle)

# Function to preprocess new data using loaded encoders
def preprocess_new_data(input_df, encoders):
    """Encode the categorical columns of new data with already-fitted encoders.

    Each column named in ``encoders`` is replaced by integer codes, where a
    label's code is its position in the encoder's ``classes_`` (this is the
    code scikit-learn's ``LabelEncoder.transform`` assigns). Values that the
    encoder has never seen, including missing values, become ``NaN`` and a
    warning is emitted, so that the caller can decide how to impute them.
    Columns listed in ``encoders`` but absent from ``input_df`` are skipped.
    Finally every space in a column name is replaced by an underscore.

    The previous implementation wrapped ``encoder.transform`` in a bare
    ``except: pass``: a single unseen label left the whole column as raw
    strings and any other error was hidden as well.

    Parameters
    ----------
    input_df : pandas.DataFrame
        New rows to encode. It is not modified; a copy is returned.
    encoders : dict
        Maps column name -> fitted encoder exposing ``classes_``.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``input_df`` with underscore-separated column names.
    """
    import warnings

    df = input_df.copy()
    for column, encoder in encoders.items():
        if column not in df.columns:
            # Best effort: nothing to encode for this encoder
            continue

        # Position in classes_ is the integer code of the label
        label_to_code = {label: code for code, label in enumerate(encoder.classes_)}
        encoded = df[column].map(label_to_code)

        unknown_count = int(encoded.isna().sum())
        if unknown_count:
            warnings.warn(
                f"Column '{column}': {unknown_count} value(s) are missing or "
                "were not seen when the encoder was fitted; encoded as NaN."
            )

        df[column] = encoded

    # Rename columns for consistency and remove spaces
    df.columns = [col.replace(' ', '_') for col in df.columns]

    return df

# Features: everything except the target, the identifier columns and the
# user's name. Target: the `completion` flag.
X = userTasksFeature.drop(
    columns=['completion', 'interaction_id', 'taskid', 'userid', 'Name']
)
y = userTasksFeature["completion"]

# Encode the categorical features and persist the fitted encoders so the
# same encoding can be re-applied at inference time.
X, encoders = preprocess_for_decision_tree(X)
save_label_encoders(encoders, 'label_encoders.pkl')

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Build Model

In [9]:
# Gradient-boosted classifier with library defaults and a fixed seed,
# fitted on the training split
model = GradientBoostingClassifier(random_state=1)
model.fit(X=X_train, y=y_train)

## Scoring

In [10]:
# Mean accuracy of the fitted model on each split
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print("Accuracy on training set : ", train_accuracy)
print("Accuracy on test set : ", test_accuracy)

Accuracy on training set :  0.7524794260392488
Accuracy on test set :  0.7341772151898734


In [11]:
# Share of each `completion` class (proportions, not raw counts)
y.value_counts(normalize=True)

completion
0    0.618839
1    0.381161
Name: proportion, dtype: float64

## Confusion Matrix

In [12]:
# Inference: predicted completion class for every row of the held-out split
y_test_pred = model.predict(X=X_test)

In [13]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[633,  97],
       [218, 237]])

## Getting Predictions for a New User

In [14]:

def create_prediction_for_user(user_df):
    """Score every task for the given user(s) and rank the tasks.

    The user frame is cross-joined with the notebook-level ``tasks`` table,
    the categorical columns are encoded with the encoders stored in
    ``label_encoders.pkl``, remaining missing values are replaced by 0, and
    the notebook-level ``model`` produces class probabilities.

    Parameters
    ----------
    user_df : pandas.DataFrame
        User attributes; must provide the columns ``Age``, ``Motivation``
        and ``Overall Health Status`` used as model features.

    Returns
    -------
    pandas.DataFrame
        The un-encoded user/task rows with the ``predict_proba`` output
        appended as integer-named columns, sorted by column ``1`` in
        descending order (presumably the probability of ``completion == 1``;
        this follows the model's class order).
    """
    feature_columns = [
        "Age", "Motivation", "Overall_Health_Status",
        "Activity", "Type", "Duration_(minutes)", "Intensity",
    ]

    encoders = load_label_encoders('label_encoders.pkl')

    # Pair the user with every task (cross join)
    candidates = user_df.merge(tasks, how='cross')

    # Build the inference feature matrix: encode, select the feature
    # columns, then replace missing values by 0
    encoded = preprocess_new_data(candidates, encoders)
    features = encoded[feature_columns].fillna(0)

    # One probability column per class, aligned to `candidates` by row position
    probabilities = pd.DataFrame(model.predict_proba(features))
    ranked = pd.concat([candidates, probabilities], axis=1)

    return ranked.sort_values(by=1, ascending=False)



In [15]:
# Example user with a complete profile
data = {
    "Name": ["Nate Lee"],
    "Age": [35],
    "Motivation": ["Improve Sleep"],
    "Overall Health Status": ["Poor"],
}

nate_df = pd.DataFrame(data=data)
nate_prediction = create_prediction_for_user(user_df=nate_df)
nate_prediction.head(n=20)

Unnamed: 0,Name,Age,Motivation,Overall Health Status,taskid,Activity,Type,Duration (minutes),Intensity,0,1
97,Nate Lee,35,Improve Sleep,Poor,97,Aerial yoga for 1 hour,Flexibility,60,Low,0.284516,0.715484
0,Nate Lee,35,Improve Sleep,Poor,0,Walking 1000 steps,Physical,10,Low,0.415617,0.584383
92,Nate Lee,35,Improve Sleep,Poor,92,Bocce ball for 1 hour,Leisure,60,Low,0.428135,0.571865
89,Nate Lee,35,Improve Sleep,Poor,89,Bowling for 1 hour,Leisure,60,Low,0.428135,0.571865
35,Nate Lee,35,Improve Sleep,Poor,35,Walking the dog for 30 minutes,Physical,30,Low,0.440379,0.559621
83,Nate Lee,35,Improve Sleep,Poor,83,Archery for 1 hour,Skill,60,Low,0.443844,0.556156
50,Nate Lee,35,Improve Sleep,Poor,50,Walking downhill for 30 minutes,Leisure,30,Low,0.45314,0.54686
40,Nate Lee,35,Improve Sleep,Poor,40,Walking in the park for 1 hour,Leisure,60,Low,0.456737,0.543263
1,Nate Lee,35,Improve Sleep,Poor,1,Sleep 8 hours,Rest,480,Low,0.459803,0.540197
69,Nate Lee,35,Improve Sleep,Poor,69,Stand-up desk for 8 hours,Leisure,480,Low,0.472651,0.527349


In [16]:
# Second example user with a complete profile
data = {
    "Name": ["Akshay Anand"],
    "Age": [40],
    "Motivation": ["Increase Energy"],
    "Overall Health Status": ["Excellent"],
}

akshay_df = pd.DataFrame(data=data)
akshay_prediction = create_prediction_for_user(user_df=akshay_df)
akshay_prediction.head(n=20)

Unnamed: 0,Name,Age,Motivation,Overall Health Status,taskid,Activity,Type,Duration (minutes),Intensity,0,1
29,Akshay Anand,40,Increase Energy,Excellent,29,Zumba for 1 hour,Cardio,60,High,0.187972,0.812028
49,Akshay Anand,40,Increase Energy,Excellent,49,Walking uphill for 30 minutes,Cardio,30,High,0.245874,0.754126
5,Akshay Anand,40,Increase Energy,Excellent,5,Swimming for 1 hour,Cardio,60,High,0.268145,0.731855
32,Akshay Anand,40,Increase Energy,Excellent,32,Spinning for 1 hour,Cardio,60,High,0.268145,0.731855
19,Akshay Anand,40,Increase Energy,Excellent,19,Boxing for 1 hour,Cardio,60,High,0.272693,0.727307
66,Akshay Anand,40,Increase Energy,Excellent,66,Bungee jumping for 1 hour,Air,60,High,0.274865,0.725135
37,Akshay Anand,40,Increase Energy,Excellent,37,Climbing stairs for 15 minutes,Cardio,15,High,0.275541,0.724459
12,Akshay Anand,40,Increase Energy,Excellent,12,Rowing for 30 minutes,Cardio,30,High,0.276033,0.723967
57,Akshay Anand,40,Increase Energy,Excellent,57,Rowing machine for 30 minutes,Cardio,30,High,0.276033,0.723967
38,Akshay Anand,40,Increase Energy,Excellent,38,Jumping jacks for 10 minutes,Cardio,10,High,0.281331,0.718669


In [17]:
# Illustration of cold start: a user whose age and health status are unknown
data = {
    "Name": ["Zhenxuan"],
    "Age": [None],
    "Motivation": ["Build Muscle"],
    "Overall Health Status": [None],
}

zhenxuan_df = pd.DataFrame(data=data)
zhenxuan_prediction = create_prediction_for_user(user_df=zhenxuan_df)
zhenxuan_prediction.head(n=20)

Unnamed: 0,Name,Age,Motivation,Overall Health Status,taskid,Activity,Type,Duration (minutes),Intensity,0,1
30,Zhenxuan,,Build Muscle,,30,CrossFit for 1 hour,Strength,60,High,0.125253,0.874747
77,Zhenxuan,,Build Muscle,,77,Dodgeball for 1 hour,Sports,60,High,0.190195,0.809805
82,Zhenxuan,,Build Muscle,,82,Fencing for 1 hour,Sports,60,High,0.190195,0.809805
14,Zhenxuan,,Build Muscle,,14,Rock climbing for 1 hour,Strength,60,High,0.239781,0.760219
42,Zhenxuan,,Build Muscle,,42,Martial arts for 1 hour,Strength,60,High,0.268028,0.731972
19,Zhenxuan,,Build Muscle,,19,Boxing for 1 hour,Cardio,60,High,0.26935,0.73065
81,Zhenxuan,,Build Muscle,,81,Judo for 1 hour,Strength,60,High,0.272117,0.727883
79,Zhenxuan,,Build Muscle,,79,Krav Maga for 1 hour,Strength,60,High,0.272117,0.727883
37,Zhenxuan,,Build Muscle,,37,Climbing stairs for 15 minutes,Cardio,15,High,0.274229,0.725771
7,Zhenxuan,,Build Muscle,,7,Strength training for 1 hour,Strength,60,High,0.279522,0.720478
