In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Directory (relative to the notebook's working directory) that holds the
# input CSV files read by the load cells.
base_dir = "data_new/"

## **Features**

In [3]:
# load users data
# os.path.join only inserts a separator when one is needed, so the path stays
# valid whether or not base_dir ends with a slash.
users = pd.read_csv(os.path.join(base_dir, "users_new.csv"))
users.head()

Unnamed: 0,userid,Name,Age,Motivation,Overall Health Status
0,0,Gan Wei Ling,50,Weight Management,Good
1,1,Karthik Subramaniam,31,Weight Management,Poor
2,2,Christopher de Souza,31,Weight Management,Good
3,3,Lim Mei Ling,39,Social Interaction,Excellent
4,4,Ananda Krishna,54,Social Interaction,Fair


In [4]:
# load task data
# os.path.join only inserts a separator when one is needed, so the path stays
# valid whether or not base_dir ends with a slash.
tasks = pd.read_csv(os.path.join(base_dir, "tasks_new.csv"))
tasks.head()

Unnamed: 0,taskid,Activity,Type,Duration (minutes),Intensity
0,0,Walking 1000 steps,Physical,10,Low
1,1,Sleep 8 hours,Rest,480,Low
2,2,Jogging for 30 minutes,Cardio,30,Medium
3,3,Yoga for 1 hour,Flexibility,60,Low
4,4,Cycling for 45 minutes,Cardio,45,Medium


In [5]:
# load user_task data (one row per user/task interaction with its completion flag)
# os.path.join only inserts a separator when one is needed, so the path stays
# valid whether or not base_dir ends with a slash.
userTasks = pd.read_csv(os.path.join(base_dir, "usertasks_new.csv"))
userTasks

Unnamed: 0,interaction_id,taskid,userid,completion
0,0,0,0,0
1,1,1,0,0
2,2,2,0,1
3,3,3,0,1
4,4,4,0,0
...,...,...,...,...
7786,7786,91,99,1
7787,7787,93,99,1
7788,7788,94,99,0
7789,7789,95,99,0


### Join users and tasks

In [6]:
# Enrich every interaction row with the user's attributes (via userid) and
# then the task's attributes (via taskid). Left joins keep every row of
# userTasks even when a key has no match on the right-hand side.
userTasksFeature = (
    userTasks
    .merge(users, on=["userid"], how='left')
    .merge(tasks, on=["taskid"], how='left')
)

userTasksFeature

Unnamed: 0,interaction_id,taskid,userid,completion,Name,Age,Motivation,Overall Health Status,Activity,Type,Duration (minutes),Intensity
0,0,0,0,0,Gan Wei Ling,50,Weight Management,Good,Walking 1000 steps,Physical,10,Low
1,1,1,0,0,Gan Wei Ling,50,Weight Management,Good,Sleep 8 hours,Rest,480,Low
2,2,2,0,1,Gan Wei Ling,50,Weight Management,Good,Jogging for 30 minutes,Cardio,30,Medium
3,3,3,0,1,Gan Wei Ling,50,Weight Management,Good,Yoga for 1 hour,Flexibility,60,Low
4,4,4,0,0,Gan Wei Ling,50,Weight Management,Good,Cycling for 45 minutes,Cardio,45,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...
7786,7786,91,99,1,Sandra Jones,35,Improve Sleep,Poor,Croquet for 1 hour,Leisure,60,Low
7787,7787,93,99,1,Sandra Jones,35,Improve Sleep,Poor,Shuffleboard for 1 hour,Leisure,60,Low
7788,7788,94,99,0,Sandra Jones,35,Improve Sleep,Poor,Trampolining for 1 hour,Fun,60,Medium
7789,7789,95,99,0,Sandra Jones,35,Improve Sleep,Poor,Parkour for 1 hour,Outdoor,60,High


### Data Engineering - Update to categorical

userTasksFeature

## Split Data

In [7]:
# Inspect the column names of the merged frame before selecting features.
userTasksFeature.columns

Index(['interaction_id', 'taskid', 'userid', 'completion', 'Name', 'Age',
       'Motivation', 'Overall Health Status', 'Activity', 'Type',
       'Duration (minutes)', 'Intensity'],
      dtype='object')

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Preprocessing function for training data
def preprocess_for_decision_tree(df):
    """Label-encode the categorical feature columns of a training frame.

    A fresh ``LabelEncoder`` is fitted on each of the columns 'Motivation',
    'Overall Health Status', 'Activity', 'Type' and 'Intensity', and the
    column is replaced by the encoder's integer codes. Afterwards every space
    in a column name is replaced by an underscore.

    The input frame is left untouched: all work happens on a copy, so the
    caller keeps the raw string values and original column names and the
    calling cell can be re-run safely.

    Args:
        df: DataFrame containing at least the five categorical columns above.

    Returns:
        tuple: ``(encoded_df, encoders)`` where ``encoded_df`` is the encoded,
        renamed copy and ``encoders`` maps each ORIGINAL column name (with
        spaces) to its fitted ``LabelEncoder``.

    Raises:
        KeyError: if one of the categorical columns is missing from ``df``.
    """
    # Copy first: the original implementation overwrote the caller's columns
    # and column names in place.
    encoded_df = df.copy()

    # Define label encoders for categorical variables
    encoders = {
        'Motivation': LabelEncoder(),
        'Overall Health Status': LabelEncoder(),
        'Activity': LabelEncoder(),
        'Type': LabelEncoder(),
        'Intensity': LabelEncoder()
    }

    # Encode each categorical column (must happen before the rename below,
    # because the encoder keys use the original, space-containing names)
    for column, encoder in encoders.items():
        encoded_df[column] = encoder.fit_transform(encoded_df[column])

    # Rename columns for consistency and remove spaces
    encoded_df.columns = [col.replace(' ', '_') for col in encoded_df.columns]

    return encoded_df, encoders

# Function to save label encoders
def save_label_encoders(encoders, file_path):
    """Persist the encoders object to disk with pickle.

    Args:
        encoders: Object to serialize (here: dict of column name -> encoder).
        file_path: Destination path; an existing file is overwritten because
            the file is opened in binary write mode.
    """
    with open(file_path, 'wb') as sink:
        pickle.dump(encoders, sink)

# Function to load label encoders
def load_label_encoders(file_path):
    """Read back an object previously written with pickle.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object (here: dict of column name -> encoder).
    """
    # NOTE(review): pickle.load can execute arbitrary code while
    # deserializing; only point this at files produced by this notebook.
    with open(file_path, 'rb') as source:
        return pickle.load(source)

# Function to preprocess new data using loaded encoders
def preprocess_new_data(input_df, encoders):
    """Apply already-fitted encoders to new data and normalize column names.

    Args:
        input_df: DataFrame containing every column named in ``encoders``.
            It is copied first, so the caller's frame is not modified.
        encoders: Mapping of column name -> object with a ``transform``
            method; each named column is replaced by the transformed values.

    Returns:
        A new DataFrame with the encoded columns and with spaces in all
        column names replaced by underscores.
    """
    encoded = input_df.copy()

    # Encode while the columns still carry their original names; the
    # encoder keys refer to those names.
    for name in encoders:
        encoded[name] = encoders[name].transform(encoded[name])

    # Rename columns for consistency and remove spaces
    encoded.columns = [label.replace(' ', '_') for label in encoded.columns]
    return encoded

# Features: everything except the target, the identifier columns and the
# user's name. Target: the completion flag.
X = userTasksFeature.drop(
    columns=['completion', 'interaction_id', 'taskid', 'userid', 'Name']
)
y = userTasksFeature["completion"]

# Encode the categorical features and persist the fitted encoders so the
# same encoding can be applied at inference time.
X, encoders = preprocess_for_decision_tree(X)
save_label_encoders(encoders, 'label_encoders.pkl')

In [9]:

# Example usage with new data (uncomment the lines below to run).
# The frame must contain every column the encoders were fitted on, and only
# labels that appear in the training data.
# new_data = {
#     'Age': [35, 28],
#     'Motivation': ['Social Interaction', 'Improve Sleep'],
#     'Overall Health Status': ['Good', 'Poor'],
#     'Activity': ['Jogging for 30 minutes', 'Yoga for 1 hour'],
#     'Type': ['Cardio', 'Flexibility'],
#     'Duration (minutes)': [30, 60],
#     'Intensity': ['Medium', 'Low']
# }

# df_new = pd.DataFrame(new_data)
# loaded_encoders = load_label_encoders('label_encoders.pkl')
# processed_new_df = preprocess_new_data(df_new, loaded_encoders)
# print(processed_new_df)

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Build Model

In [11]:
# Gradient-boosted classifier with library-default hyperparameters; only the
# random seed is pinned so repeated runs give the same model.
model = GradientBoostingClassifier(random_state=1)
model.fit(X_train, y_train)

## Scoring

In [12]:
# Mean accuracy on the rows the model was fitted on versus the held-out rows.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print("Accuracy on training set : ", train_accuracy)
print("Accuracy on test set : ", test_accuracy)

Accuracy on training set :  0.7540115532734275
Accuracy on test set :  0.7607440667094291


In [13]:
# Class balance of the target: share of rows per completion value
# (proportions rather than raw counts).
y.value_counts(normalize=True)

completion
0    0.613015
1    0.386985
Name: proportion, dtype: float64

## Confusion Matrix

In [14]:
# Inference

# Hard class predictions for the held-out rows, used for the confusion
# matrix in the next cell.
y_test_pred = model.predict(X_test)

In [15]:
from sklearn.metrics import confusion_matrix

# Per scikit-learn's documented convention, rows are the true classes and
# columns the predicted classes.
confusion_matrix(y_test, y_test_pred)

array([[829, 139],
       [234, 357]])

## Getting Predictions for a New User

In [16]:

def create_prediction_for_user(user_df):
    """Score every task for the given user(s) and rank by predicted probability.

    Relies on notebook globals: ``tasks`` (task catalogue), ``model`` (fitted
    classifier) and the encoders saved to 'label_encoders.pkl'.

    Args:
        user_df: DataFrame with the user columns 'Age', 'Motivation' and
            'Overall Health Status' (extra columns such as 'Name' are carried
            through to the output untouched).

    Returns:
        DataFrame with one row per (user, task) pair: the un-encoded user and
        task columns plus the ``predict_proba`` output in columns ``0`` and
        ``1``, sorted by column ``1`` in descending order. Column ``1`` is
        presumably the probability of completion == 1 — verify against
        ``model.classes_``.
    """
    encoders_from_disk = load_label_encoders('label_encoders.pkl')

    # Cross join: pair each user row with every task in the catalogue.
    candidates = user_df.merge(tasks, how='cross')

    # Build the inference features with the training-time encoders, then keep
    # exactly the model's feature columns in the order used for fitting.
    feature_columns = ["Age", "Motivation", "Overall_Health_Status",
                       "Activity", "Type", "Duration_(minutes)", "Intensity"]
    features = preprocess_new_data(candidates, encoders_from_disk)[feature_columns]

    probabilities = pd.DataFrame(model.predict_proba(features))

    # Both frames carry a default 0..n-1 index, so a column-wise concat
    # lines the probabilities up with their (user, task) rows.
    ranked = pd.concat([candidates, probabilities], axis=1)
    return ranked.sort_values(by=1, ascending=False)

In [17]:
# Example user profile. The keys must match the user columns used for
# training; 'Motivation' and 'Overall Health Status' are passed through the
# saved encoders, so their values presumably need to be labels seen during
# training.
data = {
    'Name': ['Nate Lee'],
    'Age': [35],
    'Motivation': ['Improve Sleep'],
    'Overall Health Status': ['Poor']
}

# Rank all tasks for this user by predicted completion probability.
nate_df = pd.DataFrame(data)
nate_prediction = create_prediction_for_user(nate_df)
nate_prediction

Unnamed: 0,Name,Age,Motivation,Overall Health Status,taskid,Activity,Type,Duration (minutes),Intensity,0,1
1,Nate Lee,35,Improve Sleep,Poor,1,Sleep 8 hours,Rest,480,Low,0.282166,0.717834
3,Nate Lee,35,Improve Sleep,Poor,3,Yoga for 1 hour,Flexibility,60,Low,0.326526,0.673474
41,Nate Lee,35,Improve Sleep,Poor,41,Stretching for 30 minutes,Flexibility,30,Low,0.342058,0.657942
21,Nate Lee,35,Improve Sleep,Poor,21,Tai Chi for 1 hour,Flexibility,60,Low,0.345588,0.654412
6,Nate Lee,35,Improve Sleep,Poor,6,Meditation for 20 minutes,Mental,20,Low,0.347784,0.652216
...,...,...,...,...,...,...,...,...,...,...,...
45,Nate Lee,35,Improve Sleep,Poor,45,Ice skating for 1 hour,Outdoor,60,Medium,0.790208,0.209792
96,Nate Lee,35,Improve Sleep,Poor,96,Slacklining for 1 hour,Balance,60,Medium,0.790825,0.209175
27,Nate Lee,35,Improve Sleep,Poor,27,Canoeing for 1 hour,Water,60,Medium,0.796569,0.203431
55,Nate Lee,35,Improve Sleep,Poor,55,Playing baseball for 1 hour,Sports,60,Medium,0.798917,0.201083


In [19]:
# Second example user profile (note: this rebinds the notebook-level name
# `data` used by the previous example).
data = {
    'Name': ['Akshay Anand'],
    'Age': [40],
    'Motivation': ['Increase Energy'],
    'Overall Health Status': ['Excellent']
}

# Rank all tasks for this user by predicted completion probability.
akshay_df = pd.DataFrame(data)
akshay_prediction = create_prediction_for_user(akshay_df)
akshay_prediction

Unnamed: 0,Name,Age,Motivation,Overall Health Status,taskid,Activity,Type,Duration (minutes),Intensity,0,1
49,Akshay Anand,40,Increase Energy,Excellent,49,Walking uphill for 30 minutes,Cardio,30,High,0.139618,0.860382
15,Akshay Anand,40,Increase Energy,Excellent,15,Running 5 kilometers,Cardio,30,High,0.140604,0.859396
12,Akshay Anand,40,Increase Energy,Excellent,12,Rowing for 30 minutes,Cardio,30,High,0.142671,0.857329
57,Akshay Anand,40,Increase Energy,Excellent,57,Rowing machine for 30 minutes,Cardio,30,High,0.142671,0.857329
20,Akshay Anand,40,Increase Energy,Excellent,20,Skipping rope for 15 minutes,Cardio,15,High,0.144100,0.855900
...,...,...,...,...,...,...,...,...,...,...,...
0,Akshay Anand,40,Increase Energy,Excellent,0,Walking 1000 steps,Physical,10,Low,0.795428,0.204572
61,Akshay Anand,40,Increase Energy,Excellent,61,Snorkeling for 1 hour,Water,60,Low,0.796430,0.203570
70,Akshay Anand,40,Increase Energy,Excellent,70,Walking meeting for 1 hour,Social,60,Low,0.803724,0.196276
83,Akshay Anand,40,Increase Energy,Excellent,83,Archery for 1 hour,Skill,60,Low,0.809026,0.190974
