In [1]:
import mediapipe as mp
import cv2
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# Drawing helpers
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

### 1. Train Model

#### 1.1. Describe data and split dataset

In [2]:
def rescale_frame(frame, percent=50):
    '''
    Rescale a frame to a certain percentage compare to its original frame
    '''
    width = int(frame.shape[1] * percent/ 100)
    height = int(frame.shape[0] * percent/ 100)
    dim = (width, height)
    return cv2.resize(frame, dim, interpolation = cv2.INTER_AREA)


def describe_dataset(dataset_path: str):
    '''
    Describe dataset
    '''

    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data['label'].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")
    
    duplicate = data[data.duplicated()]
    print(f"Duplicate Rows : {len(duplicate.sum(axis=1))}")

    return data


def round_up_metric_results(results) -> list:
    '''Round up metrics results such as precision score, recall score, ...'''
    return list(map(lambda el: round(el, 3), results))

In [3]:
# df = describe_dataset("./train.csv")
df = describe_dataset("train_augmented.csv")
# df.loc[df["label"] == "c", "label"] = 0
# df.loc[df["label"] == "i", "label"] = 1
# df.loc[df["label"] == "H", "label"] = 2
# df.loc[df["label"] == "L", "label"] = 3

label_map = {
    'c': 0,
    'bent_front_knee': 1,
    'arms_dropped': 2,
    'arms_bent': 3,
    'hips_rotated': 4,
    'back_foot_wrong_angle': 5,
    'leaning_forward': 6,
    'narrow_stance': 7,
    'shoulders_not_aligned': 8
}

df["label"] = df["label"].map(label_map)
reverse_map = {v: k for k, v in label_map.items()}



df.head(3)

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_eye_x', 'left_eye_y', 'left_eye_z', 'left_eye_v', 'right_eye_x', 'right_eye_y', 'right_eye_z', 'right_eye_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'left_elbow_v', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z', 'right_elbow_v', 'left_index_x', 'left_index_y', 'left_index_z', 'left_index_v', 'right_index_x', 'right_index_y', 'right_index_z', 'right_index_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_heel_x', 'left_heel_y', 'left_heel_z', 'left_heel_v', 'right_heel_x', 'right_heel_y', 'right_heel_z', 'right_heel_v', 'left_foot_index_x', 'left_foot_index_y',

Unnamed: 0,label,nose_x,nose_y,nose_z,nose_v,left_eye_x,left_eye_y,left_eye_z,left_eye_v,right_eye_x,...,right_heel_z,right_heel_v,left_foot_index_x,left_foot_index_y,left_foot_index_z,left_foot_index_v,right_foot_index_x,right_foot_index_y,right_foot_index_z,right_foot_index_v
0,1,0.557098,0.278786,-0.148189,0.999986,0.552989,0.260272,-0.129918,0.999962,0.547961,...,-0.027741,0.971855,0.706543,0.990551,-0.121357,0.991646,0.314367,0.991941,-0.130842,0.988684
1,6,0.572447,0.270126,-0.228826,0.999997,0.571285,0.253527,-0.180903,0.999995,0.55445,...,-0.190807,0.964416,0.683649,0.921047,0.035104,0.976244,0.186465,0.947115,-0.36657,0.994817
2,2,0.553213,0.312252,-0.057507,0.99991,0.54391,0.302287,-0.032332,0.999891,0.538842,...,0.086074,0.996908,0.094275,0.806154,0.245689,0.958206,0.833871,0.799268,0.013756,0.981646


In [4]:
# Extract features and class
X = df.drop("label", axis=1)
y = df["label"].astype("int")

In [5]:
sc = StandardScaler()
# X = pd.DataFrame(sc.fit_transform(X))
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
y_test.head(3)

3150    6
2464    5
305     0
Name: label, dtype: int32

#### 1.2. Train model using Scikit-Learn and train set evaluation

In [7]:
algorithms =[("LR", LogisticRegression()),
         ("SVC", SVC(probability=True)),
         ('KNN',KNeighborsClassifier()),
         ("DTC", DecisionTreeClassifier()),
         ("SGDC", CalibratedClassifierCV(SGDClassifier())),
         ("NB", GaussianNB()),
         ('RF', RandomForestClassifier()),]

models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate model
    model_results = model.predict(X_test)

    p_score = precision_score(y_test, model_results, average=None, labels=[0, 1, 2,3,4,5,6,7,8])
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average=None, labels=[0, 1, 2,3,4,5,6,7,8])
    f1_score_result = f1_score(y_test, model_results, average=None, labels=[0, 1, 2,3,4,5,6,7,8])
    cm = confusion_matrix(y_test, model_results, labels=[0, 1, 2,3,4,5,6,7,8])
    final_results.append(( name,  round_up_metric_results(p_score), a_score, round_up_metric_results(r_score), round_up_metric_results(f1_score_result), cm))


In [8]:
# Sort results by F1 score
final_results.sort(key=lambda k: sum(k[4]), reverse=True)

pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,LR,"[0.902, 0.987, 1.0, 0.974, 1.0, 1.0, 1.0, 1.0,...",0.984127,"[1.0, 0.975, 0.976, 0.962, 1.0, 0.988, 0.977, ...","[0.949, 0.981, 0.988, 0.968, 1.0, 0.994, 0.988...","[[83, 0, 0, 0, 0, 0, 0, 0, 0], [2, 78, 0, 0, 0..."
1,SGDC,"[0.93, 1.0, 0.976, 0.987, 1.0, 0.988, 0.966, 0...",0.981481,"[0.964, 1.0, 0.953, 0.975, 1.0, 0.977, 1.0, 0....","[0.947, 1.0, 0.964, 0.981, 1.0, 0.982, 0.983, ...","[[80, 0, 0, 0, 0, 1, 2, 0, 0], [0, 80, 0, 0, 0..."
2,DTC,"[0.459, 0.886, 0.788, 0.75, 0.956, 0.841, 0.92...",0.81746,"[0.614, 0.875, 0.788, 0.684, 0.945, 0.802, 0.8...","[0.526, 0.881, 0.788, 0.715, 0.95, 0.821, 0.87...","[[51, 6, 5, 6, 3, 5, 5, 2, 0], [9, 70, 1, 0, 0..."
3,SVC,"[0.677, 0.97, 0.656, 0.638, 0.976, 0.859, 0.97...",0.814815,"[0.783, 0.812, 0.694, 0.759, 0.912, 0.779, 0.8...","[0.726, 0.884, 0.674, 0.694, 0.943, 0.817, 0.9...","[[65, 0, 3, 5, 0, 3, 2, 2, 3], [4, 65, 4, 2, 0..."
4,RF,"[0.449, 0.895, 0.78, 0.72, 1.0, 0.778, 1.0, 0....",0.810847,"[0.578, 0.85, 0.753, 0.747, 0.945, 0.814, 0.88...","[0.505, 0.872, 0.766, 0.733, 0.972, 0.795, 0.9...","[[48, 6, 6, 8, 0, 10, 0, 1, 4], [9, 68, 2, 1, ..."
5,KNN,"[0.315, 0.662, 0.718, 0.469, 0.838, 0.443, 0.9...",0.57672,"[0.554, 0.588, 0.6, 0.57, 0.681, 0.547, 0.616,...","[0.402, 0.623, 0.654, 0.514, 0.752, 0.49, 0.75...","[[46, 5, 4, 11, 2, 5, 1, 2, 7], [14, 47, 4, 6,..."
6,NB,"[0.067, 0.155, 0.333, 0.231, 0.435, 0.3, 0.2, ...",0.195767,"[0.012, 0.6, 0.259, 0.038, 0.11, 0.035, 0.605,...","[0.02, 0.246, 0.291, 0.065, 0.175, 0.062, 0.30...","[[1, 36, 6, 2, 0, 2, 31, 2, 3], [2, 48, 6, 1, ..."


#### 1.3. Test set evaluation

In [16]:
# test_df = describe_dataset("./test.csv")
test_df = describe_dataset("test.csv")
test_df = test_df.sample(frac=1).reset_index(drop=True)

test_df.loc[test_df["label"] == "C", "label"] = 0
test_df.loc[test_df["label"] == "H", "label"] = 1
test_df.loc[test_df["label"] == "L", "label"] = 2

test_x = test_df.drop("label", axis=1)
test_y = test_df["label"].astype("int")

test_x = pd.DataFrame(sc.transform(test_x))

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'left_elbow_v', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z', 'right_elbow_v', 'left_wrist_x', 'left_wrist_y', 'left_wrist_z', 'left_wrist_v', 'right_wrist_x', 'right_wrist_y', 'right_wrist_z', 'right_wrist_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v', 'left_heel_x', 'left_heel_y', 'left_heel_z', 'left_heel_v', 'right_heel_x', 'right_heel_y', 'right_heel_z', 'right_heel_v', 'left_foot_index_x', 'lef

In [None]:
# Evaluate models on the test sets
testset_final_results = []

for name, model in models.items():
    # Evaluate model
    model_results = model.predict(test_x)

    p_score = precision_score(test_y, model_results, average=None, labels=[0, 1, 2])
    a_score = accuracy_score(test_y, model_results)
    r_score = recall_score(test_y, model_results, average=None, labels=[0, 1, 2])
    f1_score_result = f1_score(test_y, model_results, average=None, labels=[0, 1, 2])
    cm = confusion_matrix(test_y, model_results, labels=[0, 1, 2])
    testset_final_results.append(( name,  round_up_metric_results(p_score), a_score, round_up_metric_results(r_score), round_up_metric_results(f1_score_result), cm ))


testset_final_results.sort(key=lambda k: sum(k[4]), reverse=True)
pd.DataFrame(testset_final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,LR,"[0.983, 0.996, 1.0]",0.992958,"[0.996, 0.996, 0.987]","[0.989, 0.996, 0.994]","[[233, 1, 0], [1, 240, 0], [3, 0, 232]]"
1,SVC,"[0.967, 1.0, 1.0]",0.988732,"[1.0, 0.992, 0.974]","[0.983, 0.996, 0.987]","[[234, 0, 0], [2, 239, 0], [6, 0, 229]]"
2,SGDC,"[0.978, 0.971, 1.0]",0.983099,"[0.97, 0.988, 0.991]","[0.974, 0.979, 0.996]","[[227, 7, 0], [3, 238, 0], [2, 0, 233]]"
3,KNN,"[0.87, 1.0, 1.0]",0.950704,"[1.0, 0.992, 0.86]","[0.93, 0.996, 0.924]","[[234, 0, 0], [2, 239, 0], [33, 0, 202]]"
4,RF,"[0.777, 1.0, 0.994]",0.904225,"[1.0, 0.992, 0.719]","[0.875, 0.996, 0.835]","[[234, 0, 0], [1, 239, 1], [66, 0, 169]]"
5,NB,"[0.887, 0.736, 0.949]",0.842254,"[0.637, 0.938, 0.949]","[0.741, 0.825, 0.949]","[[149, 73, 12], [15, 226, 0], [4, 8, 223]]"
6,DTC,"[0.66, 1.0, 0.703]",0.784507,"[0.722, 0.963, 0.664]","[0.69, 0.981, 0.683]","[[169, 0, 65], [8, 232, 1], [79, 0, 156]]"


#### 1.4. Dumped model and input scaler using pickle

According to the evaluations, there are multiple good models at the moment, therefore, the best models are LR and Ridge.

In [9]:
with open("./model/all_sklearn.pkl", "wb") as f:
    pickle.dump(models, f)

In [10]:
with open("./model/LR_model.pkl", "wb") as f:
    pickle.dump(models["LR"], f)

In [11]:
with open("./model/SVC_model.pkl", "wb") as f:
    pickle.dump(models["SVC"], f)

In [12]:
# Dump input scaler
with open("./model/input_scaler.pkl", "wb") as f:
    pickle.dump(sc, f)