In [1]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides any warning emitted while fitting or evaluating
# the models below — confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')



In [2]:
def describe_dataset(dataset_path, label_col="label"):
    """Load a CSV dataset and print a short summary of its contents.

    Prints the column headers, the row and column counts, the per-class counts
    of the label column, whether any value is missing, and the number of fully
    duplicated rows.

    Args:
        dataset_path: Path of the CSV file to read.
        label_col: Name of the column that holds the class labels.

    Returns:
        pandas.DataFrame: The loaded dataset, unmodified.

    Raises:
        KeyError: If ``label_col`` is not a column of the CSV file.
    """
    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data[label_col].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")

    # Count duplicates straight from the boolean mask. Summing the duplicated
    # rows themselves would try to add the string label to the numeric columns.
    duplicate_count = int(data.duplicated().sum())
    print(f"Duplicate Rows : {duplicate_count}")

    return data

# Load the training CSV and print its summary; `df` still holds the raw string labels here.
df = describe_dataset("train.csv")

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z', 'right_elbow_v', 'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'left_elbow_v', 'right_wrist_x', 'right_wrist_y', 'right_wrist_z', 'right_wrist_v', 'left_wrist_x', 'left_wrist_y', 'left_wrist_z', 'left_wrist_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v']
Number of rows: 15372 
Number of columns: 37

Labels: 
label
C    8238
L    7134
Name: count, dtype: int64

Missing values: False

Duplicate Rows : 0


In [3]:
# Encode the class labels in place: "C" -> 0, "L" -> 1
for class_name, class_id in (("C", 0), ("L", 1)):
    df.loc[df["label"] == class_name, "label"] = class_id

In [4]:
# NOTE(review): when the cells run in order, this unfitted scaler is replaced by
# the pickled scaler loaded in the next cell, so this assignment is never used.
sc = StandardScaler()

In [5]:
# Restore the previously saved input scaler from disk.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted artifact.
with open("../../model/input_scaler.pkl", "rb") as scaler_file:
    sc = pickle.Unpickler(scaler_file).load()

In [6]:
# Targets: the encoded label column as integers
y = df["label"].astype(int)

# Features: every non-label column, transformed by the loaded scaler
x = pd.DataFrame(sc.transform(df.drop(columns=["label"])))

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1234,
    test_size=0.2,
)
y_train

9465     1
8833     0
6190     0
7645     0
13890    1
        ..
11468    1
7221     1
1318     1
8915     1
11055    1
Name: label, Length: 12297, dtype: int64

In [8]:
def round_up_metric_results(results) -> list:
    """Return the metric values in ``results`` rounded to three decimal places.

    Despite the name, each value is rounded with the built-in ``round`` (to
    the nearest value), not always upwards.
    """
    return [round(value, 3) for value in results]

# Seed for the estimators that use randomness, so that re-running this cell
# yields the same fitted models and scores. Same value as the seed passed to
# train_test_split for the train/validation split.
RANDOM_STATE = 1234

algorithms = [("LogisticRegression", LogisticRegression()),
              ("KNeighborsClassifier", KNeighborsClassifier()),
              ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=RANDOM_STATE)),
              ("RandomForestClassifier", RandomForestClassifier(random_state=RANDOM_STATE)),]

# Fitted estimators keyed by name (pickled later) and one result row per model
models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate the fitted model on the validation split
    model_results = trained_model.predict(X_test)

    # average=None returns one score per class, in the order given by `labels`
    p_score = precision_score(y_test, model_results, average=None, labels=[0, 1])
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average=None, labels=[0, 1])
    f1_score_result = f1_score(y_test, model_results, average=None, labels=[0, 1])
    cm = confusion_matrix(y_test, model_results, labels=[0, 1])

    # The tuple order must match the column order of the DataFrame built below
    final_results.append((
        name,
        round_up_metric_results(p_score),
        a_score,
        round_up_metric_results(r_score),
        round_up_metric_results(f1_score_result),
        cm,
    ))


pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,LogisticRegression,"[0.986, 0.976]",0.981138,"[0.98, 0.983]","[0.983, 0.979]","[[1645, 34], [24, 1372]]"
1,KNeighborsClassifier,"[0.997, 0.999]",0.998049,"[0.999, 0.996]","[0.998, 0.998]","[[1678, 1], [5, 1391]]"
2,DecisionTreeClassifier,"[0.998, 0.992]",0.995122,"[0.993, 0.997]","[0.996, 0.995]","[[1668, 11], [4, 1392]]"
3,RandomForestClassifier,"[0.999, 0.999]",0.999024,"[0.999, 0.999]","[0.999, 0.999]","[[1677, 2], [1, 1395]]"


In [10]:
# Persist every fitted model in a single pickle file, keyed by model name.
with open("../../model/all_sklearn.pkl", "wb") as models_file:
    pickle.Pickler(models_file).dump(models)

# Test: evaluate the trained models on the held-out test set (`./test.csv`)

In [None]:
# Read the held-out test set and print its summary
test_df = describe_dataset("./test.csv")

# Encode the class labels in place: "C" -> 0, "L" -> 1
for class_name, class_id in (("C", 0), ("L", 1)):
    test_df.loc[test_df["label"] == class_name, "label"] = class_id

# Targets: the encoded label column as integers
test_y = test_df["label"].astype(int)

# Features: every non-label column, transformed by the same scaler as the training features
test_x = pd.DataFrame(sc.transform(test_df.drop(columns=["label"])))


Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z', 'right_elbow_v', 'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'left_elbow_v', 'right_wrist_x', 'right_wrist_y', 'right_wrist_z', 'right_wrist_v', 'left_wrist_x', 'left_wrist_y', 'left_wrist_z', 'left_wrist_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v']
Number of rows: 604 
Number of columns: 37

Labels: 
label
C    339
L    265
Name: count, dtype: int64

Missing values: False

Duplicate Rows : 0


In [None]:
# Column order of the evaluation report; each result tuple follows this order
report_columns = ["Model", "Precision Score", "Recall Score", "Accuracy Score", "F1 Score", "Confusion Matrix"]

testset_final_results = []

for model_name, fitted_model in models.items():
    predictions = fitted_model.predict(test_x)

    # Macro-averaged metrics, keyed by report column so the order cannot drift
    row = {
        "Model": model_name,
        "Precision Score": precision_score(test_y, predictions, average="macro"),
        "Recall Score": recall_score(test_y, predictions, average="macro"),
        "Accuracy Score": accuracy_score(test_y, predictions),
        "F1 Score": f1_score(test_y, predictions, average="macro"),
        "Confusion Matrix": confusion_matrix(test_y, predictions, labels=[0, 1]),
    }
    testset_final_results.append(tuple(row[column] for column in report_columns))


evaluation = pd.DataFrame(testset_final_results, columns=report_columns)

evaluation

Unnamed: 0,Model,Precision Score,Recall Score,Accuracy Score,F1 Score,Confusion Matrix
0,LogisticRegression,0.793933,0.739662,0.763245,0.742615,"[[316, 23], [120, 145]]"
1,KNeighborsClassifier,0.975401,0.968336,0.971854,0.97123,"[[338, 1], [16, 249]]"
2,DecisionTreeClassifier,0.724533,0.702961,0.720199,0.705023,"[[286, 53], [116, 149]]"
3,RandomForestClassifier,0.956873,0.939623,0.94702,0.945336,"[[339, 0], [32, 233]]"
