# 1. Read collected data & pre-process

In [1]:
# Import relevant dependencies
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [4]:
# Resolve the project root as the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a sub-folder of the
# project root - confirm before running it from elsewhere.
path = os.getcwd()
main_directory = os.path.dirname(path)

# Read the combined keypoints .csv file from the previous data collection.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded '/'.
dataset_path = os.path.join(main_directory, 'dataset', 'keypoints_combined.csv')
df = pd.read_csv(dataset_path)

In [5]:
# Separate the label column from the feature columns
target_column = 'class'
x = df.drop(columns=target_column)  # features: every column except the label
y = df[target_column]  # target value: the label column on its own

In [6]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The rows are shuffled before splitting; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Show the labels that ended up in the held-out test split
print(y_test)

105    C
108    C
142    C
55     B
94     C
29     A
101    C
51     B
100    C
143    C
19     A
84     C
15     A
66     B
24     A
30     A
128    C
148    C
98     C
16     A
75     C
18     A
12     A
9      A
31     A
152    C
97     C
56     B
132    C
104    C
137    C
78     C
60     B
Name: class, dtype: object


# 2. Train machine learning classification model

In [8]:
# import relevant dependencies and model libraries
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler 

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [9]:
# Initialize training pipelines.
# Each pipeline standardizes the features and then fits one classifier.
# The two tree ensembles draw random numbers while training, so they get a
# fixed seed; without it a fresh kernel run can produce a different fitted
# model than the one that was evaluated and exported.
MODEL_SEED = 42
pipelines = {
    'logistic-regression': make_pipeline(StandardScaler(), LogisticRegression()),
    'ridge-classifier': make_pipeline(StandardScaler(), RidgeClassifier()),
    'random-forest-classifier': make_pipeline(
        StandardScaler(), RandomForestClassifier(random_state=MODEL_SEED)
    ),
    'gradient-boosting-classifier': make_pipeline(
        StandardScaler(), GradientBoostingClassifier(random_state=MODEL_SEED)
    ),
}

In [10]:
# Fit every pipeline on the training split and keep the result under the
# algorithm's name so later cells can look a model up by key.
fit_models = {}
for algo, pipeline in pipelines.items():
    fit_models[algo] = pipeline.fit(x_train, y_train)

In [11]:
# Displays the bound `fit` method of `pipeline` without calling it.
# `pipeline` is the loop variable left over from the training loop, i.e. the
# last pipeline that was iterated.
# NOTE(review): looks like leftover debugging - confirm it can be removed.
pipeline.fit

<bound method Pipeline.fit of Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gradientboostingclassifier', GradientBoostingClassifier())])>

# 3. Evaluate model

In [12]:
# Import dependencies
from sklearn.metrics import accuracy_score # Accuracy metrics 

In [13]:
# Evaluate every fitted pipeline on the held-out test split and print its
# accuracy next to the algorithm name.
for algo, model in fit_models.items():
    test_predictions = model.predict(x_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(algo, test_accuracy)

logistic-regression 1.0
ridge-classifier 1.0
random-forest-classifier 1.0
gradient-boosting-classifier 1.0


In [14]:
# Predicted labels from the random-forest pipeline for the test features
rf_model = fit_models['random-forest-classifier']
rf_model.predict(x_test)

array(['C', 'C', 'C', 'B', 'C', 'A', 'C', 'B', 'C', 'C', 'A', 'C', 'A',
       'B', 'A', 'A', 'C', 'C', 'C', 'A', 'C', 'A', 'A', 'A', 'A', 'C',
       'C', 'B', 'C', 'C', 'C', 'C', 'B'], dtype=object)

In [15]:
# Actual test labels, displayed for comparison with the predictions above
y_test

105    C
108    C
142    C
55     B
94     C
29     A
101    C
51     B
100    C
143    C
19     A
84     C
15     A
66     B
24     A
30     A
128    C
148    C
98     C
16     A
75     C
18     A
12     A
9      A
31     A
152    C
97     C
56     B
132    C
104    C
137    C
78     C
60     B
Name: class, dtype: object

# 4. Serialize/Export model

In [18]:
# import dependencies
import pickle 

In [19]:
# Export the trained random-forest pipeline as a .pkl file under
# <main_directory>/model.
model_dir = os.path.join(main_directory, 'model')
# Create the target folder when it is missing; open() would otherwise raise
# FileNotFoundError on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, 'trained_classifier.pkl'), 'wb') as f:
    pickle.dump(fit_models['random-forest-classifier'], f)