# Classification on Motion Capture Dataset

## Metadata

In [29]:
# Notebook metadata. `dataset` is interpolated into the MLflow experiment
# names used by the tuning cells further down.
dataset = "MoCap"
name = "Aishwarya Prabhat"
gtUsername = "aprabhat7"
gtID = 903_648_486

## Reading data

In [30]:
import pandas as pd

In [31]:
# Load the raw posture table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="data/Postures.csv")

In [32]:
# Preview the first five rows (pandas' default for head()).
df.head(n=5)

Unnamed: 0,Class,User,X0,Y0,Z0,X1,Y1,Z1,X2,Y2,...,Z8,X9,Y9,Z9,X10,Y10,Z10,X11,Y11,Z11
0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,54.26388,71.466776,-64.807709,76.895635,42.4625,-72.780545,36.621229,81.680557,...,?,?,?,?,?,?,?,?,?,?
2,1,0,56.527558,72.266609,-61.935252,39.135978,82.53853,-49.596509,79.223743,43.254091,...,?,?,?,?,?,?,?,?,?,?
3,1,0,55.849928,72.469064,-62.562788,37.988804,82.631347,-50.606259,78.451526,43.567403,...,?,?,?,?,?,?,?,?,?,?
4,1,0,55.329647,71.707275,-63.688956,36.561863,81.868749,-52.752784,86.32063,68.214645,...,?,?,?,?,?,?,?,?,?,?


In [33]:
# Summary statistics at the default quartiles for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Class,User,X0,Y0,Z0,X1,Y1,Z1,X2,Y2,Z2
count,78096.0,78096.0,78096.0,78096.0,78096.0,78096.0,78096.0,78096.0,78096.0,78096.0,78096.0
mean,2.983738,7.959127,50.345664,85.812051,-29.984712,49.595209,86.192647,-29.509202,48.612121,83.771315,-30.560515
std,1.421183,4.69781,32.696173,40.204363,34.361918,32.478238,40.453214,34.764398,33.60539,41.023543,35.120329
min,0.0,0.0,-108.552738,-98.233756,-126.770872,-111.685241,-96.142589,-166.006838,-106.886524,-100.789312,-129.595296
25%,2.0,5.0,29.295062,63.494432,-56.356438,28.755137,64.154529,-57.360107,25.170006,58.052385,-58.654059
50%,3.0,9.0,54.619964,86.526246,-30.864125,54.215514,87.542751,-30.184005,53.81458,86.458324,-32.352414
75%,4.0,12.0,72.488686,113.107355,-1.418803,71.762039,116.219398,-0.366692,71.561951,106.660827,-0.944786
max,5.0,14.0,190.017835,169.175464,113.345119,188.691997,170.20935,104.697852,188.760168,168.186466,104.590879


In [34]:
# "?" is the placeholder for a missing value in the raw file; treat it as 0.
df.replace("?", 0, inplace=True)
# Columns that contained "?" hold strings rather than numbers (presumably why
# describe() above only lists the columns up to Z2). Coerce every column to a
# numeric dtype explicitly instead of relying on the estimators to convert
# numeric strings implicitly; to_numeric raises on any other non-numeric token.
df = df.apply(pd.to_numeric)
# NOTE(review): row 0 is all zeros with Class 0, while every other previewed
# row has Class >= 1 - it looks like a placeholder record. Confirm against the
# dataset description whether it should be dropped before training.

## Splitting dataset into training and test sets

In [35]:
from sklearn.model_selection import train_test_split

# Features are every column except the target ("Class") and the "User" column.
non_feature_columns = ['Class', 'User']
X = df.drop(columns=non_feature_columns)
y = df.loc[:, "Class"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1101,
)

## Training Models for hyperparameter tuning

### Decision trees with some form of pruning


In [92]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree(X, y, max_depth=20):
    """Fit and return a depth-limited decision tree classifier.

    Args:
        X: Training features.
        y: Training labels.
        max_depth: Maximum depth the tree may grow to (default 20).

    Returns:
        The fitted ``DecisionTreeClassifier`` (seeded with 1101).
    """
    tree_settings = {"max_depth": max_depth, "random_state": 1101}
    # fit() returns the estimator itself, so construct and fit in one expression.
    return DecisionTreeClassifier(**tree_settings).fit(X, y)

In [95]:
import mlflow
import time
import pickle
import sys
from sklearn import metrics  # accuracy_score is used below; no earlier cell imports it

# One MLflow experiment per model family, prefixed with the dataset name.
mlflow.set_experiment("{}-decision-tree-pruning".format(dataset))
mlflow.sklearn.autolog()

# Sweep the depth limit and record test accuracy, fit time and model size.
for max_depth in [5,10,20,30,50,100]:
    with mlflow.start_run(run_name="Max depth: "+str(max_depth)) as run:
        # Time only the fit call.
        start = time.time()
        clf = decision_tree(X_train, y_train, max_depth)
        end = time.time()
        y_pred = clf.predict(X_test)
        # Size, as reported by getsizeof, of the bytes object holding the pickled model.
        size = sys.getsizeof(pickle.dumps(clf))
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_metric("test_acc", metrics.accuracy_score(y_test, y_pred))
        mlflow.log_metric("training_time", end-start)
        mlflow.log_metric("model_size", size)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




#### Parallel coordinates plot obtained from MLflow for decision tree pruning

![](images/dectree_pcp.png)

### Neural networks

In [9]:
from sklearn.neural_network import MLPClassifier

def neural_network(X, y, hidden_layer_sizes=(5,50)):
    """Fit and return a multi-layer perceptron classifier.

    Args:
        X: Training features.
        y: Training labels.
        hidden_layer_sizes: Tuple passed straight through to
            ``MLPClassifier`` (default ``(5, 50)``).

    Returns:
        The fitted ``MLPClassifier`` (seeded with 1101).
    """
    mlp = MLPClassifier(random_state=1101, hidden_layer_sizes=hidden_layer_sizes)
    # fit() returns the estimator itself.
    return mlp.fit(X, y)

In [107]:
import mlflow
import time
import pickle
import sys
from sklearn import metrics  # accuracy_score is used below; no earlier cell imports it

mlflow.set_experiment("{}-neural-network-optimization".format(dataset))
mlflow.sklearn.autolog()

# The loop variables are deliberately NOT called `x`/`y`: a loop variable named
# `y` would overwrite the global label Series `y` that the learning-curve cell
# later passes to get_learning_curve.
for layers in [1,2,5]:
    for nodes in [1,5,10,50,100]:
        with mlflow.start_run(run_name="layers={}, nodes={}".format(layers,nodes)) as run:
            # NOTE(review): MLPClassifier's hidden_layer_sizes lists the width of
            # each hidden layer, so (nodes, layers) builds two hidden layers of
            # those widths rather than `layers` layers of `nodes` units. Kept
            # as-is so results stay comparable - confirm this is intended.
            start = time.time()
            clf = neural_network(X_train, y_train, (nodes,layers))
            end = time.time()
            y_pred = clf.predict(X_test)
            # Size, as reported by getsizeof, of the bytes object holding the pickled model.
            size = sys.getsizeof(pickle.dumps(clf))

            mlflow.log_param("layers", layers)
            mlflow.log_param("nodes", nodes)
            mlflow.log_metric("test_acc", metrics.accuracy_score(y_test, y_pred))
            mlflow.log_metric("training_time", end-start)
            mlflow.log_metric("model_size", size)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


#### Parallel coordinates plot obtained from MLflow for neural network layers-nodes tuning

![](images/nn_pcp.png)

### Boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

def boosted_dt(X,y, n_estimators, max_depth):
    """Fit and return a gradient-boosted tree ensemble.

    Args:
        X: Training features.
        y: Training labels.
        n_estimators: Number of boosting stages.
        max_depth: Depth limit of each individual tree.

    Returns:
        The fitted ``GradientBoostingClassifier`` (learning rate 1.0, seed 0).
    """
    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=1.0,
        random_state=0,
    )
    # fit() returns the estimator itself.
    return booster.fit(X, y)

In [18]:
import mlflow
import time
import pickle
import sys
from sklearn import metrics

mlflow.set_experiment(f"{dataset}-boosting-optimization")
mlflow.sklearn.autolog()

# Grid over ensemble size and per-tree depth; depth is capped at 20, the value
# that worked for the single decision tree.
for n_estimators in [10,20,50,100]:
    for max_depth in [1,5,10,20]:
        run_label = f"n_estimators={n_estimators}, max_depth={max_depth}"
        with mlflow.start_run(run_name=run_label) as run:
            # Time only the fit call.
            start = time.time()
            clf = boosted_dt(X_train, y_train, n_estimators, max_depth)
            end = time.time()
            y_pred = clf.predict(X_test)
            size = sys.getsizeof(pickle.dumps(clf))

            # Hyper-parameters are not logged explicitly here; presumably the
            # sklearn autologging enabled above records them - verify in the UI.
            for metric_name, metric_value in (
                ("test_acc", metrics.accuracy_score(y_test, y_pred)),
                ("training_time", end - start),
                ("model_size", size),
            ):
                mlflow.log_metric(metric_name, metric_value)

  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


![](images/boosting_pcp.png)

### SVM

In [17]:
from sklearn.svm import SVC

def svm(X,y, kernel="rbf", max_iter=10):
    """Fit and return a support vector classifier.

    Args:
        X: Training features.
        y: Training labels.
        kernel: Kernel name forwarded to ``SVC`` (default ``"rbf"``).
        max_iter: Iteration cap forwarded to ``SVC`` (default 10).

    Returns:
        The fitted ``SVC``.
    """
    svc_settings = {"kernel": kernel, "max_iter": max_iter}
    # fit() returns the estimator itself.
    return SVC(**svc_settings).fit(X, y)

In [None]:
import mlflow
import time
import pickle
import sys
from sklearn import metrics

mlflow.set_experiment(f"{dataset}-svm-optimization")
mlflow.sklearn.autolog()

# Grid over kernel type and solver iteration cap.
for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    for max_iter in [2000, 5000, 10000]:
        run_label = f"kernel: {kernel}, max_iter: {max_iter}"
        with mlflow.start_run(run_name=run_label) as run:
            # Time only the fit call.
            start = time.time()
            clf = svm(X_train, y_train, kernel, max_iter)
            end = time.time()
            y_pred = clf.predict(X_test)
            size = sys.getsizeof(pickle.dumps(clf))

            for metric_name, metric_value in (
                ("test_acc", metrics.accuracy_score(y_test, y_pred)),
                ("training_time", end - start),
                ("model_size", size),
            ):
                mlflow.log_metric(metric_name, metric_value)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

def knn(X,y,n_neighbors):
    """Fit and return a k-nearest-neighbours classifier.

    Args:
        X: Training features.
        y: Training labels.
        n_neighbors: Number of neighbours consulted per prediction.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    # fit() returns the estimator itself.
    return KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)

In [12]:
import mlflow
import time
import pickle
import sys
from sklearn import metrics

mlflow.set_experiment(f"{dataset}-knn-optimization")
mlflow.sklearn.autolog()

# NOTE(review): the list is named `fib`, but 35 and 56 are not Fibonacci
# numbers (34 and 55 are) - confirm the intended neighbour counts.
fib = [1,5,8,13,35,56]
for n_neighbors in fib:
    with mlflow.start_run(run_name=f"N: {n_neighbors}") as run:
        clf = knn(X_train, y_train, n_neighbors)
        # Unlike the other sweeps, the timed section here is predict(), not fit().
        start = time.time()
        y_pred = clf.predict(X_test)
        end = time.time()
        size = sys.getsizeof(pickle.dumps(clf))

        for metric_name, metric_value in (
            ("test_acc", metrics.accuracy_score(y_test, y_pred)),
            ("inference_time", end - start),
            ("model_size", size),
        ):
            mlflow.log_metric(metric_name, metric_value)

## Comparing models

In [55]:
from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


def get_learning_curve(estimator, X, y, cv=5, n_jobs=4, random_state=1101):
    """Compute learning-curve scores and timings for one estimator.

    Args:
        estimator: Unfitted scikit-learn estimator to evaluate.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation setting forwarded to ``learning_curve``
            (default 5; an integer with a classifier gives stratified k-fold).
        n_jobs: Number of parallel jobs (default 4).
        random_state: Seed for the shuffle applied before taking training
            prefixes. Defaults to 1101, the seed used for the train/test
            split, so repeated runs produce the same curves.

    Returns:
        Tuple ``(train_sizes, train_scores, test_scores, fit_times,
        score_times)`` exactly as returned by ``learning_curve`` with
        ``return_times=True``.
    """
    return learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        shuffle=True,
        # Without a seed the shuffle makes every run's curve different.
        random_state=random_state,
        return_times=True,
    )

In [61]:
class ExperimentModel:
    """Pairs a named estimator with its learning-curve results.

    The five result attributes start as ``None`` and are filled in by the
    caller once the learning curve has been computed.
    """

    def __init__(self, name, estimator):
        self.name, self.estimator = name, estimator
        # Placeholders for the outputs of get_learning_curve.
        for result_field in (
            "train_sizes",
            "train_scores",
            "test_scores",
            "fit_times",
            "score_times",
        ):
            setattr(self, result_field, None)
        
# (display name, unfitted estimator) pairs to compare via learning curves.
estimators = [
    (
        "Decision Tree",
        DecisionTreeClassifier(max_depth=20, random_state=1101),
    ),
    (
        "Neural Network",
        MLPClassifier(hidden_layer_sizes=(50, 5), random_state=1101),
    ),
    (
        "Boosting",
        GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=1.0,
            max_depth=10,
            random_state=1101,
        ),
    ),
    (
        "SVM",
        SVC(kernel="rbf", max_iter=1000),
    ),
    (
        "KNN",
        KNeighborsClassifier(n_neighbors=1),
    ),
]

# Filled with one ExperimentModel per entry above by the next cell.
models = list()

In [None]:
from tqdm.notebook import tqdm

# Compute a learning curve for every candidate and keep the results together
# with the estimator's display name.
for estimator in tqdm(estimators):
    display_name, unfitted = estimator
    model = ExperimentModel(display_name, unfitted)
    (
        model.train_sizes,
        model.train_scores,
        model.test_scores,
        model.fit_times,
        model.score_times,
    ) = get_learning_curve(unfitted, X, y)
    models.append(model)
    

  0%|          | 0/5 [00:00<?, ?it/s]



In [None]:
import numpy as np

def generate_plots(models):
    """Draw four stacked learning-curve panels comparing all models.

    Panels, top to bottom: mean training score, mean test score, mean fit
    time and mean score time, each against the number of training examples.

    Args:
        models: Iterable of objects exposing ``name``, ``train_sizes``,
            ``train_scores``, ``test_scores``, ``fit_times`` and
            ``score_times`` (as filled in from ``learning_curve``).
    """
    # Imported here so the function does not depend on a `plt` global that no
    # cell in this notebook defines.
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(4, 1, figsize=(10, 15))

    # (model attribute, y-axis label, panel title) for each of the four axes.
    panels = [
        ("train_scores", "Mean training score", "Training performance vs Dataset size"),
        ("test_scores", "Mean test score", "Test performance vs Dataset size"),
        ("fit_times", "Mean training time (fit_time)", "Training scalability"),
        ("score_times", "Mean inference time (score_time)", "Inference scalability"),
    ]

    for ax, (attribute, y_label, title) in zip(axes, panels):
        ax.grid()
        ax.set_xlabel("No. of training examples")
        ax.set_ylabel(y_label)
        ax.set_title(title)

        for model in models:
            # learning_curve returns one column per CV fold; average across
            # folds (axis=1) to get one value per training-set size. A plain
            # np.mean() collapses to a scalar, which plot() cannot pair with
            # train_sizes.
            fold_mean = np.mean(getattr(model, attribute), axis=1)
            ax.plot(model.train_sizes, fold_mean, label=model.name)

        # The legend needs labelled lines, so build it after plotting and skip
        # it when there is nothing to show.
        if models:
            ax.legend(loc="best")
    
    