In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pickle
import os

In [41]:
# Load the non-concatenated table (no header row): column 0 is used as the label,
# every remaining column as a feature.
data = pd.read_csv("data_nonconcat.csv", header=None)

# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the
# same ordering (previously the shuffle was unseeded).
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

labels = data[0]
data = data.drop(0, axis='columns')

# Hold out 30% of the rows for testing. The seed makes the split reproducible
# and matches the random_state already used for the concat data split.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.30, random_state=42
)
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

# Turn the label Series into (n, 1) column vectors.
y_train = np.asarray(y_train.values.tolist()).reshape(-1, 1)
y_test = np.asarray(y_test.values.tolist()).reshape(-1, 1)

In [42]:
# Sanity check: one shape per line for the feature table and each split.
print(
    *(arr.shape for arr in (data, X_train, X_test, y_train, y_test)),
    sep="\n",
)

(8820, 34)
(6174, 34)
(2646, 34)
(6174, 1)
(2646, 1)


In [61]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
def displayStats(model,modelname: str,test: np.ndarray,truth: np.ndarray) -> None:
    """Print the accuracy and the confusion matrix of ``model`` on a test set.

    Parameters
    ----------
    model
        Any fitted object exposing ``predict(test)``.
    modelname : str
        Label printed in the header line of the report.
    test : np.ndarray
        Samples handed to ``model.predict``.
    truth : np.ndarray
        Ground-truth labels compared against the predictions.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    predicted = model.predict(test)  # every model passed in must provide predict()

    print(modelname + " STATS:")

    accuracy = accuracy_score(truth, predicted)
    print("ACCURACY: " + str(accuracy))

    matrix = confusion_matrix(truth, predicted)
    print(matrix)
    # Optional graphical rendering of the matrix, currently disabled:
    # disp = ConfusionMatrixDisplay(matrix)
    # disp.plot()
    # plt.show()

# CatBoost Classifier - Revision 3

This revision builds on the conclusions of rev2: it trains a multiclass frame-level classifier and then applies it to classifying whole clips.

---

1) Load the concatenated vectors, each of which represents one clip

In [44]:
# Load the concatenated table (no header row): column 0 is used as the label,
# the remaining columns hold the concatenated data for one clip per row.
data_C = pd.read_csv("data_concat.csv", header=None)

# Shuffle with a fixed seed. Without it the row order differs on every run, so
# the seeded train_test_split below would still select different clips each time.
data_C = data_C.sample(frac=1, random_state=42).reset_index(drop=True)

labels_C = data_C[0]
data_C = data_C.drop(0, axis='columns')

# Hold out 30% of the clips for testing, reproducibly.
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(
    data_C, labels_C, test_size=0.30, random_state=42
)
X_train_C = X_train_C.to_numpy()
X_test_C = X_test_C.to_numpy()

# Turn the label Series into (n, 1) column vectors.
y_train_C = np.asarray(y_train_C.values.tolist()).reshape(-1, 1)
y_test_C = np.asarray(y_test_C.values.tolist()).reshape(-1, 1)

2) Reshape the data portions of the vector into frames

In [45]:
# Re-cut every concatenated row into consecutive 34-column rows (one per frame).
# The row counts are kept as floats, as before, and truncated with int() for the
# reshape; reshape raises if the element count is not a multiple of 34.
newdimx_train = X_train_C.size / 34
X_train_Cr = X_train_C.reshape((int(newdimx_train), 34))

newdimx_test = X_test_C.size / 34
X_test_Cr = X_test_C.reshape((int(newdimx_test), 34))

print("CONCAT DATA SHAPES AFTER RESHAPING")
print(X_train_Cr.shape, X_test_Cr.shape, sep="\n")

CONCAT DATA SHAPES AFTER RESHAPING
(6120, 34)
(2700, 34)


3) Relabel for multiclass

In [55]:
print("LABEL SHAPES BEFORE STRETCHING")
print(y_train_C.shape)
print(y_test_C.shape)


def _stretch_clip_labels(clip_labels, frames_per_clip=60):
    """Expand one label per clip into one signed frame-index label per frame.

    A clip whose label equals 1 contributes 1, 2, ..., frames_per_clip; any
    other clip contributes -1, -2, ..., -frames_per_clip. Clips keep their
    original order and each clip's frames are contiguous.

    Parameters
    ----------
    clip_labels : array-like
        Per-clip labels; flattened to one label per row before expansion.
    frames_per_clip : int, default 60
        Number of frame labels generated for every clip.

    Returns
    -------
    np.ndarray
        float64 column vector of shape (n_clips * frames_per_clip, 1). The float
        dtype matches what the previous concatenate-based loop produced.
    """
    clip_labels = np.asarray(clip_labels).reshape(-1, 1)
    frame_ids = np.arange(1, frames_per_clip + 1, dtype=float)
    # +1 for clips labelled 1, -1 for everything else.
    signs = np.where(clip_labels == 1, 1.0, -1.0)
    # (n_clips, 1) * (frames_per_clip,) broadcasts to one row of signed frame
    # indices per clip; flattening row by row keeps a clip's frames together.
    return (signs * frame_ids).reshape(-1, 1)


# Built in one vectorised step instead of growing the arrays with
# np.concatenate inside a loop, which re-copied the whole array per clip.
y_train_Cr = _stretch_clip_labels(y_train_C)
y_test_Cr = _stretch_clip_labels(y_test_C)

print("LABEL SHAPES AFTER STRETCHING")
print(y_train_Cr.shape)
print(y_test_Cr.shape)

LABEL SHAPES BEFORE STRETCHING
(102, 1)
(45, 1)
LABEL SHAPES AFTER STRETCHING
(6120, 1)
(2700, 1)


4) Train the multiclass classifier

In [56]:
from catboost import CatBoostClassifier
# Fit the frame-level multiclass model on the reshaped frames and stretched labels.
# silent=True is passed to the constructor, as before.
cbm = CatBoostClassifier(silent=True)
cbm.fit(X_train_Cr, y=y_train_Cr)

Learning rate set to 0.086786
0:	learn: 4.7465222	total: 640ms	remaining: 10m 39s
1:	learn: 4.7046243	total: 1.19s	remaining: 9m 54s
2:	learn: 4.6595340	total: 1.74s	remaining: 9m 39s
3:	learn: 4.6208275	total: 2.3s	remaining: 9m 32s
4:	learn: 4.5866880	total: 2.87s	remaining: 9m 30s
5:	learn: 4.5571579	total: 3.51s	remaining: 9m 40s
6:	learn: 4.5298827	total: 4.08s	remaining: 9m 38s
7:	learn: 4.4971941	total: 4.66s	remaining: 9m 37s
8:	learn: 4.4739053	total: 5.31s	remaining: 9m 44s
9:	learn: 4.4494157	total: 5.89s	remaining: 9m 43s
10:	learn: 4.4290965	total: 6.45s	remaining: 9m 39s
11:	learn: 4.4066570	total: 7.04s	remaining: 9m 39s
12:	learn: 4.3804687	total: 7.61s	remaining: 9m 37s
13:	learn: 4.3547582	total: 8.24s	remaining: 9m 40s
14:	learn: 4.3381192	total: 8.85s	remaining: 9m 41s
15:	learn: 4.3185379	total: 9.51s	remaining: 9m 44s
16:	learn: 4.3006854	total: 10.2s	remaining: 9m 50s
17:	learn: 4.2779549	total: 10.8s	remaining: 9m 47s
18:	learn: 4.2631910	total: 11.3s	remaining:

<catboost.core.CatBoostClassifier at 0x7f79a9f8a520>

5) Check frame-by-frame stats

In [62]:
# Report accuracy and the confusion matrix on the held-out frames.
displayStats(
    model=cbm,
    modelname="CATBOOST REV 3",
    test=X_test_Cr,
    truth=y_test_Cr,
)

CATBOOST REV 3 STATS:
ACCURACY: 0.06777777777777778
[[4 2 0 ... 0 1 3]
 [2 4 1 ... 2 1 3]
 [1 2 3 ... 0 0 3]
 ...
 [1 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 2]
 [0 0 0 ... 4 1 1]]
