In [17]:
import os
import h5py
import numpy as np
from sklearn.model_selection import train_test_split

# Constants
# Paths of the HDF5 files that hold the feature matrix and the label vector;
# both are loaded with read_h5_file() further down in this cell.
h5_train_features = "../embeddings/features/features.h5"
h5_train_labels = "../embeddings/labels/labels.h5"
# Fraction of the samples that train_test_split holds out as the test set.
test_size = 0.20
# Passed as random_state to train_test_split (and to the seeded estimators in
# the modelling cell) so repeated runs give the same split and models.
seed = 9

def read_h5_file(file_path, dataset_name="dataset_1"):
    """Load one dataset from an HDF5 file into memory as a NumPy array.

    The dataset handle and the loaded array are printed as a quick visual
    check, exactly as before.

    Args:
        file_path: Path of the HDF5 file; it is opened read-only.
        dataset_name: Key of the dataset to read. Defaults to "dataset_1",
            the key this notebook has always used, so existing calls are
            unaffected.

    Returns:
        numpy.ndarray: an in-memory copy of the dataset's contents.

    Raises:
        Any error raised by ``h5py.File`` or by the dataset lookup (for
        example when the file or the key does not exist) propagates to the
        caller; the file is closed first.
    """
    # The context manager guarantees the file is closed even when the lookup
    # or the copy below raises; the previous manual close() was skipped in
    # that case and leaked the handle.
    with h5py.File(file_path, "r") as h5_file:
        dataset = h5_file[dataset_name]
        print(dataset)
        # Copy the data out while the file is still open so the returned
        # array does not depend on the file handle.
        embeddings = np.array(dataset)
    print(file_path, embeddings)
    return embeddings
  
# Pull the full feature matrix and label vector into memory.
global_features = read_h5_file(h5_train_features)
global_labels = read_h5_file(h5_train_labels)

# Hold out `test_size` of the samples for evaluation; `seed` keeps the
# shuffle repeatable between runs.
trainFeaturesGlobal, testFeaturesGlobal, trainLabelsGlobal, testLabelsGlobal = train_test_split(
    np.array(global_features),
    np.array(global_labels),
    test_size=test_size,
    random_state=seed,
)

# Report the resulting partition sizes (one line per message).
print(
    "[STATUS] splitted train and test data...",
    "Train data  : {}".format(trainFeaturesGlobal.shape),
    "Test data   : {}".format(testFeaturesGlobal.shape),
    sep="\n",
)

<HDF5 dataset "dataset_1": shape (3010, 532), type "<f8">
../embeddings/features/features.h5 [[0.8974175  0.03450962 0.01845123 ... 0.02027887 0.12693291 0.96573218]
 [0.89815922 0.13025558 0.02774864 ... 0.02027767 0.12692423 0.96573354]
 [0.56777027 0.         0.01540143 ... 0.02027886 0.12693269 0.96573218]
 ...
 [0.95697685 0.01228793 0.00548476 ... 0.02027886 0.12693346 0.96573218]
 [0.97704002 0.10614054 0.03136325 ... 0.02027885 0.12692424 0.96573217]
 [0.95214074 0.03819411 0.03671892 ... 0.02027886 0.12692996 0.96573217]]
<HDF5 dataset "dataset_1": shape (3010,), type "<i8">
../embeddings/labels/labels.h5 [0 0 0 ... 3 3 3]
[STATUS] splitted train and test data...
Train data  : (2408, 532)
Test data   : (602, 532)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pickle
import warnings

# Install a blanket "ignore" filter: no warning category is shown from here on.
# NOTE(review): this would also hide any convergence or deprecation warnings
# the estimators below might emit -- confirm that is intended.
warnings.filterwarnings(action="ignore")

# Registry of candidate classifiers, keyed by the short name that is used in
# the accuracy report and in the pickle file name. Estimators that accept a
# random_state share `seed`.
models = dict(
    [
        ("LR", LogisticRegression(random_state=seed)),
        ("KNN", KNeighborsClassifier()),
        ("DTC", DecisionTreeClassifier(random_state=seed)),
        ("RF", RandomForestClassifier(random_state=seed)),
        ("LDA", LinearDiscriminantAnalysis()),
        ("NB", GaussianNB()),
        ("SVM", SVC(random_state=seed)),
    ]
)

def train_and_save_model(models_dir="../models"):
    """Fit, score and pickle every classifier registered in ``models``.

    For each ``name -> estimator`` pair in the module-level ``models`` dict
    the estimator is fitted on ``trainFeaturesGlobal`` / ``trainLabelsGlobal``,
    its accuracy on ``testFeaturesGlobal`` / ``testLabelsGlobal`` is printed,
    and the fitted estimator is pickled to ``<models_dir>/<name>.pkl``.

    Args:
        models_dir: Directory that receives the pickle files. Defaults to
            "../models", the location used previously, so existing calls
            without arguments behave the same.

    Returns:
        None. Results are reported on stdout and written to disk.
    """
    print("[STATUS] training the model...")

    # Create the output directory once instead of re-checking on every
    # iteration; exist_ok avoids the check-then-create race and makes
    # re-running the cell safe.
    os.makedirs(models_dir, exist_ok=True)

    for name, model in models.items():
        # train the model
        model.fit(trainFeaturesGlobal, trainLabelsGlobal)
        # predict on the held-out data
        predictions = model.predict(testFeaturesGlobal)
        # evaluate the model
        score = accuracy_score(testLabelsGlobal, predictions)
        print("Model: {} Accuracy: {}".format(name, score))

        # save the model; the with-block closes (and flushes) the file
        # deterministically, which the bare open() call did not guarantee
        filename = "{}/{}.pkl".format(models_dir, name)
        with open(filename, "wb") as model_file:
            pickle.dump(model, model_file)
        print("[STATUS] saved model {}".format(filename))

# Run the train / evaluate / pickle pass over every entry in `models`.
train_and_save_model() 

[STATUS] training the model...
Model: LR Accuracy: 0.9136212624584718
[STATUS] saved model ../models/LR.pkl
Model: KNN Accuracy: 0.9086378737541528
[STATUS] saved model ../models/KNN.pkl
Model: DTC Accuracy: 0.893687707641196
[STATUS] saved model ../models/DTC.pkl
Model: RF Accuracy: 0.9700996677740864
[STATUS] saved model ../models/RF.pkl
Model: LDA Accuracy: 0.9119601328903655
[STATUS] saved model ../models/LDA.pkl
Model: NB Accuracy: 0.8205980066445183
[STATUS] saved model ../models/NB.pkl
Model: SVM Accuracy: 0.9019933554817275
[STATUS] saved model ../models/SVM.pkl
