In [1]:
import numpy as np
import os

def parse_hmm(fname):
    f = open(fname)
    line=f.readline()
    while line[0]!='#':
        line=f.readline()
    f.readline()
    f.readline()
    f.readline()
    f.readline()
    seq = []
    extras = np.zeros([0,10])
    prob = np.zeros([0,20])
    line = f.readline()
    while line[0:2]!='//':
        lineinfo = line.split()
        seq.append(lineinfo[0])
        probs_ = [2**(-float(lineinfo[i])/1000) if lineinfo[i]!='*' else 0. for i in range(2,22)]
        prob = np.concatenate((prob,np.matrix(probs_)),axis=0)

        line = f.readline()
        lineinfo = line.split()
        extras_ = [2**(-float(lineinfo[i])/1000) if lineinfo[i]!='*' else 0. for i in range(0,10)]
        extras = np.concatenate((extras,np.matrix(extras_)),axis=0)

        line = f.readline()
        assert len(line.strip())==0

        line = f.readline()
    return (''.join(seq),prob,extras)

def process_data(data_dir):
    """
    Read all HMM files in the given directory and process them.

    Args:
        data_dir (str): The path to the directory containing the HMM files.

    Returns:
        list: A list of tuples, each tuple contains the sequence, the probability matrix, the log-probability matrix, and the extra features.
    """
    data = []
    for filename in os.listdir(data_dir):
        if filename.endswith('.txt'):
            filepath = os.path.join(data_dir, filename)
            seq, prob, extras = parse_hmm(filepath)
            data.append((seq, prob, extras))
    return data





In [None]:
def parse_pssm(fname):
    f = open(fname)
    # the 4th line should be the start of the PSSM data
    f.readline()
    f.readline()
    f.readline()
    seq = []
    lprob = np.zeros([0,20])
    prob = np.zeros([0,20])
    extra = np.zeros([0,2])
    line = f.readline()
    while len(line.strip())>0:
        lineinfo = line.split()
        seq.append(lineinfo[1])
        lprobs_ = [float(lineinfo[i]) for i in range(2,22)]
        lprob = np.concatenate((lprob,np.matrix(lprobs_)),axis=0)
        probs_ = [float(lineinfo[i])/100 for i in range(22,42)]
        prob = np.concatenate((prob,np.matrix(probs_)),axis=0)
        extras_ = [float(lineinfo[i]) for i in range(42,44)]
        extra = np.concatenate((extra,np.matrix(extras_)),axis=0)
        line = f.readline()

    return (''.join(seq),prob,lprob,extra)

def process_data_pssm(data_dir):
    """
    Read all PSSM files in the given directory and process them.

    Args:
        data_dir (str): The path to the directory containing the PSSM files.

    Returns:
        list: A list of tuples, each tuple contains the sequence, the probability matrix, the log-probability matrix, and the extra features.
    """
    data = []
    for filename in os.listdir(data_dir):
        if filename.endswith('.txt'):
            filepath = os.path.join(data_dir, filename)
            seq, prob, lprob, extras = parse_pssm(filepath)
            data.append((seq, prob, lprob, extras))
    return data





In [2]:
import os
# Example usage for parsing HMM profiles
benchmark_hmm_file = "/content/drive/MyDrive/Dataset/Benchmark_HMM"
novel_hmm_file = "/content/drive/MyDrive/Dataset/Novel_HMM"

# Example usage for parsing PSSM profiles
benchmark_pssm_file = "/content/drive/MyDrive/Dataset/Benchmark_PSSM"
novel_pssm_file = "/content/drive/MyDrive/Dataset/Novel_PSSM"




In [3]:
bench_HMM = process_data(benchmark_hmm_file)



In [None]:
novel_HMM = process_data(novel_hmm_file)


In [None]:
bench_PSSM = process_data_pssm(benchmark_pssm_file)
novel_PSSM = process_data_pssm(novel_pssm_file)

In [4]:
bench_HMM

[('MNWRSERIWIELITGSRKTSNLCWACILFLGSLGFLLVGTSSYLGRNLISLFPSQQILFFPQGIVMSFYGIAGLFISSYLWCTILWNVGSGYDRFDRKEGIVCIFRWGFPGRNRRIFFRFLMRDIRSIRMEVKEGIYPRRVLSIEIRSQGSIPLTRTDENFTPREIEQKAAELAYFLRVPIEVFRTKEWILSRHGVGNPRILFNTTDLSSEQLLIRSKHVSVRSYFRSLLFPVCG',
  matrix([[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 1., 0.],
          ...,
          [0., 0., 0., ..., 1., 0., 0.],
          [0., 1., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]]),
  matrix([[1. , 0. , 0. , ..., 0.5, 1. , 1. ],
          [1. , 0. , 0. , ..., 0.5, 1. , 1. ],
          [1. , 0. , 0. , ..., 0.5, 1. , 1. ],
          ...,
          [1. , 0. , 0. , ..., 0.5, 1. , 1. ],
          [1. , 0. , 0. , ..., 0.5, 1. , 1. ],
          [1. , 0. , 0. , ..., 0.5, 1. , 1. ]])),
 ('MQLHMNLPTSRIAAGASINVRPAPLLRTAAPKRVCKHIVRAENNPSTPPPSSPSPPPPPPTPAAPTVTEVMGFSGAPEIINGRLAMLGFVAALGAELSTGESVLTQLGDQPTLIALTFVLFSAASLIPAFARRKGDAMGPFTPDAEMTNGRFAMIGFAAMLVYEGIQGIALF',
  matrix([[0.     

In [39]:
# Separate sequences and matrices for benchmark HMM data
bench_HMM_sequences = [item[0] for item in bench_HMM]
bench_HMM_matrices = [item[1:] for item in bench_HMM]

In [None]:


# Separate sequences and matrices for novel HMM data
novel_HMM_sequences = [item[0] for item in novel_HMM]
novel_HMM_matrices = [item[1:] for item in novel_HMM]  # This will contain prob and extras matrices


In [None]:
# Separate sequences and matrices for benchmark PSSM data
bench_PSSM_sequences = [item[0] for item in bench_PSSM]
bench_PSSM_matrices = [item[1:] for item in bench_PSSM]  # This will contain prob, lprob, and extras matrices

# Separate sequences and matrices for novel PSSM data
novel_PSSM_sequences = [item[0] for item in novel_PSSM]
novel_PSSM_matrices = [item[1:] for item in novel_PSSM]  # This will contain prob, lprob, and extras matrices


In [48]:
print(bench_HMM_matrices[1])

(matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.16828751, 0.01213233, 0.03041654, ..., 0.0434952 , 0.        ,
         0.        ],
        [0.07489431, 0.0119653 , 0.        , ..., 0.09479797, 0.01970902,
         0.02999779],
        [0.        , 0.        , 0.        , ..., 0.06973371, 0.03175219,
         0.03472215]]), matrix([[1.        , 0.        , 0.        , ..., 0.5       , 1.        ,
         1.        ],
        [1.        , 0.        , 0.        , ..., 0.5       , 1.        ,
         1.        ],
        [1.        , 0.        , 0.        , ..., 0.5       , 1.        ,
         1.        ],
        ...,
        [1.        , 0.        , 0.        , ..., 0.00957176, 1.        ,
         1.        ],
        [1.

In [6]:
import numpy as np
import pandas as pd

data= pd.read_csv("/content/drive/MyDrive/Dataset/Benchmark_BinaryML.csv")


In [32]:
data.head()

Unnamed: 0,PDBid,Sequence,envelope,lumen,plastoglobule,stroma,thylakoid_membrane,Sum,Length
0,Q2QD41,MIFSTFEHILTHISFSVISIVITIQLITLLINETVGLYVSSEKGMI...,0,0,0,0,1,1,323
1,Q9LIK0,MSQSIQFSTPSHTPHLLHLPHSQFNRPLSSISFRRFPLTTIKYTSI...,0,0,0,1,0,1,596
2,Q41643,MALAQKVASRPAVASRRGVVVVRASVESRRAVLGGLLASTVVALTS...,0,0,0,0,1,1,202
3,Q8WHX1,MIGRLYMKKLKNLFLFLSSLCPVFPWISQISLVMPFGLYYGFLTAL...,1,0,0,0,0,1,1703
4,O19901,MEQYILKLENSINILAFLGALVSSLFYWAKLTYYKQIQVFSLPKFC...,0,0,0,0,1,1,293


In [51]:
bench_HMM_matrices = [item[0] for item in bench_HMM_matrices]

In [52]:
import numpy as np
from scipy.interpolate import interp2d

def resize_matrix(matrix, target_shape):
    rows, cols = matrix.shape
    x_old = np.arange(cols)
    y_old = np.arange(rows)
    f = interp2d(x_old, y_old, matrix, kind='linear')

    x_new = np.linspace(0, cols - 1, target_shape[1])
    y_new = np.linspace(0, rows - 1, target_shape[0])

    resized_matrix = f(x_new, y_new)
    return resized_matrix

# Assuming bench_HMM_matrices_probs is your list of probability matrices
# For example, bench_HMM_matrices_probs = [matrix_probs1, matrix_probs2, ...]

# Define the target shape you want for your matrices
target_shape = (20, 20)  # Adjust desired_number_of_rows as needed

# Resize each probability matrix in bench_HMM_matrices_probs to the target shape
bench_HMM_matrices_probs = [
    resize_matrix(matrix_probs, target_shape) for matrix_probs in bench_HMM_matrices
]

# Now, resized_bench_HMM_matrices_probs contains resized probability matrices with the same shape (target_shape)


`interp2d` is deprecated in SciPy 1.10 and will be removed in SciPy 1.13.0.

For legacy code, nearly bug-for-bug compatible replacements are
`RectBivariateSpline` on regular grids, and `bisplrep`/`bisplev` for
scattered 2D data.

In new code, for regular grids use `RegularGridInterpolator` instead.
For scattered data, prefer `LinearNDInterpolator` or
`CloughTocher2DInterpolator`.

For more details see
`https://scipy.github.io/devdocs/notebooks/interp_transition_guide.html`

  f = interp2d(x_old, y_old, matrix, kind='linear')
        `interp2d` is deprecated in SciPy 1.10 and will be removed in SciPy 1.13.0.

        For legacy code, nearly bug-for-bug compatible replacements are
        `RectBivariateSpline` on regular grids, and `bisplrep`/`bisplev` for
        scattered 2D data.

        In new code, for regular grids use `RegularGridInterpolator` instead.
        For scattered data, prefer `LinearNDInterpolator` or
        `CloughTocher2DInterpolator`.

        For more details see


In [59]:
bench_HMM_matrices_probs

[array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.46296086e-02, 0.00000000e+00, 0.00000000e+00, 1.27977588e-02,
         1.37353262e-02, 3.66755671e-03, 0.00000000e+00, 5.02008478e-01,
         2.37165010e-02, 9.68885471e-02, 1.21534709e-02, 0.00000000e+00,
         5.54770540e-02, 2.57666528e-02, 1.03446955e-02, 6.39001480e-03,
         3.95284443e-02, 1.82784518e-01, 0.00000000e+00, 0.00000000e+00],
        [2.06309469e-01, 4.63464544e-02, 0.00000000e+00, 0.00000000e+00,
         1.16416619e-01, 4.23372235e-02, 0.00000000e+00, 1.55556727e-01,
         0.00000000e+00, 1.26090570e-01, 1.97151065e-02, 1.13389229e-02,
         6.72815850e-03, 0.00000000e+00, 3.372670

In [60]:
import pandas as pd
import numpy as np


# Create a dictionary to map sequences to their corresponding HMM matrices
sequence_to_hmm = dict(zip(bench_HMM_sequences, bench_HMM_matrices_probs))

# Function to add HMM matrices based on sequence
def add_hmm_matrix(row):
    sequence = row['Sequence']
    if sequence in sequence_to_hmm:
        return sequence_to_hmm[sequence]
    else:
        return np.nan

# Apply the add_hmm_matrix function to add HMM matrices to the dataset
data['HMM_Matrices'] = data.apply(add_hmm_matrix, axis=1)

# Now, your_dataset_df contains the 'HMM_Matrices' column with the corresponding HMM matrices


In [61]:
data.head(5)

Unnamed: 0,PDBid,Sequence,envelope,lumen,plastoglobule,stroma,thylakoid_membrane,Sum,Length,HMM_Matrices
0,Q2QD41,MIFSTFEHILTHISFSVISIVITIQLITLLINETVGLYVSSEKGMI...,0,0,0,0,1,1,323,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,Q9LIK0,MSQSIQFSTPSHTPHLLHLPHSQFNRPLSSISFRRFPLTTIKYTSI...,0,0,0,1,0,1,596,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,Q41643,MALAQKVASRPAVASRRGVVVVRASVESRRAVLGGLLASTVVALTS...,0,0,0,0,1,1,202,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.1865972979121191,..."
3,Q8WHX1,MIGRLYMKKLKNLFLFLSSLCPVFPWISQISLVMPFGLYYGFLTAL...,1,0,0,0,0,1,1703,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,O19901,MEQYILKLENSINILAFLGALVSSLFYWAKLTYYKQIQVFSLPKFC...,0,0,0,0,1,1,293,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
import numpy as np

def extract_features(matrix_list):
    # Calculate the mean and standard deviation of each HMM matrix
    mean_values = []
    std_dev_values = []

    for matrix in matrix_list:
        if matrix is not None:
            # Calculate mean and standard deviation along both axes (rows and columns)
            mean = np.mean(matrix)
            std_dev = np.std(matrix)
            mean_values.append(mean)
            std_dev_values.append(std_dev)
        else:
            # Handle missing or None matrices
            mean_values.append(np.nan)
            std_dev_values.append(np.nan)

    return mean_values, std_dev_values


In [None]:
# Apply the extract_features function to your dataset
data['HMM_Matrices_Mean'], data['HMM_Matrices_StdDev'] = extract_features(data['HMM_Matrices'])

# Now, your_dataset_df contains two additional columns: 'HMM_Matrices_Mean' and 'HMM_Matrices_StdDev'


In [62]:
data['HMM_Matrices'].shape

(578,)

In [None]:
import numpy as np

# Assuming bench_HMM_matrices is a list of matrices of varying shapes (L, 20)

# Find the maximum sequence length in your dataset
max_length = max(matrix.shape[0] for matrix in bench_HMM_matrices)

# Initialize a list to store the padded matrices
padded_matrices = []

# Perform padding for each matrix
for matrix in bench_HMM_matrices:
    rows_to_pad = max_length - matrix.shape[0]
    if rows_to_pad > 0:
        padded_matrix = np.pad(matrix, ((0, rows_to_pad), (0, 0)), mode='constant', constant_values=0)
    else:
        padded_matrix = matrix  # No padding needed for this matrix
    padded_matrices.append(padded_matrix)

# Now, padded_matrices contains matrices with the same shape (max_length, 20)


In [None]:
data['Padded_Matrices']=padded_matrices

In [63]:

# Assuming 'HMM_Matrices' contains the preprocessed matrices, and you have one-hot encoded target variables
X = np.array(data['HMM_Matrices'].tolist())
y = data[['envelope', 'lumen', 'plastoglobule', 'stroma', 'thylakoid_membrane']].values




In [2]:
X.shape, y.shape

NameError: ignored

In [None]:
def create_and_train_model(X_train, y_train, X_test, y_test):
    model = Sequential()
    model.add(Flatten(input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(5, activation='sigmoid'))  # Output layer with 5 neurons for the 5 subcellular locations

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                        epochs=10, batch_size=32, verbose=2, callbacks=[early_stopping])

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
    print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

    return model

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten,  Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Define K-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Adjust the number of folds as needed

# Initialize lists to store results
all_test_accuracy = []

# Loop through K-fold splits
for fold, (train_indices, test_indices) in enumerate(kfold.split(X, np.argmax(y, axis=1))):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    # Create and train the model
    model = create_and_train_model(X_train, y_train, X_test, y_test)

    # Evaluate the model on the test set for this fold
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
    all_test_accuracy.append(test_accuracy)

    print(f'Fold {fold + 1} - Test Accuracy: {test_accuracy * 100:.2f}%')

# Calculate the average test accuracy across all folds
avg_test_accuracy = np.mean(all_test_accuracy)
print(f'Average Test Accuracy Across All Folds: {avg_test_accuracy * 100:.2f}%')

Epoch 1/10
15/15 - 6s - loss: 0.5778 - accuracy: 0.2706 - val_loss: 0.5266 - val_accuracy: 0.3190 - 6s/epoch - 433ms/step
Epoch 2/10
15/15 - 7s - loss: 0.4412 - accuracy: 0.4632 - val_loss: 0.5024 - val_accuracy: 0.3448 - 7s/epoch - 439ms/step
Epoch 3/10
15/15 - 5s - loss: 0.4015 - accuracy: 0.5498 - val_loss: 0.4883 - val_accuracy: 0.3879 - 5s/epoch - 334ms/step
Epoch 4/10
15/15 - 5s - loss: 0.3351 - accuracy: 0.6775 - val_loss: 0.4820 - val_accuracy: 0.3448 - 5s/epoch - 319ms/step
Epoch 5/10
15/15 - 7s - loss: 0.2690 - accuracy: 0.7684 - val_loss: 0.4727 - val_accuracy: 0.3103 - 7s/epoch - 442ms/step
Epoch 6/10
15/15 - 5s - loss: 0.1897 - accuracy: 0.8377 - val_loss: 0.4697 - val_accuracy: 0.3707 - 5s/epoch - 366ms/step
Epoch 7/10
15/15 - 6s - loss: 0.1298 - accuracy: 0.9026 - val_loss: 0.4602 - val_accuracy: 0.3879 - 6s/epoch - 415ms/step
Epoch 8/10
15/15 - 5s - loss: 0.0929 - accuracy: 0.9307 - val_loss: 0.4727 - val_accuracy: 0.3534 - 5s/epoch - 344ms/step
Epoch 9/10
15/15 - 5s - 

## optimizer


In [67]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [1]:
X_train.shape

NameError: ignored

In [69]:
import keras
from tensorflow.keras import layers
import tensorflow as tf

def build_model(hp):
    model1 = keras.Sequential([
        keras.layers.Conv2D(
            filters=hp.Int("conv1filter", min_value=30, max_value=200, step=10),
            kernel_size=hp.Choice("conv1kernel",values=[3,4,]),
            activation ='relu',
            input_shape=(462, 3707, 20)
        ),
        keras.layers.MaxPooling2D(pool_size=(2,2)),
        keras.layers.Conv2D(
            filters=hp.Int("conv2filter", min_value=30, max_value=200, step=10),
            kernel_size=hp.Choice("conv2kernel",values=[3,4]),
            activation='relu'
        ),
        keras.layers.MaxPooling2D(pool_size=(2,2)),
        keras.layers.Flatten(),
        keras.layers.Dense(
            units=hp.Int("units", min_value=32, max_value=100, step=10),
            activation="relu"
        ),
        keras.layers.Dense(
            units=5,
            activation="softmax"
        )
     ])
    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    model1.compile(
         optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         metrics=["accuracy"],
    )
    return model1

In [73]:
pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.5-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.5/129.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras_tuner)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Collecting namex (from keras-core->keras_tuner)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, kt-legacy, keras-core, keras_tuner
Successfully installed keras-core-0.1.7 keras_tuner-1.4.5 kt-legacy-1.0.5 namex-0.0.7


In [74]:
import keras_tuner
from keras_tuner import RandomSearch
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_accuracy",
    max_trials=4,
    directory="HDA",
    project_name="0",
)

Using TensorFlow backend


In [None]:
tuner.search(X_train, y_train, epochs=4, validation_split=0.2)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
110               |110               |conv1filter
4                 |4                 |conv1kernel
130               |130               |conv2filter
3                 |3                 |conv2kernel
72                |72                |units
0.0001444         |0.0001444         |lr



In [None]:
models = tuner.get_best_models(num_models=2)
model1 = models[0]
model1.summary()

In [None]:
model1=model1.fit(X_train,y_train,epochs=9,validation_split=0.2)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(
    monitor= 'val_loss',  # Monitor the validation loss
    patience= 5,           # Number of epochs with no improvement after which training will be stopped
    restore_best_weights= True  # Restore the model weights to the best epoch
)

In [None]:
# Train the model
history= model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1, callbacks = [early_stopping])

# Evaluate the model on the test set
test_loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

In [None]:
y_pred = model.predict(X_test)

In [None]:
model1.save('speaker_detection_model.h5')

 ## Plots

In [None]:
epochs = list(range(14))
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

plt.plot(epochs, acc, label='train accuracy')
plt.plot(epochs, val_acc, label='val accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
#loss
loss = history.history['loss']
val_loss = history.history['val_loss']

plt.plot(epochs, loss, label='train loss')
plt.plot(epochs, val_loss, label='val loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
import numpy as np

# Assuming y_pred is one-hot encoded predictions
y_pred_classes = np.argmax(y_pred, axis=1)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_classes))