# In [None]:
import os
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Input locations: the CSV holding the labels and the folder holding the transcripts
label_file = './CSV/final_data.csv'
transcript_dir = './transcript/'

# Function to preprocess transcript data and extract labels
def process_data(transcript_dir, label_file, participant_ids=range(300, 494)):
    """Collect each participant's spoken text and the matching binary label.

    For every ID in ``participant_ids`` the function looks for
    ``<transcript_dir>/<id>_TRANSCRIPT.csv`` (tab-separated, with ``speaker``
    and ``value`` columns) and for a row of ``label_file`` whose
    ``Participant_ID`` equals that ID. IDs missing either the transcript file
    or the label row are skipped silently.

    Args:
        transcript_dir: Directory containing the ``<id>_TRANSCRIPT.csv`` files.
        label_file: Path of a CSV with ``Participant_ID`` and ``PHQ8_Binary``
            columns.
        participant_ids: Iterable of participant IDs to try, in output order.
            Defaults to ``range(300, 494)``.

    Returns:
        A tuple ``(X, y)`` of equal-length lists. ``X[i]`` is the
        space-joined ``value`` text of all rows whose speaker is
        ``"Participant"`` (missing values contribute an empty string), and
        ``y[i]`` is the ``PHQ8_Binary`` value of the same participant.
    """
    X = []  # One concatenated transcript string per participant
    y = []  # Label of the participant at the same position in X

    # Load labels once and index them by participant ID so the loop below
    # does not rescan the whole table for every ID.
    label_data = pd.read_csv(label_file)
    label_by_id = {}
    for known_id, label in zip(label_data['Participant_ID'],
                               label_data['PHQ8_Binary']):
        # setdefault keeps the first row when an ID is listed more than once.
        label_by_id.setdefault(known_id, label)

    # Iterate through transcript files
    for participant_id in participant_ids:
        if participant_id not in label_by_id:
            continue

        # os.path.join copes with transcript_dir with or without a trailing
        # separator (plain formatting produced './transcript//...').
        filename = os.path.join(transcript_dir,
                                f'{participant_id}_TRANSCRIPT.csv')
        if not os.path.exists(filename):
            continue

        # Read transcript data
        data = pd.read_csv(filename, sep='\t')

        # Keep only what the participant said; NaN cells become ''
        is_participant = data['speaker'] == "Participant"
        participant_values = data.loc[is_participant, "value"].fillna('')
        X.append(' '.join(participant_values.astype(str)))

        y.append(label_by_id[participant_id])

    return X, y

# Process data
X, y = process_data(transcript_dir, label_file)
if not X:
    # Fail early with a clear message instead of an opaque tokenizer error.
    raise ValueError(
        f'No labelled transcripts found in {transcript_dir!r} '
        f'using labels from {label_file!r}'
    )

# Tokenize the data using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input data.
# padding='max_length' pads every transcript to exactly max_seq_len tokens.
# padding=True would only pad to the longest transcript in this corpus, so the
# sequence length could end up shorter than the 512 positions the CNN expects.
max_seq_len = 512
tokenized_data = tokenizer(
    X,
    padding='max_length',
    truncation=True,
    max_length=max_seq_len,
    return_tensors="pt",
)

# Load pre-trained BERT model from directory
model_dir = 'model_bert'
config = BertConfig.from_pretrained(model_dir)
model = BertModel.from_pretrained(model_dir, config=config)
model.eval()  # Inference only: make sure dropout is disabled

# Perform the forward pass in small batches. Running every transcript through
# BERT at once (and asking for all intermediate hidden states) keeps the
# activations of the whole corpus in memory at the same time.
bert_batch_size = 8
hidden_state_chunks = []
with torch.no_grad():
    for start in range(0, len(X), bert_batch_size):
        batch = {
            name: tensor[start:start + bert_batch_size]
            for name, tensor in tokenized_data.items()
        }
        outputs = model(**batch)
        # last_hidden_state is the output of the final encoder layer, i.e.
        # the same tensor as hidden_states[-1], without storing every layer.
        hidden_state_chunks.append(outputs.last_hidden_state)

# Last layer hidden states for the whole corpus: (samples, tokens, hidden size)
hidden_states = torch.cat(hidden_state_chunks, dim=0)

# Convert the tensor to NumPy array
features = hidden_states.numpy()

# Convert the features to TensorFlow tensor (half precision to save memory)
X_bert_tensor = tf.convert_to_tensor(features, dtype=tf.float16)

# Split the data into train and test sets.
# train_test_split selects rows with NumPy index arrays, which eager
# TensorFlow tensors do not accept as indices, so the row indices are split
# instead and the tensor rows are picked with tf.gather. The partition depends
# only on the sample count and random_state, so it is the same split.
sample_indices = np.arange(len(y))
train_idx, test_idx = train_test_split(sample_indices, test_size=0.2, random_state=42)
X_train = tf.gather(X_bert_tensor, train_idx)
X_test = tf.gather(X_bert_tensor, test_idx)
y_all = np.asarray(y)
y_train = y_all[train_idx]
y_test = y_all[test_idx]

# Encode the labels. For a two-class target LabelBinarizer produces a single
# 0/1 column of shape (n_samples, 1) rather than a true one-hot matrix; the
# *_onehot names are kept for compatibility with the rest of the script.
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)

# Define CNN model.
# The input shape is taken from the extracted features instead of being
# hard-coded, so the network always matches the (tokens, hidden size) layout
# of X_train.
seq_len, hidden_size = X_train.shape[1], X_train.shape[2]
model_cnn = tf.keras.Sequential([
    # Explicit Input object; passing input_shape= to a layer is deprecated
    # in recent Keras releases.
    tf.keras.Input(shape=(seq_len, hidden_size)),
    tf.keras.layers.Reshape(target_shape=(seq_len, hidden_size, 1)),  # Add a channel dimension
    tf.keras.layers.Conv2D(16, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit: matches the one-column 0/1 labels (adjust as needed)
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; 20% of the training rows are held out for validation
model_cnn.fit(X_train, y_train_onehot, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model_cnn.evaluate(X_test, y_test_onehot)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Save the model
model_cnn.save('bert_cnn_model.h5')
