In [1]:
import pandas as pd
import os

X = []  # One string per transcript: all participant utterances joined by spaces
y = []  # PHQ8_Binary label for the transcript at the same index in X

label_filename = "./CSV/final_data.csv"  # File containing labels

# The label table does not depend on the transcript ID, so read it once
# instead of re-parsing the same CSV on every loop iteration.
label_data = pd.read_csv(label_filename) if os.path.exists(label_filename) else None

# Iterate through transcript IDs from 300 to 493
for transcript_id in range(300, 494):
    filename = f'./transcript/{transcript_id}_TRANSCRIPT.csv'

    # Skip IDs with no transcript file (or when the label file is missing)
    if label_data is None or not os.path.exists(filename):
        continue

    # Find the label row first: a transcript without a label must not be
    # added to X, otherwise X and y end up with different lengths/ordering.
    label_row = label_data[label_data['Participant_ID'] == transcript_id]
    if label_row.empty:
        continue

    # Transcript files are tab-separated
    data = pd.read_csv(filename, sep='\t')

    # Keep only the participant's utterances; missing values become ''
    participant_values = data.loc[data['speaker'] == "Participant", "value"].fillna('')

    # Join all utterances into a single space-separated string
    joined_values = ' '.join(participant_values.astype(str).tolist())

    # Append text and label together so the two lists stay index-aligned
    label = label_row.iloc[0]['PHQ8_Binary']
    X.append(joined_values)
    y.append(label)
       
   

# (Nothing is printed here.) X holds the concatenated participant text per transcript; y holds the PHQ8_Binary labels.



In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the transcripts as a test set; the fixed seed makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
import os
import torch
import tensorflow as tf
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

# Checkpoint directory: the tokenizer, config and model weights below are all
# loaded from this one location.
model_dir = 'models'

# Restore the model configuration and the matching tokenizer
config = RobertaConfig.from_pretrained(model_dir)
tokenizer = RobertaTokenizer.from_pretrained(model_dir)

# Restore the fine-tuned sequence-classification model using that configuration
model = RobertaForSequenceClassification.from_pretrained(
    model_dir,
    config=config,
)

# Tokenise the training transcripts.
# Pad/truncate every sequence to exactly 512 tokens: the CNN defined in this
# notebook declares a fixed (512, 768) input, whereas padding=True would only
# pad to the longest transcript in the batch and leave the truncation limit
# to whatever default the saved tokenizer carries.
encoded_data = tokenizer(
    X_train,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Inference only: switch off dropout and skip gradient bookkeeping
model.eval()
with torch.no_grad():
    # output_hidden_states=True makes the model return every layer's activations
    outputs = model(**encoded_data, output_hidden_states=True)

# Tuple of per-layer hidden states
hidden_states = outputs.hidden_states

# Activations of the final layer, one vector per token
last_hidden_states = hidden_states[-1]

# Convert the tensor to a NumPy array
features = last_hidden_states.numpy()

# Hand the features to TensorFlow in half precision
X3_tensor = tf.convert_to_tensor(features, dtype=tf.float16)

# Build the CNN layer by layer
model2 = tf.keras.Sequential()
# Treat each (512, 768) token-feature matrix as a single-channel image
model2.add(tf.keras.layers.Reshape(target_shape=(512, 768, 1), input_shape=(512, 768,)))
model2.add(tf.keras.layers.Conv2D(16, kernel_size=3, activation='relu'))
model2.add(tf.keras.layers.MaxPooling2D(pool_size=2))
# Collapse the pooled feature maps into one vector per sample
model2.add(tf.keras.layers.Flatten())
model2.add(tf.keras.layers.Dense(64, activation='relu'))
# 100-dimensional output (adjust as needed)
model2.add(tf.keras.layers.Dense(100))

# Run the RoBERTa features through the (untrained) CNN
output = model2(X3_tensor)

# Keep the CNN output as a NumPy array
features3 = output.numpy()
# Not the last expression of the cell, so this shape is not displayed
features3.shape

# Persist the model in HDF5 format
model2.save('roberta_cnn.h5')


  from .autonotebook import tqdm as notebook_tqdm




In [4]:
from sklearn.preprocessing import LabelBinarizer
import numpy as np

In [5]:
# CNN classifier over (512, 768) token-feature matrices.
# The layers are listed first, then wrapped in a Sequential model.
cnn_layers = [
    # Add a trailing channel axis so Conv2D can consume the matrix
    tf.keras.layers.Reshape(target_shape=(512, 768, 1), input_shape=(512, 768,)),
    tf.keras.layers.Conv2D(16, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit: one probability per sample
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model2 = tf.keras.Sequential(cnn_layers)

In [6]:
# Both splits use the same hold-out fraction and seed
split_options = dict(test_size=0.2, random_state=42)

# First split: hold out 20% of all samples as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Second split: hold out 20% of the remaining training samples for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [9]:
# Show which label values occur in each split (train, validation, test)
for split_labels in (y_train, y_val, y_test):
    print(np.unique(split_labels))

[0 1]
[0 1]
[0 1]


In [10]:
from sklearn.preprocessing import LabelBinarizer

# Learn the label encoding from the training labels only ...
lb = LabelBinarizer()
lb.fit(y_train)

# ... then apply that same encoding to every split
y_train_onehot, y_val_onehot, y_test_onehot = (
    lb.transform(split_labels) for split_labels in (y_train, y_val, y_test)
)

In [21]:
def _roberta_features(texts, batch_size=8, max_length=512):
    """Return last-layer RoBERTa hidden states for ``texts``.

    Uses the notebook-level ``tokenizer`` and ``model``. Every text is padded
    or truncated to ``max_length`` tokens so the result has a fixed shape of
    (len(texts), max_length, hidden_size), stored as float16.
    Texts are processed ``batch_size`` at a time to limit peak memory.
    """
    model.eval()  # inference mode: disable dropout
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            hidden = model(**encoded, output_hidden_states=True).hidden_states[-1]
            chunks.append(hidden.numpy().astype(np.float16))
    if not chunks:
        # Empty split: return an empty array with the expected trailing shape
        return np.zeros((0, max_length, model.config.hidden_size), dtype=np.float16)
    return np.concatenate(chunks, axis=0)


# X3_tensor only holds features for the first split's X_train, in that order,
# so slicing it by position does not line up with the re-shuffled
# y_train / y_val labels and leaves nothing for the test set. Extract the
# features for each split directly from its own texts instead, so that row i
# of every feature array corresponds to label i of the matching label array.
# The trailing channel axis is kept as before.
X3_train = np.expand_dims(_roberta_features(X_train), axis=-1)
X3_val = np.expand_dims(_roberta_features(X_val), axis=-1)
X3_test = np.expand_dims(_roberta_features(X_test), axis=-1)

In [None]:
# X3_tensor was built from the training texts, so its last rows are not test
# samples. Encode the held-out X_test texts themselves so the features match
# y_test row for row. Sequences are padded/truncated to 512 tokens to match
# the CNN's (512, 768) input.
model.eval()
with torch.no_grad():
    test_encoded = tokenizer(
        X_test,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    test_hidden = model(**test_encoded, output_hidden_states=True).hidden_states[-1]

# Same container and dtype as X3_tensor: a float16 TensorFlow tensor
X3_test = tf.convert_to_tensor(test_hidden.numpy(), dtype=tf.float16)

In [25]:
# Sanity check: each split should have as many feature rows as labels
split_pairs = (
    (X3_train, y_train_onehot),
    (X3_val, y_val_onehot),
    (X3_test, y_test_onehot),
)
for split_features, split_labels in split_pairs:
    print(len(split_features), len(split_labels))

108 108
28 28
34 34


In [27]:
# The network ends in a single sigmoid unit and the labels are 0/1, so the
# matching loss is binary cross-entropy. Categorical cross-entropy expects a
# probability distribution over several output units and is degenerate with
# only one.
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 50 epochs in mini-batches of 32, evaluating on the validation
# split after every epoch
model2.fit(
    x=X3_train,
    y=y_train_onehot,
    validation_data=(X3_val, y_val_onehot),
    batch_size=32,
    epochs=50,
)