In [None]:
import numpy as np
import os

# Function to load embeddings from saved files
def load_embeddings(directory):
    """Load and concatenate the sharded embedding files stored in ``directory``.

    Shards are expected to be named ``sentence_embeddings_<start>_<end>.npy`` and
    ``text_embeddings_<start>_<end>.npy``. The files of each kind are ordered by
    their numeric ``(start, end)`` range (not lexicographically), loaded with
    ``np.load`` and concatenated along axis 0.

    Args:
        directory: Path of the directory that contains the ``.npy`` shards.

    Returns:
        Tuple ``(all_sentence_embeddings, all_text_embeddings)`` holding the
        concatenated sentence and text arrays.

    Raises:
        ValueError: If a shard name does not end in two integers, if no shards
            of either kind are found, or if the sentence and text shards do
            not cover the same ranges.
    """
    def extract_range(filename):
        """Return the numeric ``(start, end)`` pair encoded at the end of a shard name."""
        # splitext removes exactly the extension; str.rstrip('.npy') would strip
        # any trailing run of the characters '.', 'n', 'p', 'y' instead.
        stem, _extension = os.path.splitext(filename)
        start, end = map(int, stem.split('_')[-2:])
        return start, end

    def sorted_shards(filenames, prefix):
        """Return the ``.npy`` shard names starting with ``prefix``, ordered by range."""
        shards = [f for f in filenames if f.startswith(prefix) and f.endswith('.npy')]
        return sorted(shards, key=extract_range)

    # List the directory once and reuse the listing for both shard kinds
    filenames = os.listdir(directory)
    sentence_embeddings_files = sorted_shards(filenames, 'sentence_embeddings_')
    text_embeddings_files = sorted_shards(filenames, 'text_embeddings_')

    if not sentence_embeddings_files or not text_embeddings_files:
        raise ValueError(
            f"No sentence/text embedding shards found in {directory!r} "
            f"({len(sentence_embeddings_files)} sentence, {len(text_embeddings_files)} text)"
        )

    # zip() below would silently drop or mispair shards, so make sure both kinds
    # cover exactly the same ranges before loading anything.
    sentence_ranges = [extract_range(f) for f in sentence_embeddings_files]
    text_ranges = [extract_range(f) for f in text_embeddings_files]
    if sentence_ranges != text_ranges:
        unmatched = sorted(set(sentence_ranges) ^ set(text_ranges))
        raise ValueError(
            f"Sentence and text embedding shards in {directory!r} do not pair up; "
            f"unmatched ranges: {unmatched}"
        )

    sentence_embeddings_list = []
    text_embeddings_list = []

    for sentence_file, text_file in zip(sentence_embeddings_files, text_embeddings_files):
        print(f"Loading sentence embeddings from file: {sentence_file}")
        sentence_embeddings_list.append(np.load(os.path.join(directory, sentence_file)))

        print(f"Loading text embeddings from file: {text_file}")
        text_embeddings_list.append(np.load(os.path.join(directory, text_file)))

    all_sentence_embeddings = np.concatenate(sentence_embeddings_list, axis=0)
    all_text_embeddings = np.concatenate(text_embeddings_list, axis=0)

    return all_sentence_embeddings, all_text_embeddings

# Read every embedding shard from the Kaggle input directory, then report the
# shapes of the two concatenated arrays.
all_sentence_embeddings, all_text_embeddings = load_embeddings(
    '/kaggle/input/embeddings/embeddings'
)
print(
    "All Sentence Embeddings Shape:",
    all_sentence_embeddings.shape,
)
print(
    "All Text Embeddings Shape:",
    all_text_embeddings.shape,
)



In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import os

# Read the dataset CSV and keep the `joke_type` column as the label array
labels_df = pd.read_csv(
    '/kaggle/input/offensive-humor-detection/Offensive_Humor_detection.csv'
)
joke_type_column = labels_df['joke_type']
labels = joke_type_column.values
print("Labels Shape:", labels.shape)

In [None]:
# Read the per-sample dimensions off the loaded sentence embeddings
sentence_shape = all_sentence_embeddings.shape
max_sentences = sentence_shape[1]  # Number of sentences per sample
embedding_dim = sentence_shape[2]  # Embedding dimension

# One value per line: sentences per sample, then embedding width
print(max_sentences, embedding_dim, sep="\n")

In [None]:
# Make sure labels and embeddings are NumPy arrays.
# np.asarray returns an existing ndarray as-is, whereas np.array copies by
# default; that would needlessly duplicate the (large) embedding arrays.
labels = np.asarray(labels)
all_sentence_embeddings = np.asarray(all_sentence_embeddings)
all_text_embeddings = np.asarray(all_text_embeddings)

In [None]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from sklearn.model_selection import train_test_split

# Define the necessary parameters
num_sentences = 6  # Replace with your actual number of sentences per sample
embedding_dim = 768  # Dimension of the BERT embeddings
num_classes = 4  # Number of classes for classification


def _dense_branch(inputs, hidden_units, output_units, dropout_rate=0.2):
    """Stack Dense(relu) + Dropout blocks on ``inputs`` and finish with a Dense(relu) layer.

    Args:
        inputs: Keras tensor the branch is attached to.
        hidden_units: Sizes of the hidden Dense layers; each one is followed by Dropout.
        output_units: Size of the final Dense layer of the branch.
        dropout_rate: Dropout rate applied after every hidden Dense layer.

    Returns:
        The Keras tensor produced by the final Dense layer.
    """
    x = inputs
    for units in hidden_units:
        x = Dense(units, activation='relu')(x)
        x = Dropout(dropout_rate)(x)
    return Dense(output_units, activation='relu')(x)


# Input shape for sentence embeddings
input_sentences = Input(shape=(num_sentences, embedding_dim))

# One independent branch (128 -> 64 -> 20) per sentence position.
# The index gets a real name: `_` is IPython's "last output" variable and
# should not be rebound at notebook top level.
sentence_outputs = []
for sentence_idx in range(num_sentences):
    sentence_output = _dense_branch(
        input_sentences[:, sentence_idx, :], hidden_units=(128, 64), output_units=20
    )
    sentence_outputs.append(sentence_output)

# Input shape for text embeddings
input_text = Input(shape=(embedding_dim,))

# Three parallel branches (128 -> 128 -> 60) that all read the same text embedding
num_text_branches = 3
text_outputs = []
for _branch_idx in range(num_text_branches):
    text_output = _dense_branch(input_text, hidden_units=(128, 128), output_units=60)
    text_outputs.append(text_output)

# Concatenate the outputs of sentence and text branches
concatenated = Concatenate()(sentence_outputs + text_outputs)

# Define a fully connected layer
x = Dense(256, activation='relu')(concatenated)
x = Dropout(0.2)(x)

# Classification head: 128 -> 64 -> softmax over num_classes
sequential_layers = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Connect the fully connected layer with the sequential layers
output = sequential_layers(x)

# Build the model
model = Model(inputs=[input_sentences, input_text], outputs=output)

# Compile the model
# NOTE(review): sparse_categorical_crossentropy expects integer class ids in
# [0, num_classes) - confirm that the `joke_type` labels are encoded that way.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


In [None]:
# Expected dataset dimensions (replace with the values of your own data):
# number of samples, sentences per sample, and embedding width.
num_samples, num_sentences, embedding_dim = 21860, 6, 768

# Training inputs and targets, taken from the arrays loaded in earlier cells
X_sentence_embeddings, X_text_embeddings, y_labels = (
    all_sentence_embeddings,  # expected shape: (num_samples, num_sentences, embedding_dim)
    all_text_embeddings,      # expected shape: (num_samples, embedding_dim)
    labels,                   # expected shape: (num_samples,)
)

# Fail fast if the data does not have the expected shape
assert X_sentence_embeddings.shape == (num_samples, num_sentences, embedding_dim)
assert X_text_embeddings.shape == (num_samples, embedding_dim)
assert y_labels.shape[0] == num_samples

# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split reproducible.
(
    X_train_sentences,
    X_val_sentences,
    X_train_text,
    X_val_text,
    y_train,
    y_val,
) = train_test_split(
    X_sentence_embeddings,
    X_text_embeddings,
    y_labels,
    test_size=0.2,
    random_state=42,
)

# Fit the two-input model, evaluating on the held-out split after every epoch
history = model.fit(
    x=[X_train_sentences, X_train_text],
    y=y_train,
    validation_data=([X_val_sentences, X_val_text], y_val),
    epochs=10,      # number of passes over the training split
    batch_size=32,  # samples per gradient update
    verbose=1,      # print progress during training
)
