In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel
from gensim.models import KeyedVectors

# Pre-trained DistilBERT checkpoint shared by the tokenizer and the encoder,
# so that token ids produced by one are valid inputs for the other.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = DistilBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Pre-trained word vectors in word2vec text format (binary=False), used for
# the feature-based (averaged word vector) encoding.
word_vectors = KeyedVectors.load_word2vec_format(
    'path_to_pretrained_vectors.vec',
    binary=False,
)

# Preprocessed texts and their labels. The trailing `...` entries are
# placeholders and must be replaced with real data before running.
texts = ["example positive text", "example negative text", ...]
labels = [1, 0, ...]  # 1 for positive, 0 for negative

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Define a function to generate feature-based embeddings using pre-trained word vectors
def generate_feature_based_embedding(text, vectors=None):
    """Return the mean word vector of the in-vocabulary words of *text*.

    The text is split on whitespace; words missing from the vocabulary are
    skipped. When no word of the text is in the vocabulary (including an
    empty text) a zero vector is returned instead, so every text still maps
    to a vector of the same length.

    Args:
        text: Whitespace-separated, preprocessed text.
        vectors: Word-vector lookup supporting ``word in vectors`` and
            ``vectors[word]``. Defaults to the module-level ``word_vectors``.

    Returns:
        A 1-D NumPy array whose length is the word-vector dimensionality.
    """
    if vectors is None:
        # Resolved at call time so the module-level vectors can be (re)loaded
        # after this function has been defined.
        vectors = word_vectors
    embeddings = [vectors[word] for word in text.split() if word in vectors]
    if embeddings:
        return np.mean(embeddings, axis=0)
    # Size the fallback from the loaded vectors instead of assuming 300, so
    # the rows stay rectangular when they are stacked with np.array(...).
    # 300 is kept only as a fallback for lookups without `vector_size`.
    return np.zeros(getattr(vectors, "vector_size", 300))

# Generate feature-based embeddings for training and testing data.
# Each row is the averaged word vector of one text, so both arrays are 2-D.
train_feature_based_embeddings = np.array(
    [generate_feature_based_embedding(text) for text in train_texts]
)
test_feature_based_embeddings = np.array(
    [generate_feature_based_embedding(text) for text in test_texts]
)

# Tokenize and generate transformer-based embeddings
train_inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():  # DistilBERT is only used as a frozen feature extractor
    # last_hidden_state is 3-D: (batch, seq_len, hidden). Keep only the vector
    # at position 0 (the [CLS] token) so every text becomes one fixed-size
    # row. The raw 3-D tensor cannot be concatenated with the 2-D word-vector
    # features, and its seq_len changes with the padding of each batch.
    train_outputs = model(**train_inputs).last_hidden_state[:, 0, :]
transformer_embeddings = train_outputs.numpy()

# Concatenate feature-based embeddings and transformer embeddings along the
# feature axis: one row per text, word-vector features first.
combined_embeddings = np.concatenate(
    (train_feature_based_embeddings, transformer_embeddings), axis=1
)

# Define and train an MLP model using PyTorch
class MLP(nn.Module):
    """Three-layer perceptron emitting one sigmoid probability per input row.

    Layout: Linear(input_dim -> hidden_dim1) -> ReLU
            -> Linear(hidden_dim1 -> hidden_dim2) -> ReLU
            -> Linear(hidden_dim2 -> 1) -> Sigmoid.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2):
        """Create the three linear layers and the output sigmoid.

        Args:
            input_dim: Number of features in each input row.
            hidden_dim1: Width of the first hidden layer.
            hidden_dim2: Width of the second hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map inputs of shape (..., input_dim) to probabilities of shape (..., 1)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return self.sigmoid(logits)

# Size the MLP input from the actual training matrix instead of assuming
# 300-dimensional word vectors, so it always matches what the model is fed.
input_dim = combined_embeddings.shape[1]
hidden_dim1 = 128
hidden_dim2 = 64
mlp_model = MLP(input_dim, hidden_dim1, hidden_dim2)

# Convert data to PyTorch tensors
train_embeddings = torch.tensor(combined_embeddings, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.float32)

# Define loss function and optimizer.
# BCELoss expects probabilities, which the MLP's final sigmoid provides.
criterion = nn.BCELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)

# Train the MLP model (full-batch gradient descent: one step per epoch)
num_epochs = 10
mlp_model.train()  # explicit mode switch; matters once dropout/batch-norm is added
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = mlp_model(train_embeddings)
    # Labels are reshaped to (N, 1) to match the model's output shape.
    loss = criterion(outputs, train_labels.view(-1, 1))
    loss.backward()
    optimizer.step()

# Evaluate the model on the test data.
# test_feature_based_embeddings was already computed alongside the training
# features, so it is reused here rather than recomputed.
test_inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    # last_hidden_state is 3-D: (batch, seq_len, hidden). Keep only the vector
    # at position 0 (the [CLS] token) so each text becomes one fixed-size row
    # that can be concatenated with the 2-D word-vector features. The training
    # embeddings must be reduced the same way for the feature layout to match.
    test_outputs = model(**test_inputs).last_hidden_state[:, 0, :]
test_transformer_embeddings = test_outputs.numpy()

test_combined_embeddings = np.concatenate(
    (test_feature_based_embeddings, test_transformer_embeddings), axis=1
)
test_embeddings = torch.tensor(test_combined_embeddings, dtype=torch.float32)

mlp_model.eval()
with torch.no_grad():
    test_predictions = mlp_model(test_embeddings)

# Convert probabilities to binary predictions and flatten the (N, 1) column to
# 1-D so it has the same shape as the label list.
rounded_predictions = np.round(test_predictions.numpy()).ravel()
accuracy = accuracy_score(test_labels, rounded_predictions)
print("Test Accuracy:", accuracy)