In [None]:
import ast
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Read the pre-normalized dataset from disk
data = pd.read_csv("normalized_data.csv")

# Fit the encoder on the label column, then map every label to its integer id.
# LabelEncoder numbers classes in sorted order (presumably 0 = negative,
# 1 = positive for this dataset — confirm against label_encoder.classes_).
label_encoder = LabelEncoder()
label_encoder.fit(data['label'])
data['label_encoded'] = label_encoder.transform(data['label'])




In [None]:
# Prepare the vocabulary
# 'normalized_tokens' is parsed from its string form (presumably the repr of a
# list of tokens — confirm against the CSV). ast.literal_eval only accepts
# Python literals, so unlike eval() it cannot execute code embedded in the file.
all_tokens = [
    token
    for tokens in data['normalized_tokens'].apply(ast.literal_eval).tolist()
    for token in tokens
]
token_counts = Counter(all_tokens)
# Rank tokens by frequency; ids start at 1 because index 0 is reserved for padding.
vocab = {word: idx for idx, (word, _) in enumerate(token_counts.most_common(), start=1)}
vocab_size = len(vocab) + 1  # Add one for padding index

# Convert normalized_tokens into sequences of indices
def tokens_to_indices(tokens, vocab, max_len=100):
    """Map tokens to vocabulary ids and return a list of length ``max_len``.

    Tokens beyond ``max_len`` are dropped; shorter inputs are right-padded
    with 0. A token missing from ``vocab`` is also encoded as 0.
    """
    indices = []
    for token in tokens[:max_len]:
        indices.append(vocab.get(token, 0))
    padding = [0] * (max_len - len(indices))
    return indices + padding

max_len = 60  # Set maximum sequence length
# Parse each stored token list with ast.literal_eval (literals only) instead of
# eval(), so content of the CSV can never be executed as code.
data['token_indices'] = data['normalized_tokens'].apply(
    lambda x: tokens_to_indices(ast.literal_eval(x), vocab, max_len)
)

# Split into training and testing sets
X = np.array(data['token_indices'].tolist())
y = np.array(data['label_encoded'])
# 80/20 split; the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [None]:
# Define Dataset class
class SentimentDataset(Dataset):
    """Map-style dataset pairing token-index sequences with integer labels."""

    def __init__(self, inputs, labels):
        """Store ``inputs`` and ``labels`` as ``torch.long`` tensors."""
        self.inputs = torch.tensor(inputs, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(inputs, label)`` pair stored at position ``idx``."""
        sample, target = self.inputs[idx], self.labels[idx]
        return sample, target

# Create Dataloaders
# Wrap each split in a dataset, then batch it; only the training split is shuffled.
train_dataset, test_dataset = (
    SentimentDataset(X_train, y_train),
    SentimentDataset(X_test, y_test),
)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Define BiLSTM-CNN Hybrid Model
class BiLSTM_CNN(nn.Module):
    """Text classifier: embedding -> bidirectional LSTM -> parallel convolutions -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: LSTM hidden size per direction (the LSTM output is
            ``2 * hidden_size`` wide because it is bidirectional).
        num_classes: Number of output logits.
        kernel_sizes: Sequence of convolution heights, i.e. how many
            consecutive time steps each filter covers.
        num_filters: Output channels of each convolution.
        dropout: Dropout probability applied before the final linear layer.
            Defaults to 0.5, the value that used to be hard-coded.
        padding_idx: Optional index forwarded to ``nn.Embedding``; that row is
            initialised to zeros and receives no gradient. Defaults to
            ``None`` (no padding row), which matches the previous behaviour.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_classes, kernel_sizes, num_filters,
                 dropout=0.5, padding_idx=None):
        super().__init__()
        # Attribute names and creation order are kept stable so existing
        # state dicts still load and seeded initialisation is unchanged.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True, bidirectional=True)
        # Each kernel spans the full LSTM feature width, so it only slides along time.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, 2 * hidden_size)) for k in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape [batch_size, num_classes].

        ``x`` holds token indices with shape [batch_size, seq_len]; seq_len
        must be at least the largest kernel size for the convolutions to apply.
        """
        embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]
        lstm_out, _ = self.lstm(embedded)  # [batch_size, seq_len, 2 * hidden_size]
        feature_map = lstm_out.unsqueeze(1)  # Add channel dimension: [batch_size, 1, seq_len, 2 * hidden_size]
        pooled = []
        for conv in self.convs:
            # The width dimension collapses to 1 after the conv; drop it.
            activated = torch.relu(conv(feature_map)).squeeze(3)  # [batch_size, num_filters, seq_len - k + 1]
            pooled.append(activated.max(dim=2).values)  # Max pooling over time
        features = torch.cat(pooled, 1)  # [batch_size, len(kernel_sizes) * num_filters]
        features = self.dropout(features)
        return self.fc(features)



In [None]:
# Hyperparameters
embed_size, hidden_size = 128, 64
num_classes = len(label_encoder.classes_)  # one logit per encoded label
kernel_sizes = [3, 4, 5]
num_filters = 100

# Initialize model, loss, and optimizer
model = BiLSTM_CNN(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
    kernel_sizes=kernel_sizes,
    num_filters=num_filters,
)
criterion = nn.CrossEntropyLoss()
# AdamW with a small weight decay
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)



In [None]:
# Training loop
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=10):
    """Train ``model`` for ``epochs`` epochs, reporting progress after each one.

    A StepLR scheduler created here halves the optimizer's learning rate
    every 3 epochs. After each epoch the mean batch loss is printed and
    ``evaluate_model`` is run on ``test_loader``.
    """
    lr_schedule = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)
    for epoch_number in range(1, epochs + 1):
        model.train()  # evaluate_model leaves the model in eval mode
        running_loss = 0
        for batch_inputs, batch_labels in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_labels)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        lr_schedule.step()
        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch_number}/{epochs}, Loss: {mean_loss:.4f}")
        evaluate_model(model, test_loader)

In [None]:

def evaluate_model(model, test_loader):
    """Print and return the accuracy of ``model`` over ``test_loader``.

    The model is switched to eval mode (and left there) and predictions are
    made without gradient tracking. The predicted class of each sample is the
    argmax over dimension 1 of the model output.

    Returns:
        The value computed by ``accuracy_score`` over all batches, so callers
        can track it across epochs instead of parsing the printed line.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            # Move to CPU / numpy so scikit-learn can consume the values.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy



In [None]:
# Run training for 10 epochs; evaluation on test_loader happens after every epoch
train_model(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    epochs=10,
)

Epoch 1/10, Loss: 0.5551
Validation Accuracy: 0.7587
Epoch 2/10, Loss: 0.3770
Validation Accuracy: 0.7827
Epoch 3/10, Loss: 0.2110
Validation Accuracy: 0.7786
Epoch 4/10, Loss: 0.0814
Validation Accuracy: 0.7846
Epoch 5/10, Loss: 0.0477
Validation Accuracy: 0.7829
Epoch 6/10, Loss: 0.0337
Validation Accuracy: 0.7791
Epoch 7/10, Loss: 0.0219
Validation Accuracy: 0.7865
Epoch 8/10, Loss: 0.0175
Validation Accuracy: 0.7834
Epoch 9/10, Loss: 0.0159
Validation Accuracy: 0.7820
Epoch 10/10, Loss: 0.0134
Validation Accuracy: 0.7822


In [None]:
# Persist only the learned parameters (state dictionary)
state_dict = model.state_dict()
torch.save(state_dict, "bilstm_cnn_model.pth")

# Also serialize the complete model object (optional, but larger file size)
torch.save(model, "bilstm_cnn_model_full.pth")

print("Model saved successfully!")


Model saved successfully!


In [None]:
# Paths are named once so the confirmation message below always matches
# the files that were actually written.
vocab_path = "vocablstm.pkl"
label_encoder_path = "label_encoderlstm.pkl"

# Save the vocabulary
with open(vocab_path, "wb") as f:
    pickle.dump(vocab, f)

# Save the label encoder
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)

print(f"✅ Vocabulaire et label encoder sauvegardés sous '{vocab_path}' et '{label_encoder_path}'.")


✅ Vocabulaire et label encoder sauvegardés sous 'vocab.pkl' et 'label_encoder.pkl'.
