In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset

# First Channel - BERT

# Define the paths to the train and test data files
train_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/500/train_400.csv'
test_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/500/test_100.csv'

# Define the dataset class
class CustomDataset(Dataset):
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

# Define the BERT model and training parameters
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer_bert = torch.optim.AdamW(model_bert.parameters(), lr=2e-5)
criterion_bert = torch.nn.CrossEntropyLoss()

# Create the data loaders
train_dataset_bert = CustomDataset(train_data_path)
test_dataset_bert = CustomDataset(test_data_path)
train_loader_bert = DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=8)

# Training loop for BERT
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_bert.to(device)

for epoch in range(10):
    train_loss_bert = 0.0
    train_acc_bert = 0.0

    model_bert.train()
    for batch in train_loader_bert:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer_bert.zero_grad()

        outputs = model_bert(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer_bert.step()

        train_loss_bert += loss.item() * input_ids.size(0)
        _, preds = torch.max(logits, dim=1)
        train_acc_bert += accuracy_score(labels.cpu(), preds.cpu()) * input_ids.size(0)

    train_loss_bert = train_loss_bert / len(train_dataset_bert)
    train_acc_bert = train_acc_bert / len(train_dataset_bert)

    # Evaluation on the test set
    model_bert.eval()
    test_loss_bert = 0.0
    test_acc_bert = 0.0

    with torch.no_grad():
        for batch in test_loader_bert:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model_bert(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            test_loss_bert += loss.item() * input_ids.size(0)
            _, preds = torch.max(logits, dim=1)
            test_acc_bert += accuracy_score(labels.cpu(), preds.cpu()) * input_ids.size(0)

        test_loss_bert = test_loss_bert / len(test_dataset_bert)
        test_acc_bert = test_acc_bert / len(test_dataset_bert)

    print(f'Epoch {epoch + 1}/{10}:')
    print(f'BERT - Train Loss: {train_loss_bert:.4f}, Train Accuracy: {train_acc_bert:.4f}')
    print(f'BERT - Test Loss: {test_loss_bert:.4f}, Test Accuracy: {test_acc_bert:.4f}')

    # Perform PCA dimensionality reduction on BERT embeddings
    train_embeddings_bert = []
    for batch in train_loader_bert:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model_bert.base_model(input_ids, attention_mask=attention_mask)
            embeddings = outputs.pooler_output

        train_embeddings_bert.append(embeddings.cpu().numpy())

    train_embeddings_bert = np.concatenate(train_embeddings_bert)
    pca_bert = PCA(n_components=128)
    train_embeddings_reduced_bert = pca_bert.fit_transform(train_embeddings_bert)

    print(f"BERT - Original dimension: {train_embeddings_bert.shape[1]}")
    print(f"BERT - Reduced dimension: {train_embeddings_reduced_bert.shape[1]}")


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Epoch 1/10:
BERT - Train Loss: 0.4812, Train Accuracy: 0.7825
BERT - Test Loss: 0.2552, Test Accuracy: 0.9200
BERT - Original dimension: 768
BERT - Reduced dimension: 128
Epoch 2/10:
BERT - Train Loss: 0.1715, Train Accuracy: 0.9450
BERT - Test Loss: 0.2300, Test Accuracy: 0.9300
BERT - Original dimension: 768
BERT - Reduced dimension: 128
Epoch 3/10:
BERT - Train Loss: 0.0478, Train Accuracy: 0.9900
BERT - Test Loss: 0.2810, Test Accuracy: 0.9300
BERT - Original dimension: 768
BERT - Reduced dimension: 128
Epoch 4/10:
BERT - Train Loss: 0.0115, Train Accuracy: 1.0000
BERT - Test Loss: 0.3204, Test Accuracy: 0.9300
BERT - Original dimension: 768
BERT - Reduced dimension: 128
Epoch 5/10:
BERT - Train Loss: 0.0072, Train Accuracy: 1.0000
BERT - Test Loss: 0.2835, Test Accuracy: 0.9400
BERT - Original dimension: 768
BERT - Reduced dimension: 128
Epoch 6/10:
BERT - Train Loss: 0.0029, Train Accuracy: 1.0000
BERT - Test Loss: 0.3013, Test Accuracy: 0.9500
BERT - Original dimension: 768
BERT

In [None]:
print("BERT - Original dimension embeddings:")
print(f"BERT - Original dimension: {train_embeddings_bert.shape[1]}")
print(train_embeddings_bert)



BERT - Original dimension embeddings:
BERT - Original dimension: 768
[[ 0.4666869   0.5344823   0.9346733  ...  0.94740856 -0.1777724
   0.64746004]
 [-0.01455891 -0.6917085  -0.99928176 ... -0.99966013  0.29300606
  -0.75974464]
 [ 0.29118216  0.8456919   0.9998205  ...  0.99907     0.23862027
   0.66990715]
 ...
 [-0.35591373 -0.73133457 -0.99921644 ... -0.9996022   0.14060439
  -0.554288  ]
 [-0.2543459  -0.69907224 -0.99851483 ... -0.999387    0.24491003
  -0.59984803]
 [ 0.07382118  0.8426459   0.9999542  ...  0.99874455  0.20625168
   0.7600613 ]]


In [None]:
print("BERT - Reduced dimension embeddings:")
print(f"BERT - Reduced dimension: {train_embeddings_reduced_bert.shape[1]}")
print(train_embeddings_reduced_bert)

BERT - Reduced dimension embeddings:
BERT - Reduced dimension: 128
[[ 1.54784355e+01 -3.83403182e-01  2.84409904e+00 ...  1.06974645e-02
  -3.00777541e-03 -4.85436060e-02]
 [-1.83906879e+01 -2.17230940e+00 -4.60091323e-01 ...  1.22108823e-02
   3.15596606e-03  8.46820697e-03]
 [ 2.14052372e+01 -7.93650150e-01 -1.72815716e+00 ...  2.91166287e-02
  -5.22203743e-03  5.04894822e-04]
 ...
 [-1.90707092e+01  9.66009796e-01 -1.50558669e-02 ... -2.66610663e-02
  -3.36460234e-03 -2.52794903e-02]
 [-1.88711834e+01  7.73744106e-01 -8.31002831e-01 ... -4.36843745e-02
  -6.49864739e-03 -1.97809506e-02]
 [ 2.00320206e+01  2.99956465e+00 -3.77757740e+00 ...  1.18456006e-01
  -8.60503092e-02 -3.19543406e-02]]


In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from torch import nn

# Define the paths to the train and test data files
train_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/500/train_400.csv'
test_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/500/test_100.csv'

# Define the dataset class
class CustomDataset(Dataset):
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        return {
            'text': text,
            'label': label
        }

# Define the bi-LSTM model
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        outputs, _ = self.lstm(embedded)
        outputs = self.fc(outputs[:, -1, :])
        return outputs

# Create the data loaders
train_dataset_bilstm = CustomDataset(train_data_path)
test_dataset_bilstm = CustomDataset(test_data_path)

# Prepare vocabulary
vocab = set()
for data in train_dataset_bilstm:
    words = data['text'].split()
    vocab.update(words)

# Create word-to-index mapping
word_to_idx = {word: idx+1 for idx, word in enumerate(vocab)}
word_to_idx['<pad>'] = 0

# Convert text to numerical sequences
def text_to_sequence(text):
    words = text.split()
    seq = [word_to_idx[word] for word in words if word in word_to_idx]  # Check if word is in vocabulary
    return seq

# Pad sequences to a fixed length
def pad_sequence(seq, max_length):
    if len(seq) < max_length:
        seq += [word_to_idx['<pad>']] * (max_length - len(seq))
    else:
        seq = seq[:max_length]
    return seq

# Define the collate function for data loading
def collate_fn(batch):
    texts = [data['text'] for data in batch]
    labels = [data['label'] for data in batch]
    sequences = [text_to_sequence(text) for text in texts]
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = [pad_sequence(seq, max_length) for seq in sequences]
    inputs = torch.LongTensor(padded_sequences)
    labels = torch.LongTensor(labels)
    return {
        'inputs': inputs,
        'labels': labels
    }

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set hyperparameters
input_size = len(word_to_idx)
hidden_size = 128
num_classes = 2
batch_size = 8
learning_rate = 0.001
num_epochs = 10

# Create data loaders
train_loader_bilstm = DataLoader(train_dataset_bilstm, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader_bilstm = DataLoader(test_dataset_bilstm, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Create the bi-LSTM model
model_bilstm = BiLSTM(input_size, hidden_size, num_classes).to(device)

# Define loss function and optimizer
criterion_bilstm = nn.CrossEntropyLoss()
optimizer_bilstm = torch.optim.Adam(model_bilstm.parameters(), lr=learning_rate)

# Training loop for bi-LSTM
for epoch in range(num_epochs):
    train_loss_bilstm = 0.0
    train_acc_bilstm = 0.0

    model_bilstm.train()
    for batch in train_loader_bilstm:
        inputs = batch['inputs'].to(device)
        labels = batch['labels'].to(device)

        optimizer_bilstm.zero_grad()

        outputs = model_bilstm(inputs)
        loss = criterion_bilstm(outputs, labels)

        loss.backward()
        optimizer_bilstm.step()

        train_loss_bilstm += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, dim=1)
        train_acc_bilstm += accuracy_score(labels.cpu(), preds.cpu()) * inputs.size(0)

    train_loss_bilstm = train_loss_bilstm / len(train_dataset_bilstm)
    train_acc_bilstm = train_acc_bilstm / len(train_dataset_bilstm)

    # Evaluation on the test set
    model_bilstm.eval()
    test_loss_bilstm = 0.0
    test_acc_bilstm = 0.0

    with torch.no_grad():
        for batch in test_loader_bilstm:
            inputs = batch['inputs'].to(device)
            labels = batch['labels'].to(device)

            outputs = model_bilstm(inputs)
            loss = criterion_bilstm(outputs, labels)

            test_loss_bilstm += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, dim=1)
            test_acc_bilstm += accuracy_score(labels.cpu(), preds.cpu()) * inputs.size(0)

        test_loss_bilstm = test_loss_bilstm / len(test_dataset_bilstm)
        test_acc_bilstm = test_acc_bilstm / len(test_dataset_bilstm)

    print(f'Epoch {epoch + 1}/{num_epochs}:')
    print(f'bi-LSTM - Train Loss: {train_loss_bilstm:.4f}, Train Accuracy: {train_acc_bilstm:.4f}')
    print(f'bi-LSTM - Test Loss: {test_loss_bilstm:.4f}, Test Accuracy: {test_acc_bilstm:.4f}')




Epoch 1/10:
bi-LSTM - Train Loss: 0.7222, Train Accuracy: 0.5075
bi-LSTM - Test Loss: 0.6886, Test Accuracy: 0.5400
Epoch 2/10:
bi-LSTM - Train Loss: 0.6898, Train Accuracy: 0.5650
bi-LSTM - Test Loss: 0.6958, Test Accuracy: 0.5000
Epoch 3/10:
bi-LSTM - Train Loss: 0.6771, Train Accuracy: 0.5625
bi-LSTM - Test Loss: 0.6969, Test Accuracy: 0.5000
Epoch 4/10:
bi-LSTM - Train Loss: 0.6807, Train Accuracy: 0.5250
bi-LSTM - Test Loss: 0.6839, Test Accuracy: 0.5500
Epoch 5/10:
bi-LSTM - Train Loss: 0.6613, Train Accuracy: 0.5225
bi-LSTM - Test Loss: 0.6906, Test Accuracy: 0.5700
Epoch 6/10:
bi-LSTM - Train Loss: 0.6493, Train Accuracy: 0.5425
bi-LSTM - Test Loss: 0.6923, Test Accuracy: 0.6200
Epoch 7/10:
bi-LSTM - Train Loss: 0.6406, Train Accuracy: 0.5400
bi-LSTM - Test Loss: 0.7016, Test Accuracy: 0.5100
Epoch 8/10:
bi-LSTM - Train Loss: 0.6333, Train Accuracy: 0.5725
bi-LSTM - Test Loss: 0.7071, Test Accuracy: 0.6100
Epoch 9/10:
bi-LSTM - Train Loss: 0.6385, Train Accuracy: 0.5675
bi-LSTM

In [None]:
import torch
import pandas as pd
import numpy as np
import gensim
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from torch import nn

# Define the paths to the train and test data files
train_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/500/train_400.csv'
test_data_path = '/content/drive/MyDrive/NEEWWWWW/DATA_NEW/PC/500/test_100.csv'
word2vec_model_path = '/content/drive/MyDrive/NEEWWWWW/GoogleNews-vectors-negative300.bin'

# Define the dataset class
class CustomDataset(Dataset):
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        return {
            'text': text,
            'label': label
        }

# Define the bi-LSTM model
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        outputs, _ = self.lstm(embedded)
        outputs = self.fc(outputs[:, -1, :])
        return outputs

# Create the data loaders
train_dataset = CustomDataset(train_data_path)
test_dataset = CustomDataset(test_data_path)

# Prepare vocabulary
vocab = set()
for data in train_dataset:
    words = data['text'].split()
    vocab.update(words)

# Create word-to-index mapping
word_to_idx = {word: idx+1 for idx, word in enumerate(vocab)}
word_to_idx['<pad>'] = 0

# Convert text to numerical sequences
def text_to_sequence(text):
    words = text.split()
    seq = [word_to_idx[word] for word in words if word in word_to_idx]  # Check if word is in vocabulary
    return seq

# Pad sequences to a fixed length
def pad_sequence(seq, max_length):
    if len(seq) < max_length:
        seq += [word_to_idx['<pad>']] * (max_length - len(seq))
    else:
        seq = seq[:max_length]
    return seq

# Define the collate function for data loading
def collate_fn(batch):
    texts = [data['text'] for data in batch]
    labels = [data['label'] for data in batch]
    sequences = [text_to_sequence(text) for text in texts]
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = [pad_sequence(seq, max_length) for seq in sequences]
    inputs = torch.LongTensor(padded_sequences)
    labels = torch.LongTensor(labels)
    return {
        'inputs': inputs,
        'labels': labels
    }

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set hyperparameters
input_size = len(word_to_idx)
hidden_size = 300
num_classes = 2
batch_size = 8
learning_rate = 0.001
num_epochs = 10

# Load Word2Vec model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
word2vec_embeddings = word2vec_model.vectors

# Create the bi-LSTM model with Word2Vec embeddings
class Word2VecBiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, embeddings):
        super(Word2VecBiLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        outputs, _ = self.lstm(embedded)
        outputs = self.fc(outputs[:, -1, :])
        return outputs

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Create the Word2Vec + BiLSTM model
model = Word2VecBiLSTM(input_size, hidden_size, num_classes, word2vec_embeddings).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    train_loss = 0.0
    train_acc = 0.0

    model.train()
    for batch in train_loader:
        inputs = batch['inputs'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, dim=1)
        train_acc += accuracy_score(labels.cpu(), preds.cpu()) * inputs.size(0)

    train_loss = train_loss / len(train_dataset)
    train_acc = train_acc / len(train_dataset)

    # Evaluation on the test set
    model.eval()
    test_loss = 0.0
    test_acc = 0.0

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['inputs'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            test_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, dim=1)
            test_acc += accuracy_score(labels.cpu(), preds.cpu()) * inputs.size(0)

        test_loss = test_loss / len(test_dataset)
        test_acc = test_acc / len(test_dataset)

    print(f'Epoch {epoch + 1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

# Perform PCA dimensionality reduction on the BiLSTM embeddings
train_embeddings = []
test_embeddings = []

model.eval()
with torch.no_grad():
    for batch in train_loader:
        inputs = batch['inputs'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(inputs)
        train_embeddings.append(outputs.cpu().numpy())

    for batch in test_loader:
        inputs = batch['inputs'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(inputs)
        test_embeddings.append(outputs.cpu().numpy())

train_embeddings = np.concatenate(train_embeddings)
test_embeddings = np.concatenate(test_embeddings)

# Perform PCA dimensionality reduction
pca = PCA(n_components=2)
train_pca = pca.fit_transform(train_embeddings)
test_pca = pca.transform(test_embeddings)

# Print the shape of the PCA embeddings
print("Train PCA shape:", train_pca.shape)
print("Test PCA shape:", test_pca.shape)


AttributeError: ignored

In [None]:
# Second Channel - Word2Vec

# Define the path to the pretrained Word2Vec model
word2vec_model_path = '/content/drive/MyDrive/NEEWWWWW/GoogleNews-vectors-negative300.bin'

# Load the Word2Vec model
model_word2vec = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

# Define the dataset class for Word2Vec
class Word2VecDataset(Dataset):
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        return {
            'text': text,
            'label': label
        }

# Define the Word2Vec feature extractor
def extract_word2vec_features(text):
    words = text.split()
    feature_vec = np.zeros(300, dtype=np.float32)
    n_words = 0
    for word in words:
        if word in model_word2vec:
            feature_vec += model_word2vec[word]
            n_words += 1
    if n_words > 0:
        feature_vec /= n_words
    return feature_vec

# Define the Word2Vec dataset
train_dataset_word2vec = Word2VecDataset(train_data_path)
test_dataset_word2vec = Word2VecDataset(test_data_path)

# Extract Word2Vec features
train_features_word2vec = np.array([extract_word2vec_features(data['text']) for data in train_dataset_word2vec])
test_features_word2vec = np.array([extract_word2vec_features(data['text']) for data in test_dataset_word2vec])

# Perform PCA dimensionality reduction on Word2Vec features
pca_word2vec = PCA(n_components=128)
train_features_reduced_word2vec = pca_word2vec.fit_transform(train_features_word2vec)

print(f"Word2Vec - Original dimension: {train_features_word2vec.shape[1]}")
print(f"Word2Vec - Reduced dimension: {train_features_reduced_word2vec.shape[1]}")


In [None]:
# Final Prediction

# Check if dimensions match
if train_embeddings_reduced_bert.shape[1] != train_features_reduced_word2vec.shape[1]:
    raise ValueError("Dimensions of reduced features from both channels do not match.")

# Convert reduced features to tensors
train_features_reduced_word2vec_tensor = torch.from_numpy(train_features_reduced_word2vec).float().to(device)
train_embeddings_reduced_bert_tensor = torch.from_numpy(train_embeddings_reduced_bert).float().to(device)

# Define the final classification model
model_final = torch.nn.Sequential(
    torch.nn.Linear(train_features_reduced_word2vec.shape[1], 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 2)
).to(device)

optimizer_final = torch.optim.Adam(model_final.parameters(), lr=0.001)
criterion_final = torch.nn.CrossEntropyLoss()

# Convert labels to tensors
train_labels_tensor = torch.from_numpy(np.array(train_dataset_bert.data['label'])).long().to(device)

# Training loop for final model
for epoch in range(10):
    optimizer_final.zero_grad()

    outputs = model_final(train_features_reduced_word2vec_tensor)
    preds = torch.argmax(outputs, dim=1)

    loss = criterion_final(outputs, train_labels_tensor)
    acc = accuracy_score(train_labels_tensor.cpu(), preds.cpu())

    loss.backward()
    optimizer_final.step()

    print(f'Epoch {epoch + 1}/{10}:')
    print(f'Final Model - Train Loss: {loss.item():.4f}, Train Accuracy: {acc:.4f}')

# Evaluation on the test set
model_final.eval()
test_features_reduced_word2vec_tensor = torch.from_numpy(test_features_reduced_word2vec).float().to(device)
with torch.no_grad():
    test_outputs = model_final(test_features_reduced_word2vec_tensor)
    test_preds = torch.argmax(test_outputs, dim=1)
    test_loss_final = criterion_final(test_outputs, test_dataset_bert.data['label'].to(device))
    test_acc_final = accuracy_score(test_dataset_bert.data['label'], test_preds.cpu())

print(f'Final Model - Test Loss: {test_loss_final.item():.4f}, Test Accuracy: {test_acc_final:.4f}')