In [4]:
from dataProcessing import *
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn.functional import binary_cross_entropy_with_logits as bce_loss
from torch.optim import Adam
from sklearn.model_selection import train_test_split

class EarlyStopping:
    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.delta = delta
        self.early_stop = False

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        torch.save(model.state_dict(), 'checkpoint.pt')
        print(f'Validation loss decreased ({-self.best_score:.6f} --> {val_loss:.6f}).  Saving model ...')

class Classifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, bidirectional, dropout):
        super(Classifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim*2 if bidirectional else hidden_dim, 1)

    def forward(self, x):
        x = x.unsqueeze(1) # Add sequence length dimension
        lstm_out, (hidden, cell) = self.lstm(x)
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            hidden = hidden[-1,:,:]
        output = self.fc(hidden).squeeze()
        return output

batch_size = 32
train_texts = np.load('../p_data/train_texts.npy', allow_pickle=True)
train_labels = np.load('../p_data/train_labels.npy', allow_pickle=True)
test_texts = np.load('../p_data/test_texts.npy', allow_pickle=True)
test_labels = np.load('../p_data/test_labels.npy', allow_pickle=True)

w2v_model = w2v_train(train_texts)

# Convert texts to vectors
train_data = [text_to_vec(text, w2v_model) for text in train_texts]
test_data = [text_to_vec(text, w2v_model) for text in test_texts]

# Divide the training set and validation set
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=42)

# Convert lists to tensors
train_data = [torch.tensor(vec) for vec in train_data]
val_data = [torch.tensor(vec) for vec in val_data]
test_data = [torch.tensor(vec) for vec in test_data]

# Create DataLoader
train_loader = DataLoader(VectorDataset(train_data, train_labels), batch_size)
val_loader = DataLoader(VectorDataset(val_data, val_labels), batch_size)
test_loader = DataLoader(VectorDataset(test_data, test_labels), batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
early_stopping = EarlyStopping(patience=5, delta=0.01)
hidden_dim = 128
num_layers = 2
bidirectional = True
dropout = 0.5
model = Classifier(input_dim=100, hidden_dim=hidden_dim, num_layers=num_layers,
                   bidirectional=bidirectional, dropout=dropout).to(device)
optimizer = Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(50):
    total_loss = 0
    total_count = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs.float())
        loss = bce_loss(outputs, targets.float())
        total_loss += loss.item() * inputs.size(0)
        total_count += inputs.size(0)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / total_count
    print(f'Epoch {epoch+1}, Loss: {avg_loss}')

    model.eval()
    val_loss = 0
    val_count = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs.float())
            loss = bce_loss(outputs, targets.float())
            val_loss += loss.item() * inputs.size(0)
            val_count += inputs.size(0)
    val_loss /= val_count

    early_stopping(val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping")
        break

model.load_state_dict(torch.load('checkpoint.pt'))

model.eval()
total_loss = 0
total_count = 0
correct_count = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs.float())
        loss = bce_loss(outputs, targets.float())
        total_loss += loss.item() * inputs.size(0)
        total_count += inputs.size(0)
        pred = (outputs > 0.5).long()
        correct_count += (pred == targets).sum().item()
avg_loss = total_loss / total_count
accuracy = correct_count / total_count
print(f'Test Loss: {avg_loss}, Accuracy: {accuracy}')

Epoch 1, Loss: 0.5077714228630066
Validation loss decreased (0.460342 --> 0.460342).  Saving model ...
Epoch 2, Loss: 0.45646918864250186
Validation loss decreased (0.449592 --> 0.449592).  Saving model ...
Epoch 3, Loss: 0.4494832589387894
EarlyStopping counter: 1 out of 5
Epoch 4, Loss: 0.4444856627225876
EarlyStopping counter: 2 out of 5
Epoch 5, Loss: 0.4404243407249451
Validation loss decreased (0.436847 --> 0.436847).  Saving model ...
Epoch 6, Loss: 0.4370722928762436
EarlyStopping counter: 1 out of 5
Epoch 7, Loss: 0.4341897609949112
EarlyStopping counter: 2 out of 5
Epoch 8, Loss: 0.43155703101158144
EarlyStopping counter: 3 out of 5
Epoch 9, Loss: 0.4290436578273773
EarlyStopping counter: 4 out of 5
Epoch 10, Loss: 0.4265789274692535
EarlyStopping counter: 5 out of 5
Early stopping
Test Loss: 0.4347554003953934, Accuracy: 0.77732


In [6]:
from sklearn.preprocessing import LabelEncoder
from dataProcessing_t import *
from torch.nn.utils.rnn import pad_sequence

batch_size = 128
train_texts = np.load('../p_data_3/train_texts.npy', allow_pickle=True)
train_labels = np.load('../p_data_3/train_labels.npy', allow_pickle=True)
test_texts = np.load('../p_data_3/test_texts.npy', allow_pickle=True)
test_labels = np.load('../p_data_3/test_labels.npy', allow_pickle=True)
val_texts = np.load('../p_data_3/val_texts.npy', allow_pickle=True)
val_labels = np.load('../p_data_3/val_labels.npy', allow_pickle=True)

# Create a Word2Vec model
w2v_model = Word2Vec(train_texts, min_count=1, vector_size=100)

def get_word_vector(word, w2v_model, default_vector):
    try:
        return w2v_model.wv[word] if word in w2v_model.wv else default_vector
    except KeyError:
        return default_vector

# Padding function
def pad_texts(texts):
    return pad_sequence([torch.tensor([get_word_vector(word, w2v_model, default_vector) for word in text]) for text in texts], batch_first=True)


# Calculate the default vector
default_vector = np.mean(w2v_model.wv.vectors, axis=0)

# Convert the texts to vectors with padding
train_data = pad_texts(train_texts)
val_data = pad_texts(val_texts)
test_data = pad_texts(test_texts)

# Encode the labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

# Convert lists to tensors
train_data = [torch.tensor(vec) for vec in train_data]
val_data = [torch.tensor(vec) for vec in val_data]
test_data = [torch.tensor(vec) for vec in test_data]
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)


# Create DataLoader
train_loader = DataLoader(list(zip(train_data, train_labels)), batch_size)
val_loader = DataLoader(list(zip(val_data, val_labels)), batch_size)
test_loader = DataLoader(list(zip(test_data, test_labels)), batch_size)


# Define the LSTM Classifier
class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(BiLSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(2*hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        # Split the final state into the forward and backward parts
        forward_final = lstm_out[:, -1, :self.hidden_dim]
        backward_final = lstm_out[:, 0, self.hidden_dim:]
        # Concatenate the final states and pass through the linear layer
        out = self.fc(torch.cat((forward_final, backward_final), dim=1)).squeeze()
        return out

# Define your LSTM model
embedding_dim = 100  # This should match the dimension of your word2vec vectors
hidden_dim = 100
model = BiLSTMClassifier(embedding_dim, hidden_dim).to(device)
early_stopping = EarlyStopping(patience=5, delta=0.01)

# Define loss function
loss_function = nn.BCEWithLogitsLoss()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training
model.train()
for epoch in range(50):
    total_loss = 0
    total_count = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device).float()
        model.zero_grad()
        outputs = model(inputs.float())
        loss = loss_function(outputs, targets)
        total_loss += loss.item() * inputs.size(0)
        total_count += inputs.size(0)
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / total_count
    print(f'Epoch {epoch+1}, Loss: {avg_loss}')

    model.eval()
    val_loss = 0
    val_count = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device).float()
            outputs = model(inputs.float())
            loss = loss_function(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
            val_count += inputs.size(0)
    val_loss /= val_count

    # Update the early stopping object
    early_stopping(val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping")
        break

# Load the best model
model.load_state_dict(torch.load('checkpoint.pt'))

# Evaluation
model.eval()
total_loss = 0
total_count = 0
correct_count = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device).float()
        outputs = model(inputs.float())
        loss = loss_function(outputs, targets)
        total_loss += loss.item() * inputs.size(0)
        total_count += inputs.size(0)
        pred = (torch.sigmoid(outputs) > 0.5).long()
        correct_count += (pred == targets.long()).sum().item()
avg_loss = total_loss / total_count
accuracy = correct_count / total_count
print(f'Test Loss: {avg_loss}, Accuracy: {accuracy}')


  train_data = [torch.tensor(vec) for vec in train_data]
  val_data = [torch.tensor(vec) for vec in val_data]
  test_data = [torch.tensor(vec) for vec in test_data]


Epoch 1, Loss: 0.6055511006333696
Validation loss decreased (0.552904 --> 0.552904).  Saving model ...
Epoch 2, Loss: 0.5474187473967892
EarlyStopping counter: 1 out of 5
Epoch 3, Loss: 0.5299750655819716
Validation loss decreased (0.500112 --> 0.500112).  Saving model ...
Epoch 4, Loss: 0.5071744214827711
EarlyStopping counter: 1 out of 5
Epoch 5, Loss: 0.49003544345114214
EarlyStopping counter: 2 out of 5
Epoch 6, Loss: 0.44576130850439794
Validation loss decreased (0.398613 --> 0.398613).  Saving model ...
Epoch 7, Loss: 0.35255462219713257
Validation loss decreased (0.292701 --> 0.292701).  Saving model ...
Epoch 8, Loss: 0.2710202895599264
Validation loss decreased (0.234735 --> 0.234735).  Saving model ...
Epoch 9, Loss: 0.175788465465632
Validation loss decreased (0.167042 --> 0.167042).  Saving model ...
Epoch 10, Loss: 0.13381583510573072
EarlyStopping counter: 1 out of 5
Epoch 11, Loss: 0.05499133537468921
Validation loss decreased (-0.126494 --> -0.126494).  Saving model ...