In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from torchtext.data.utils import get_tokenizer
import random
import time
import math

# Read Data

In [2]:
file_path_1 = 'data/domain1_train.json'
df1 = pd.read_json(file_path_1, lines=True)


file_path_2 = 'data/domain2_train.json'
df2 = pd.read_json(file_path_2, lines=True).drop('model', axis=1)


df_comb = pd.concat([df1, df2],axis=0,ignore_index=True)

df_comb

Unnamed: 0,text,label
0,"[70, 746, 825, 109, 2083, 0, 2, 0, 0, 0, 9, 0,...",1
1,"[1209, 179, 1952, 4, 4959, 7, 0, 2, 978, 1522,...",1
2,"[287, 3, 3330, 0, 23, 12, 13, 465, 74, 8, 0, 8...",1
3,"[0, 0, 3, 592, 19, 2, 706, 1439, 2575, 7, 2, 0...",1
4,"[9, 2, 110, 12, 42, 32, 44, 361, 9, 3860, 2358...",1
...,...,...
34395,"[175, 1317, 38, 754, 9, 5, 0, 228, 1, 45, 6, 2...",0
34396,"[466, 5, 70, 1242, 6, 3888, 1, 34, 43, 5, 70, ...",0
34397,"[10, 0, 21, 1650, 18, 5, 1335, 1, 208, 5, 997,...",0
34398,"[18, 39, 316, 133, 365, 2019, 1, 27, 10, 5, 61...",0


# Prepare Data to tensor

In [76]:
X = df_comb['text'].to_list()
X = [" ".join(map(str, x)) for x in X]
# Define a tokenizer (you can choose the tokenizer that suits your data)
tokenizer = get_tokenizer('basic_english')
X = [tokenizer(text) for text in X]

# Define the maximum sequence length you want to use
max_seq_length = 50  # Adjust this based on your dataset and model requirements

# Pad sequences to the maximum length
X = [seq[:max_seq_length] + ['9999'] * (max_seq_length - len(seq)) if len(seq) < max_seq_length else seq[:max_seq_length] for seq in X]
X = [[int(word) for word in sentence] for sentence in X]
y =  df_comb['label'].to_list() # Corresponding labels (1 for human, 0 for machine)

# Convert data to PyTorch tensors

X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.float)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DataLoader for batch processing
batch_size = 1  # Adjust this based on your dataset and model requirements
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

# RNN

In [80]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [78]:
# Hyper-parameters 
# input_size = 784 # 28x28
num_classes = 2
num_epochs = 5
batch_size = 10
learning_rate = 0.005

input_size = 50
sequence_length = 28
hidden_size = 50
num_layers = 2

In [83]:
# Fully connected neural network with one hidden layer
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # -> x needs to be: (batch_size, seq, input_size)
        
        # or:
        #self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # Set initial hidden states (and cell states for LSTM)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        #c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        
        # x: (n, 28, 28), h0: (2, n, 128)
        
        # Forward propagate RNN
        out, _ = self.rnn(x, h0)  
        # or:
        #out, _ = self.lstm(x, (h0,c0))  
        
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        # out: (n, 28, 128)
        
        # Decode the hidden state of the last time step
        out = out[:, -1, :]
        # out: (n, 128)
         
        out = self.fc(out)
        # out: (n, 10)
        return out

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

# Train the model
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [N, 1, 28, 28]
        # resized: [N, 28, 28]
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 10000 test images: {acc} %')

RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor

In [73]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size) # input is X and h so have a size of input_size + hidden_size
        self.h2o = nn.Linear(hidden_size, output_size) # changed input, output size is the number of classes
        self.activation = nn.Tanh() # new
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.activation(self.i2h(combined)) # changed to use activation. This is using linear activation function above combined have a input size of input_size + hidden_size
        output = self.h2o(hidden) # changed input
        output = self.softmax(output) # we only use the output of the last word to make prediction despite we have output for every word. For extra long sentence, we loose the info at the front
        return output, hidden
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [74]:
n_iters = 80000
print_every = 5000
plot_every = 1000
noise_level = 0 # change this line (as discussed later)
n_hidden = 50
learning_rate = 0.005
n_letters = max_seq_length
n_categories = 2
n_epoch = 5

In [75]:
rnn = RNNClassifier(n_letters, n_hidden, n_categories)
criterion = nn.NLLLoss()
optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)


# number of epoch is set by experience
for epoch in range(n_epoch):  # Loop over training dataset `n_epochs` times

    for i, data in enumerate(train_loader):  # Loop over elements in training set
        
        x, labels = data

        # print(x.shape)
        # x = x.squeeze(1) # Flatten images but keep batch dimension

        # fit one linear regression for each class and we pick the highest p~ as our prediction

        for i in range(x.size()[0]):
            new_shape = x[i].unsqueeze(0)
            prediction, hidden = rnn(new_shape,  rnn.initHidden())
        print(prediction)
        print(labels)
        break
        # criterion is pre-set above as loss function
        loss = criterion(prediction, target=labels)
    
        loss.backward()               # Backward pass (compute parameter gradients)
        optimizer.step()              # Update weight parameter using SGD
        optimizer.zero_grad()         # Reset gradients to zero for next iteration
            # Add parameters' gradients to their values, multiplied by learning rate
        for p in rnn.parameters():
            p.data.add_(p.grad.data, alpha=-learning_rate)

tensor([[-0.7600, -0.6305]], grad_fn=<LogSoftmaxBackward0>)
tensor([1.])
tensor([[-1.3956, -0.2846]], grad_fn=<LogSoftmaxBackward0>)
tensor([0.])
tensor([[-1.0256, -0.4441]], grad_fn=<LogSoftmaxBackward0>)
tensor([0.])
tensor([[-0.6926, -0.6937]], grad_fn=<LogSoftmaxBackward0>)
tensor([0.])
tensor([[-0.7295, -0.6581]], grad_fn=<LogSoftmaxBackward0>)
tensor([1.])


In [45]:
rnn

RNNClassifier(
  (i2h): Linear(in_features=100, out_features=50, bias=True)
  (h2o): Linear(in_features=50, out_features=2, bias=True)
  (activation): Tanh()
  (softmax): LogSoftmax(dim=1)
)

In [None]:
class AttentionalGRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(AttentionalGRUClassifier, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size) 
        self.att = nn.Linear(hidden_size, 1) 
        
    def forward(self, input_sequence):
        # process the input sequence into a sequence of RNN hidden states
        states, _ = self.gru(input_sequence)
        # compute attention scores to each RNN hidden state (we use a linear function)
        att_scores = self.att(states)
        # rescale the attention scores using a softmax, so they sum to one
        alpha = F.softmax(att_scores, dim=0)
        # compute the "c" vector as a weighted combination of the RNN hidden states
        c = torch.sum(torch.mul(states, alpha), dim=0)
        # now couple up the c state to the output, and compute log-softmax
        output = self.h2o(c.view(1, -1)) 
        output = F.log_softmax(output, dim=1)
        return output, alpha

In [None]:
model = AttentionalGRUClassifier(n_letters, n_hidden, n_categories)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

start = time.time()
all_losses_att = []
current_loss = 0

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample(noise=noise_level)

    model.zero_grad()
    output, _ = model.forward(line_tensor)
    output = torch.squeeze(output, 1) # remove redundant dimension
    loss = criterion(output, category_tensor)
    current_loss += loss.item()
    loss.backward()
    optimizer.step()

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses_att.append(current_loss / plot_every)
        current_loss = 0

In [11]:
# Define the RNN model
class TextRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout):
        super(TextRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1) if self.rnn.bidirectional else hidden[-1, :, :])
        return self.fc(hidden)

In [20]:
?TextRNN

[1;31mInit signature:[0m
[0mTextRNN[0m[1;33m([0m[1;33m
[0m    [0mvocab_size[0m[1;33m,[0m[1;33m
[0m    [0membedding_dim[0m[1;33m,[0m[1;33m
[0m    [0mhidden_dim[0m[1;33m,[0m[1;33m
[0m    [0moutput_dim[0m[1;33m,[0m[1;33m
[0m    [0mnum_layers[0m[1;33m,[0m[1;33m
[0m    [0mbidirectional[0m[1;33m,[0m[1;33m
[0m    [0mdropout[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in
a tree structure. You can assign the submodules as regular attributes::

    import torch.nn as nn
    import torch.nn.functional as F

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 20, 5)
            self.conv2 = nn.Conv2d(20, 20, 5)

        def forward(self, x):
            x = F.relu(self.conv1(x))
     

In [12]:
# Initialize the model
vocab_size = 5000
embedding_dim = 128
hidden_dim = 64
output_dim = 1
num_layers = 2
bidirectional = True
dropout = 0.5

model = TextRNN(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout)

In [35]:
# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

# Training loop
def train(model, iterator, optimizer, criterion):
    model.train()
    for feature, label in iterator:
        optimizer.zero_grad()
        predictions = model(feature).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()

In [34]:
for feature, label in train_loader:
    print(feature)
    print(label)
    break

tensor([[   0,   13,   22,  ..., 9999, 9999, 9999],
        [4319, 3468,    2,  ..., 9999, 9999, 9999],
        [2344,   93,    3,  ...,   15,    1,    0],
        ...,
        [   2,  107,    1,  ...,  350,   32,   23],
        [   5,  921, 3048,  ...,    2,    0, 1402],
        [  10, 1682,   61,  ..., 9999, 9999, 9999]])
tensor([1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 1., 1., 0., 0., 0., 0., 0., 0.])


In [37]:
# Training the model
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    train(model, train_loader, optimizer, criterion)

IndexError: index out of range in self

In [None]:
# Evaluate the model
model.eval()
with torch.no_grad():
    predictions = model(X_test).squeeze(1)
    rounded_predictions = torch.round(torch.sigmoid(predictions))
    accuracy = torch.sum(rounded_predictions == y_test).item() / len(y_test)

print(f'Test Accuracy: {accuracy * 100:.2f}%')