In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split

from copy import deepcopy

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.nn as nn
import torch

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Location of the DAIGT v2 training CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"

# Raw dataset; the "text" and "label" columns are the ones used downstream.
df = pd.read_csv(DATA_PATH)

In [None]:
# Independent copy restricted to the model input ("text") and target ("label"),
# so later steps never touch the raw frame.
main = df.loc[:, ["text", "label"]].copy()

In [None]:
# torchtext's built-in "basic_english" tokenizer: maps a string to a list of tokens.
tokenizer = get_tokenizer("basic_english")


def extract_tokens(txt):
    """Lazily yield the token list for each sample.

    Args:
        txt: iterable of two-element ``(text, label)`` records; the label is ignored.

    Yields:
        The result of ``tokenizer(text)`` for every record, in order.
    """
    yield from (tokenizer(sample) for sample, _ in txt)
        
        
# Vocabulary built from every (text, label) row of `main`; "<unk>" is registered
# as a special token and used as the fallback index for out-of-vocabulary tokens.
vocabular = build_vocab_from_iterator(
    extract_tokens(main.values),
    specials=["<unk>"],
)
vocabular.set_default_index(vocabular["<unk>"])


def text_pipeline(x):
    """Convert a raw text string into its list of vocabulary indices."""
    return vocabular(tokenizer(x))

In [None]:
def collate_ds(x):
    """Collate a batch of ``(text, label)`` samples into flat tensors.

    Args:
        x: sequence of two-element ``(text, label)`` samples.

    Returns:
        A ``(tokens, labels, offsets)`` tuple where ``tokens`` is the 1-D int64
        concatenation of all encoded texts, ``labels`` is a 1-D int64 tensor with
        one entry per sample, and ``offsets`` holds the start position of each
        sample inside ``tokens``.
    """
    encoded = [torch.tensor(text_pipeline(sample), dtype=torch.int64) for sample, _ in x]
    labels = torch.tensor([target for _, target in x], dtype=torch.int64)

    # Start positions are the running sum of the preceding sample lengths:
    # prepend 0 and drop the last length before accumulating.
    lengths = [0] + [tokens.size(0) for tokens in encoded]
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)

    return torch.cat(encoded), labels, offsets

In [None]:
# Hold out 10% of all rows as the final test set ...
train_phase, test = train_test_split(
    main.values,
    test_size=0.1,
    random_state=42,
)
# ... then carve 20% of the remainder off for validation
# (overall roughly 72% train / 18% validation / 10% test).
train, val = train_test_split(
    train_phase,
    test_size=0.2,
    random_state=42,
)

In [None]:
# --- training length / batching ---
EPOCHS = 20   # number of passes over the training set
BATCH = 128   # samples per DataLoader batch

# --- optimizer ---
LR = 0.1      # initial SGD learning rate

# --- StepLR schedule ---
STEP = 10     # epochs between learning-rate decays
GAMMA = 0.1   # multiplicative decay factor applied every STEP epochs

In [None]:
# Settings shared by both loaders; only the shuffling differs.
loader_kwargs = dict(batch_size=BATCH, collate_fn=collate_ds)

# Training batches are reshuffled every epoch; validation order is kept fixed.
train_dl = DataLoader(train, shuffle=True, **loader_kwargs)
val_dl = DataLoader(val, shuffle=False, **loader_kwargs)

In [None]:
class AIText(nn.Module):
    """Bag-of-embeddings text classifier.

    Each sample's token embeddings are pooled into one vector by
    ``nn.EmbeddingBag`` and passed through a small fully connected network.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimensionality of each token embedding.
        num_classes: number of output classes.
    """

    def __init__(self, vocab_size, embed_size, num_classes):
        super().__init__()
        self.embed = nn.EmbeddingBag(vocab_size, embed_size, sparse=False)
        self.layers = nn.Sequential(
            nn.Linear(embed_size, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x, off):
        """Return raw class scores (logits) for a flattened batch.

        Args:
            x: 1-D int64 tensor with the token indices of all samples concatenated.
            off: 1-D int64 tensor with the start position of each sample in ``x``.

        Returns:
            Tensor of shape ``(len(off), num_classes)`` holding unnormalised logits.
        """
        pooled = self.embed(x, off)
        # Return logits, not probabilities: nn.CrossEntropyLoss applies
        # log-softmax internally, so a softmax here would be applied twice and
        # flatten the gradients. argmax over logits gives the same predicted
        # class; apply softmax at the call site if probabilities are needed.
        return self.layers(pooled)

In [None]:
# Model dimensions.
embed_size = 128                       # width of each token embedding
vocab_size = len(vocabular)            # one embedding row per vocabulary entry
num_classes = df["label"].nunique()    # one output per distinct label value

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Classifier, placed on the selected device.
model = AIText(vocab_size, embed_size, num_classes).to(device)

# Multi-class cross-entropy objective.
criterion = nn.CrossEntropyLoss()

# Plain SGD whose learning rate is multiplied by GAMMA every STEP epochs.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=STEP,
    gamma=GAMMA,
)

In [None]:
best_model = deepcopy(model)
best_acc = 0
train_history = []
val_history = []
acc_history = []

for i in range(1, EPOCHS+1):
    model.train()
    train_loss = 0.0
    train_total = 0
    for text, label, offset in train_dl:
        optimizer.zero_grad()
        if torch.cuda.is_available():
            text, label, offset = text.cuda(), label.cuda(), offset.cuda()
            
        out = model(text, offset)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_total += out.size(0)
    train_loss = train_loss/train_total
    train_history += [train_loss]
    
    val_loss = 0.0
    val_total = 0
    acc_loss = 0
    
    model.eval()
    with torch.no_grad():
        for text, label, offset in val_dl:
            if torch.cuda.is_available():
                text, label, offset = text.cuda(), label.cuda(), offset.cuda()
            
            out = model(text, offset)
            loss = criterion(out, label)
            val_loss += loss.item()
            val_total += out.size(0)
            acc_loss += (out.argmax(1) == label).sum().item()
            
    acc_loss = acc_loss/val_total
    val_loss = val_loss/val_total
    
    if acc_loss > best_acc:
        best_model = deepcopy(best_model)
        best_acc = acc_loss
        
    val_history += [val_loss]
    acc_history += [acc_loss]
    print("Epoch {} train loss {} val loss {} accuracy {}".format(i, train_loss, val_loss, acc_loss))
    scheduler.step()

In [None]:
epochs = list(range(1, EPOCHS+1))

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
axes[0].plot(epochs, train_history)
axes[1].plot(epochs, val_history)
axes[2].plot(epochs, acc_history)
axes[0].set_title("Train loss")
axes[1].set_title("Validation loss")
axes[2].set_title("Accuracy progression")
plt.suptitle("Training performance\nbest model is {}%".format(round(best_acc*100, 2)))
plt.tight_layout()
plt.show()

In [None]:
def predict(txt):
    processed = torch.tensor(text_pipeline(txt), dtype=torch.int64)
    offset = torch.tensor([0], dtype=torch.int64)
    best_model.eval()
    with torch.no_grad():
        if torch.cuda.is_available():
            processed, offset = processed.cuda(), offset.cuda()
        out = model(processed, offset)
        
    return out.argmax(1).item()

In [None]:
# Run the classifier over every held-out (text, label) row, one text at a time.
predicted_labels = [predict(sample) for sample, _ in test]
test_labels = [target for _, target in test]

In [None]:
# Display names for the two classes.
# NOTE(review): assumes label 0 = human-written and 1 = AI-generated — confirm against the dataset.
class_names = ["Human", "AI"]

score = accuracy_score(test_labels, predicted_labels)
report = classification_report(test_labels, predicted_labels, target_names=class_names)
cm = confusion_matrix(test_labels, predicted_labels)

print("Accuracy: {}%".format(round(score*100, 2)))
print(report)

# fmt="d" prints the integer counts verbatim; the default ".2g" would render
# large counts in scientific notation (e.g. 4e+03).
ax = sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
# confusion_matrix puts true labels on the rows and predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()