In [1]:
import numpy as np
import pandas as pd
import os
import csv
import math
import torch
import random
from tqdm.auto import tqdm
from copy import deepcopy
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [2]:
# Dataset from https://www.kaggle.com/datasets/czzzzzzz/scp1to7
scp_df = pd.read_csv('/kaggle/input/scp1to7/scp6999.csv', header=0, delimiter=',', quoting=csv.QUOTE_ALL, encoding='utf-8', index_col=False, usecols=['code', 'title', 'text', 'image captions', 'rating', 'state', 'tags',  'link'])

In [3]:
unified_prompt = "This is an SCP-Foundation fiction:\nTitle: "

In [4]:
def preprocess(text, title):
    text = unified_prompt + title[1:-1] + '\n' + text[1:-1].replace(' \n ', '\n')
    index = text.find("« SCP-")
    if index != -1:
        result = text[:index]
        return result
    else:
        return text

In [5]:
def set_seed(seed):
    # Seed function by Saurav Maheshkar
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

random_seed=2333
set_seed(random_seed)

Random seed set as 2333


In [6]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Tesla P100-PCIE-16GB


In [7]:
print(preprocess(scp_df['text'][7], scp_df['title'][7]))

This is an SCP-Foundation fiction:
Title: Zombie Plague
Item #: SCP-008
Object Class: Euclid
Special Containment Procedures: SCP-008 samples have been deemed Class V extreme biological hazards, and all related protocols apply. Incineration and irradiation measures will be deployed in the event of political or military action which may result in the facility being dismantled; a power failure; or zero communications from operatives or outside channels during any given eight hour period.
The quarantine period for operatives leaving the facility is four months. If a breach has occurred, incineration and irradiation measures shall be deployed. It should be the policy of all G2 sites to not prepare an evacuation procedure.
Description: SCP-008 is a complex prion, samples of which are stored in each of the known G2 sites. Research into SCP-008 is highly classified and primarily aimed at preventing research which may lead to the synthesis of SCP-008 in the distant future. Traits of the SCP-008

In [8]:
text_list = scp_df['text'].tolist()
title_list = scp_df['title'].tolist()
whole_list = [preprocess(text, title) for text, title in zip(text_list, title_list)]
random.shuffle(whole_list)
train_list = whole_list[:6000]
valid_list = whole_list[6000:]

In [9]:
def calculate_perplexity(dataloader, model, device):
    model.eval()
    total_loss = 0
    total_count = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item() * input_ids.size(0)
            total_count += input_ids.size(0)

    perplexity = torch.exp(torch.tensor(total_loss / total_count)).item()
    return perplexity

In [10]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=64):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer("summarize: " + text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask}

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Set padding token (GPT2 does not have a default one)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = TextDataset(train_list, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataset = TextDataset(valid_list, tokenizer)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

epochs = 3
lowest_perplexity = float('inf')
best_model = None

model.train()

valid_perplexity = calculate_perplexity(valid_dataloader, model, device)
print(f"Validation perplexity before training: {valid_perplexity}")

for epoch in range(epochs):
    for batch in tqdm(train_dataloader, desc=f"Training epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    scheduler.step()
    
    # model.eval() is done in the calculate_perplexity() method
    valid_perplexity = calculate_perplexity(valid_dataloader, model, device)
    print(f"Validation perplexity after epoch {epoch+1}: {valid_perplexity}")
    if valid_perplexity < lowest_perplexity:
        lowest_perplexity = valid_perplexity
        best_model = deepcopy(model)
        print("Best_model updated")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Validation perplexity before training: 38.926368713378906


Training epoch 1:   0%|          | 0/375 [00:00<?, ?it/s]

Validation perplexity after epoch 1: 5.3264970779418945
Best_model updated


Training epoch 2:   0%|          | 0/375 [00:00<?, ?it/s]

Validation perplexity after epoch 2: 5.311081409454346
Best_model updated


Training epoch 3:   0%|          | 0/375 [00:00<?, ?it/s]

Validation perplexity after epoch 3: 5.35799503326416


In [11]:
input_text = unified_prompt + "The angry linguist \n"
inputs = tokenizer(input_text, return_tensors='pt').to(device)
generated = best_model.generate(inputs['input_ids'], max_length=512, do_sample=True, top_p=0.9, pad_token_id=tokenizer.eos_token_id)
sample_output = tokenizer.decode(generated[0], skip_special_tokens=True)
print(sample_output)

This is an SCP-Foundation fiction:
Title: The angry linguist 
Item #: SCP-2666
Object Class: Keter
Special Containment Procedures: Due to the nature of the SCP-2666 phenomenon, containment is focused on its suppression. Mobile Task Force Gamma-5 ("Aquarian") is to be embedded within all Foundation units in order to effectively suppress any known SCP-2666 events.
Description: SCP-2666 is a humanoid manifestation of the mammalian pterodactyl. Its origins are unknown, and currently are unknown at this time, though a Foundation cover story has been successfully employed to conceal it. Any Foundation personnel with information regarding SCP-2666 instances are to be detained for questioning within the organization.
Current Containment Procedures: SCP-2666 has been assigned a minimum of ten civilians, to assist in suppression of its manifestation. The population of SCP-2666 has been modified to prevent civilian access to its habitat. Foundation personnel are to constantly search for instances

In [12]:
input_text = unified_prompt
inputs = tokenizer(input_text, return_tensors='pt').to(device)
generated = best_model.generate(inputs['input_ids'], max_length=512, do_sample=True, top_p=0.9, pad_token_id=tokenizer.eos_token_id)
sample_output = tokenizer.decode(generated[0], skip_special_tokens=True)
print(sample_output)

This is an SCP-Foundation fiction:
Title: 【FURY】
SCP-5726 -
You're reading RE: The Storyteller's Diner
This is my first rework and it's because I found myself writing it once. This is my first rework and it's because I found myself writing it once.
More by this author
Item #: SCP-5726
Object Class: Euclid
Special Containment Procedures: SCP-5726 is stored in a low-power anomalous item locker at Site-73. SCP-5726 has been placed on display at all times and is constantly monitored by SCP-5726-A-1. A recording of SCP-5726-A-1's behavior is to be kept on file and recorded whenever SCP-5726 is found to be exhibiting any sign of a mental breakdown or signs of being a threat to the Foundation. SCP-5726 is to be fed twice daily. In the event that SCP-5726-A-1 does not show signs of a mental breakdown, a copy of this documentation is to be kept on file.
Description: SCP-5726 is a humanoid entity of a normal height and height, measuring approximately 6'1" and measuring 5'5". SCP-5726-A's body co