In [1]:
!pip install transformers
!pip install pandas
!pip install torch



In [6]:
import pandas as pd
import base64

def _decode_sentence(encoded):
    """Base64-decode one sentence string and return it as latin-1 text.

    "==" is appended unconditionally before decoding, presumably so that
    values stored without their trailing padding still decode.
    """
    raw_bytes = base64.b64decode(encoded + "==")
    return raw_bytes.decode('latin-1')


# Read the train and test splits from the working directory
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Add a plain-text column next to the Base64 'sentence' column in both frames
for frame in (train_data, test_data):
    frame['decoded_text'] = frame['sentence'].astype(str).apply(_decode_sentence)

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

# Seed Python, NumPy and PyTorch before the model is built: the classification
# head is not part of the pretrained checkpoint and is randomly initialised, so
# without a seed every run starts from different weights.
from transformers import set_seed

set_seed(42)

# Initialize the tokenizer and model.
# num_labels=1 gives the head a single output, i.e. one predicted temperature
# per sentence (regression rather than classification).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)

# Tokenize the dataset: over-long sentences are truncated and all sequences
# are padded to a common length.
train_encodings = tokenizer(train_data['decoded_text'].tolist(), truncation=True, padding=True)

# Create a PyTorch dataset from the encodings and temperatures
import torch

class TempDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with temperature targets.

    Args:
        encodings: mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence, as returned by the tokenizer.
        temperatures: optional sequence with one numeric target per example.
            When omitted, items carry no ``labels`` entry (inference only).

    Each item is a dict of tensors, one per encoding key, plus ``labels``
    when targets were supplied.
    """

    def __init__(self, encodings, temperatures=None):
        self.encodings = encodings
        self.temperatures = temperatures

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.temperatures is not None:
            # Force float32: without an explicit dtype an integer target
            # (e.g. a whole-number temperature or a -1 placeholder) becomes an
            # int64 tensor, which does not match the float regression output.
            item['labels'] = torch.tensor(self.temperatures[idx], dtype=torch.float)
        return item

    def __len__(self):
        if self.temperatures is not None:
            return len(self.temperatures)
        # No targets: the number of examples is the length of any feature list.
        return len(next(iter(self.encodings.values())))

# Pair every tokenised training sentence with its target temperature
train_temperatures = train_data['temperature'].tolist()
train_dataset = TempDataset(train_encodings, train_temperatures)

# Hyperparameters collected in one place.
# Batch size 16 with 2 accumulation steps means one optimizer step per
# 32 samples on each device.
trainer_settings = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
)
training_args = TrainingArguments(**trainer_settings)

# Set up the Trainer (no evaluation set is supplied)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Fine-tune the model; left as the final expression so the cell displays
# the returned training summary
trainer.train()


2023-04-29 15:58:40.995504: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-29 15:58:43.200531: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Step,Training Loss
500,0.0214
1000,0.0147
1500,0.0138
2000,0.0134
2500,0.0127
3000,0.0125
3500,0.0109
4000,0.01
4500,0.0098
5000,0.0096


TrainOutput(global_step=9600, training_loss=0.010551981410632532, metrics={'train_runtime': 1806.8496, 'train_samples_per_second': 170.02, 'train_steps_per_second': 5.313, 'total_flos': 2.0267150552064e+16, 'train_loss': 0.010551981410632532, 'epoch': 3.0})

In [8]:
# Tokenise the decoded test sentences with the same options used for training
test_texts = test_data['decoded_text'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# The test split is given one dummy target of -1 per row, because
# TempDataset is constructed here with a label list.
placeholder_labels = [-1] * len(test_data)
test_dataset = TempDataset(test_encodings, placeholder_labels)

# Predict temperatures for the test dataset and store them, flattened to
# one value per row, in a new 'temperature' column
predictions = trainer.predict(test_dataset)
predicted_temperatures = predictions.predictions.flatten()
test_data['temperature'] = predicted_temperatures


In [10]:
from datetime import datetime

# Timestamped file name so repeated runs never overwrite an earlier submission
run_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
outFile = f'submission_{run_stamp}.csv'

# Keep only the two columns written to the submission file, without the index
submission = test_data[['id', 'temperature']]
submission.to_csv(outFile, index=False)

def upload_CTFSG(token, grader, file):
    """Submit a prediction file to the grader and tag the file with its score.

    Args:
        token: user token passed to the grader client.
        grader: base URL of the grader service.
        file: path of the CSV file to submit.

    Returns:
        The grader's JSON response, parsed into Python objects.

    Side effects:
        Downloads ``pyctfsglib.py`` into the current directory and imports it.
        If the response carries a ``multiplier``, the submitted file is
        renamed in place to ``<multiplier>_sklearn_<original stem>.csv``.
    """
    import urllib.request, os, json

    # NOTE(security): this fetches unpinned code from the network and then
    # imports (executes) it. Consider vendoring the helper or pinning a commit.
    urllib.request.urlretrieve('https://raw.githubusercontent.com/alttablabs/ctfsg-utils/master/pyctfsglib.py', './pyctfsglib.py')
    print('Downloaded pyctfsglib.py:', 'pyctfsglib.py' in os.listdir())
    import pyctfsglib as ctfsg

    client = ctfsg.DSGraderClient(grader, token)
    response = json.loads(client.submitFile(file))

    # Only rename when a score came back; otherwise return the response
    # untouched so the caller can see what the server said.
    if isinstance(response, dict) and 'multiplier' in response:
        # Prefix the file *name*, not the whole path, so files that live in
        # another directory are renamed next to where they already are.
        directory, filename = os.path.split(file)
        stem, _ = os.path.splitext(filename)
        scored_name = f'{response["multiplier"]}_sklearn_{stem}.csv'
        os.rename(file, os.path.join(directory, scored_name))
    return response

import os
import random
from getpass import getpass

# Two grader endpoints are available; pick one at random for this run
GRADER_URL = random.choice([
  "http://chals.f.cyberthon23.ctf.sg:42081/",
  "http://chals.f.cyberthon23.ctf.sg:42082/"
])

# The personal submission token must not live in the notebook: read it from
# the CTFSG_TOKEN environment variable, or prompt for it without echoing.
# NOTE(review): a token was previously committed here in plain text and
# should be treated as compromised (rotate it if possible).
token = os.environ.get("CTFSG_TOKEN") or getpass("CTFSG user token: ")
if not token:
    raise RuntimeError("No grader token supplied; set CTFSG_TOKEN or enter it when prompted.")

print(upload_CTFSG(token, GRADER_URL, outFile))

Downloaded pyctfsglib.py: True
DSGraderClient: Successfully Connected!
[SERVER] MOTD: CHECK your USER_TOKEN and GRADER_URL HTTP address! I'm LLM Heat @ds-llm-alpha-5c8999ff97-kfp2s
ProofOfWork Challenge =>  ('CTFSGRB737e02ab1df78ec77a81749fcb42524e', 22)
ProofOfWork Answer Found! =>  14062484
{'challenge': {'name': 'Getting Hot'}, 'id': 'clh1qdfywjn190903j62rdoyf', 'status': 'PARTIALLY_CORRECT', 'multiplier': 0.8895, 'submittedBy': {'username': 'wakatta'}, 'createdAt': '2023-04-29T08:38:29Z'}
