In [1]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch.utils.data import Dataset,DataLoader
import pandas
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class IMDBdataest(Dataset):
    """Map-style dataset that tokenizes review texts with the GPT-2 tokenizer.

    Every text is tokenized once, at construction time; items are then served
    by index as tensors.

    Attributes:
        tokenizer: ``GPT2Tokenizer`` loaded from the ``"gpt2"`` checkpoint,
            with its pad token set to its EOS token.
        inputs: Raw tokenizer output for all texts.
        ids: ``inputs['input_ids']``.
        attention_masks: ``inputs['attention_mask']``.
        labels: The labels object passed to the constructor, stored as-is.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and keep ``labels`` alongside them.

        Args:
            texts: Texts handed to the tokenizer in a single call.
            labels: Indexable collection of labels; its length defines the
                dataset length.
        """
        super().__init__()
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # Reuse the end-of-sequence token as the padding token.
        tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer

        # One tokenizer call over the whole corpus, truncating at 1024 tokens.
        encoded = tokenizer(texts, truncation=True, max_length=1024, padding=True)
        self.inputs = encoded
        self.ids = encoded['input_ids']
        self.attention_masks = encoded['attention_mask']
        self.labels = labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``((input_ids, attention_mask), label)`` as tensors for ``index``."""
        token_ids = torch.tensor(self.ids[index])
        mask = torch.tensor(self.attention_masks[index])
        target = torch.tensor(self.labels[index])
        return (token_ids, mask), target

In [3]:
data = pandas.read_csv('./motionClassify.csv')

# Rows [0, 40000) form the training split; every remaining row is the test split.
# Each review is cut down to at most its first 1024 whitespace-separated words.
train_texts = [' '.join(review.split()[:1024]) for review in data[:40000]['review']]
train_labels = list(data[:40000]['label'])

test_texts = [' '.join(review.split()[:1024]) for review in data[40000:]['review']]
test_labels = list(data[40000:]['label'])

In [4]:
batch_size = 32

# Each dataset tokenizes its whole split when it is constructed.
train_dataset = IMDBdataest(texts=train_texts, labels=train_labels)
test_dataset = IMDBdataest(texts=test_texts, labels=test_labels)

# Training batches are reshuffled and loaded by 8 worker processes;
# the test loader keeps the original row order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [5]:
# Load the pretrained "gpt2" checkpoint with a 2-label classification head.
model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
)
# Copy the tokenizer's pad token id into the model config.
# NOTE(review): presumably needed so the classification head can handle padded
# batches -- confirm against the Transformers documentation.
model.config.pad_token_id = train_dataset.tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Write the model's current state dict to 'test.pth'. This cell sits before
# the training call, so in a top-to-bottom run it captures the weights as loaded.
torch.save(obj=model.state_dict(), f='test.pth')

In [7]:
def train(net,train_iter,device,num_epochs,lr):
    """Fine-tune ``net`` with Adam and cross-entropy loss.

    Args:
        net: Model called as ``net(input_ids=..., attention_mask=...)`` whose
            result exposes a ``logits`` attribute.
        train_iter: Iterable yielding ``((input_ids, attention_mask), label)``
            batches; it is iterated once per epoch.
        device: Device the model and every batch tensor are moved to.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate passed to ``torch.optim.Adam``.

    Returns:
        A list of strings of the form ``'epoch{e},batch{b} loss={value}'``,
        one for every 100th batch of each epoch. Batch numbering starts at 1
        and restarts at the beginning of every epoch.
    """
    net.train()
    net.to(device)
    logs = []
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(1, num_epochs + 1):
        # enumerate() advances the batch counter on every step; without an
        # advancing counter the every-100-batches condition below never fires
        # and the returned log list stays empty.
        for batch, ((X, atten_mask), label) in enumerate(tqdm(train_iter), start=1):
            optimizer.zero_grad()
            X = X.to(device)
            atten_mask = atten_mask.to(device)
            label = label.to(device)
            y_hat = net(input_ids=X, attention_mask=atten_mask)
            loss = criterion(y_hat.logits, label)
            loss.backward()
            if batch % 100 == 0:
                logs.append(f'epoch{epoch},batch{batch} loss={loss.item()}')
            optimizer.step()
    return logs



In [8]:
# Prefer the first CUDA device, but fall back to the CPU so this cell still
# runs on a machine without a usable GPU instead of raising.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The returned log lines are the cell's displayed output.
train(model, train_loader, device, num_epochs=1, lr=2e-5)

  1%|          | 14/1250 [00:17<26:13,  1.27s/it]


KeyboardInterrupt: 