In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch import nn
from torch import cos, sin, arccos, mean, square, sqrt, arcsin
import torch
from transformers import Trainer
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
import random
from datetime import datetime as dt
from utils import 

## MODEL
BASE_MODEL = '2021-11-12_model-distilroberta-base_loss-huber_epoch-4'
TOKEN_MODEL = 'distilroberta-base'
MAX_SEQ_LENGTH = 300
NUM_LABELS = 2
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 4
LOSS = 'huber'
DATE = str(dt.now().date())
LOGSTR = f"{DATE}_model-{TOKEN_MODEL}_loss-{LOSS}"


data = pd.read_parquet('gs://geobert/data/geo_data.parquet')
data = data.dropna().drop_duplicates('text')
texts = data["text"].str.replace('\n', '').values.tolist()
labels = data[['lat', 'lon']].values.astype(float).tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=.2)


tokenizer = RobertaTokenizerFast.from_pretrained(TOKEN_MODEL)
# tokenizer = DistilBertTokenizerFast.from_pretrained(TOKEN_MODEL)
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
# test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)    

train_dataset = GEODataset(train_encodings, train_labels)
val_dataset = GEODataset(val_encodings, val_labels)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

config = RobertaConfig()
config.num_labels = NUM_LABELS
config.max_position_embeddings = MAX_SEQ_LENGTH

model = RobertaForSequenceClassification(config).from_pretrained(BASE_MODEL)
# model = DistilBertForSequenceClassification(config).from_pretrained(BASE_MODEL)
model.to(device)


train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = list(DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE, shuffle=True))

optim = AdamW(model.parameters(), lr=5e-5)

writer = SummaryWriter(log_dir="logs")
losses = []
iteration = 0
for epoch in list(range(5,15)):
    for batch in train_loader:
        model.train()
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels, output_hidden_states=True)
        logits = outputs.get('logits')
        loss_fct = nn.HuberLoss()
        train_loss = loss_fct(logits.view(-1, 2), labels.view(-1,2))
        train_loss.backward()
        optim.step()
        train_loss_float = float(train_loss) 
        del input_ids, attention_mask, labels, logits, train_loss
        
        model.eval()
        with torch.no_grad():
            val_batch = random.choice(val_loader)
            val_input_ids = val_batch['input_ids'].to(device)
            val_attention_mask = val_batch['attention_mask'].to(device)
            val_labels = val_batch['labels'].to(device)
            val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels, output_hidden_states=True)
            val_logits = val_outputs.get('logits')
            val_loss = loss_fct(val_logits, val_labels)
            val_loss_float = float(val_loss)
            del val_input_ids, val_attention_mask, val_labels, val_logits, val_loss
        
        losses.append([
            iteration, 
            train_loss_float, 
            val_loss_float
        ])
        print(f"TRAIN: {train_loss_float:.3f}, VAL: {val_loss_float:.3f}")
        writer.add_scalar(LOGSTR + "-train", train_loss_float, iteration)
        writer.add_scalar(LOGSTR + "-test", val_loss_float, iteration)
        iteration +=1
        
    
    model.save_pretrained(LOGSTR + f"_epoch-{epoch}")

model.eval()
model.save_pretrained(LOGSTR + f"_epoch-{epoch}")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

LOSS: 8987.1123046875
LOSS: 10395.85546875


KeyboardInterrupt: 

In [77]:
from transformers import pipeline

In [80]:
nlp = pipeline('text-classification', model=model, tokenizer=tokenizer, return_all_scores=True, function_to_apply=None)

In [81]:
nlp("Ich fahre durchs schöne Berlin")

[[{'label': 'LABEL_0', 'score': 0.7113227248191833},
  {'label': 'LABEL_1', 'score': 0.4895717203617096}]]