In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch import nn
from torch import cos, sin, arccos, mean, square, sqrt, arcsin
import torch
from transformers import Trainer
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
import random
from datetime import datetime as dt


PI = torch.acos(torch.zeros(1)).item() * 2
EARTH_RADIUS = 6371
DEG2RAD = PI/180

data = pd.read_csv('geodat.csv', header=0, names=['pageid', 'ns', 'title', 'coordinates', 'extract', 'lat', 'lon'])
data = data.dropna()
train_texts = data["extract"].values.tolist()
train_labels = data[["lat",  "lon"]].astype(float).values.tolist()
len(train_texts)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
# test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)


class GEODataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)
    
def haversine_dist(logits, labels):
    ## phi = lat = index 0, lambda = lon = index 1
    labels = DEG2RAD * labels
    d_sigma = 2 * (
        arcsin(
            sqrt(
                square(
                    sin(
                        (logits[:,0]-labels[:,0]) / 2 )
                ) 
                + 
                cos(logits[:,0]) 
                * 
                cos(labels[:,0]) 
                * 
                square(
                    sin(
                        (logits[:,1]-labels[:,1]) / 2 )
                ) 
            )
        )
    )
    hav_dist = EARTH_RADIUS * d_sigma
    hav_dist_mean = mean(hav_dist)
    return hav_dist_mean

train_dataset = GEODataset(train_encodings, train_labels)
val_dataset = GEODataset(val_encodings, val_labels)
# test_dataset = IMDbDataset(test_encodings, test_labels)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = list(DataLoader(val_dataset, batch_size=8, shuffle=True))

optim = AdamW(model.parameters(), lr=5e-5)

writer = SummaryWriter(log_dir="logs")
losses = []
iteration = 0
for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels, output_hidden_states=True)
        logits = outputs.get('logits')
        loss_fct = haversine_dist
        train_loss = loss_fct(logits, labels)
        train_loss.backward()
        optim.step()
        train_loss_float = float(train_loss) 
        del input_ids, attention_mask, labels, logits, train_loss
        
        with torch.no_grad():
            val_batch = random.choice(val_loader)
            val_input_ids = val_batch['input_ids'].to(device)
            val_attention_mask = val_batch['attention_mask'].to(device)
            val_labels = val_batch['labels'].to(device)
            val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels, output_hidden_states=True)
            val_logits = val_outputs.get('logits')
            val_loss = loss_fct(val_logits, val_labels)
            val_loss_float = float(val_loss)
            del val_input_ids, val_attention_mask, val_labels, val_logits, val_loss
        
        losses.append([iteration, train_loss_float, val_loss_float])
        print(f"TRAIN: {train_loss_float:.3f}, VAL: {val_loss_float:.3f}")
        
        
        # writer.add_scalar('Loss/train', train_loss_float, iteration)
#         writer.add_scalar('Loss/test', val_loss_float, iteration)
        iteration +=1


model.eval()
model.save_pretrained(f'geobert-{dt.now()}.model')

# from transformers import pipeline
# nlp = pipeline('text-classification', model=model, tokenizer=tokenizer, return_all_scores=True, function_to_apply=None)
# nlp("Ich fahre durchs schöne Berlin")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

LOSS: 8987.1123046875
LOSS: 10395.85546875


KeyboardInterrupt: 

In [77]:
from transformers import pipeline

In [80]:
nlp = pipeline('text-classification', model=model, tokenizer=tokenizer, return_all_scores=True, function_to_apply=None)

In [81]:
nlp("Ich fahre durchs schöne Berlin")

[[{'label': 'LABEL_0', 'score': 0.7113227248191833},
  {'label': 'LABEL_1', 'score': 0.4895717203617096}]]