<a href="https://colab.research.google.com/github/asiabak/repozytorium1/blob/main/First_session.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

class GeoDataset(Dataset):
    """Torch dataset that tokenizes texts and optionally attaches coordinates.

    Args:
        texts: Indexable, sized collection of texts; each entry is passed
            through ``str()`` before tokenization.
        coords: Optional indexable collection aligned with ``texts``; each
            entry is converted to a float tensor. Leave as ``None`` for
            unlabelled data.
        tokenizer: Callable invoked with the text and the keyword arguments
            ``add_special_tokens``, ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``; it must return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length the tokenizer is asked to pad/truncate to.

    Raises:
        ValueError: If ``tokenizer`` is ``None`` or ``coords`` and ``texts``
            differ in length.
    """

    def __init__(self, texts, coords=None, tokenizer=None, max_length=128):
        # Fail at construction time instead of with an opaque
        # "'NoneType' object is not callable" on the first __getitem__.
        if tokenizer is None:
            raise ValueError("GeoDataset requires a tokenizer")
        # A mismatch would otherwise only show up as an IndexError (or as
        # silently ignored labels) part-way through an epoch.
        if coords is not None and len(coords) != len(texts):
            raise ValueError(
                f"texts and coords must have the same length "
                f"(got {len(texts)} and {len(coords)})"
            )

        self.texts = texts
        self.coords = coords
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx``.

        The result always has ``'input_ids'`` and ``'attention_mask'``
        (flattened to 1-D); ``'coords'`` is present only when the dataset
        was built with coordinates.
        """
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer is asked for 'pt' tensors; flatten() removes the
        # leading dimension so the DataLoader can add its own batch axis.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        if self.coords is not None:
            item['coords'] = torch.tensor(self.coords[idx], dtype=torch.float)

        return item

class GeoBERT(nn.Module):
    """BERT encoder followed by an MLP head that regresses two values.

    Args:
        bert_model: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, bert_model="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        # Take the head's input width from the loaded encoder rather than
        # assuming 768, so checkpoints with another hidden size (e.g.
        # bert-large) work as well.
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 2)  # 2 outputs for latitude and longitude
        )

    def forward(self, input_ids, attention_mask):
        """Return the regression output for a batch.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            The regressor applied to the dropout-regularised pooled encoder
            output; the last dimension has size 2.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(self.dropout(pooled_output))

def load_data(filename):
    """Load a headerless tab-separated data file.

    Two layouts are recognised by column count:

    * three columns -> ``latitude``, ``longitude``, ``text``
    * one column    -> ``text`` only (e.g. test_blind.txt)

    The layout is detected explicitly because ``pd.read_csv`` does not raise
    when a file has fewer columns than ``names``; it pads the missing ones
    with NaN, which would leave the text in ``latitude`` and ``text`` empty.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the file.

    Returns:
        A DataFrame with the column names described above.

    Raises:
        ValueError: If the file has neither one nor three columns.
    """
    data = pd.read_csv(filename, sep='\t', header=None)

    n_columns = data.shape[1]
    if n_columns == 3:
        data.columns = ['latitude', 'longitude', 'text']
    elif n_columns == 1:
        # For test_blind.txt which only contains text
        data.columns = ['text']
    else:
        raise ValueError(
            f"{filename}: expected 1 (text) or 3 (latitude, longitude, text) "
            f"tab-separated columns, found {n_columns}"
        )
    return data

def train_model(model, train_loader, dev_loader, device, num_epochs=5,
                lr=2e-5, checkpoint_path='best_model.pt'):
    """Fine-tune ``model`` with AdamW and MSE loss, keeping the best weights.

    After every epoch the mean per-batch loss on ``dev_loader`` is computed;
    whenever it improves, the model's ``state_dict`` is written to
    ``checkpoint_path``. When training finishes, the best checkpoint is
    loaded back into ``model`` so that callers evaluate the best weights
    rather than those of the last epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Sized iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'coords'`` tensors.
        dev_loader: Same batch format, used for validation.
        device: Device the batch tensors are moved to.
        num_epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        checkpoint_path: Where the best ``state_dict`` is saved.

    Returns:
        The lowest average validation loss observed (``inf`` if no epoch
        produced a comparable loss, e.g. ``num_epochs == 0``).

    Raises:
        ValueError: If either loader has no batches.
    """
    # The averages below divide by the batch counts; reject empty loaders
    # before spending an epoch only to hit a ZeroDivisionError.
    if len(train_loader) == 0 or len(dev_loader) == 0:
        raise ValueError(
            "train_loader and dev_loader must each contain at least one batch"
        )

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    best_val_loss = float('inf')
    checkpoint_written = False
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coords = batch['coords'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, coords)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation on dev set
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in dev_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                coords = batch['coords'].to(device)

                outputs = model(input_ids, attention_mask)
                val_loss += criterion(outputs, coords).item()

        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = val_loss / len(dev_loader)
        print(f'Epoch {epoch+1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True

    # Only reload a checkpoint written by this call; a NaN validation loss
    # never compares as an improvement, so there may be none.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return best_val_loss

def evaluate_model(model, test_loader, device):
    """Compute and print the MSE of ``model`` on a labelled loader.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'coords'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The value of ``mean_squared_error`` between the gold coordinates and
        the predictions collected over all batches.
    """
    model.eval()

    predicted_rows, gold_rows = [], []
    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['coords'].to(device)

            predicted = model(ids, mask)

            # Extending with a 2-D array appends one row per sample.
            predicted_rows.extend(predicted.cpu().numpy())
            gold_rows.extend(gold.cpu().numpy())

    mse = mean_squared_error(gold_rows, predicted_rows)
    print(f'Test MSE: {mse:.4f}')
    return mse

def predict_blind(model, test_loader, device):
    """Run ``model`` over an unlabelled loader and collect its outputs.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A NumPy array built from one output row per sample, in loader order.
    """
    model.eval()

    with torch.no_grad():
        per_batch = [
            model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            ).cpu().numpy()
            for batch in test_loader
        ]

    # Flatten the per-batch arrays into a single list of per-sample rows.
    rows = [row for batch_output in per_batch for row in batch_output]
    return np.array(rows)

def main():
    """Train GeoBERT, report gold-test MSE and write blind-test predictions.

    Reads ``train.txt``, ``dev.txt``, ``test_gold.txt`` and
    ``test_blind.txt`` via ``load_data``, trains on the train split with the
    dev split for validation, prints the MSE on the gold test split, and
    saves the blind-split predictions to ``blind_test_predictions.csv``.
    """
    coord_columns = ['latitude', 'longitude']
    batch_size = 16

    # Load the three labelled splits and the unlabelled blind split.
    train_frame = load_data('train.txt')
    dev_frame = load_data('dev.txt')
    gold_frame = load_data('test_gold.txt')
    blind_frame = load_data('test_blind.txt')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def labelled(frame):
        # Pair each text with its (latitude, longitude) row.
        return GeoDataset(
            frame['text'].values,
            frame[coord_columns].values,
            tokenizer
        )

    train_dataset = labelled(train_frame)
    dev_dataset = labelled(dev_frame)
    test_gold_dataset = labelled(gold_frame)
    test_blind_dataset = GeoDataset(
        blind_frame['text'].values,
        tokenizer=tokenizer
    )

    # Only the training loader shuffles; evaluation order must stay stable
    # so blind predictions line up with the input rows.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
    test_gold_loader = DataLoader(test_gold_dataset, batch_size=batch_size)
    test_blind_loader = DataLoader(test_blind_dataset, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GeoBERT().to(device)

    train_model(model, train_loader, dev_loader, device)

    # evaluate_model prints the MSE itself; the return value is not needed.
    evaluate_model(model, test_gold_loader, device)

    blind_predictions = predict_blind(model, test_blind_loader, device)

    predictions_frame = pd.DataFrame(
        blind_predictions,
        columns=['predicted_latitude', 'predicted_longitude']
    )
    predictions_frame.to_csv('blind_test_predictions.csv', index=False)

# Run the pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1:
Average training loss: 191.2955
Average validation loss: 0.2631
Epoch 2:
Average training loss: 9.0789
Average validation loss: 0.2770
Epoch 3:
Average training loss: 7.7435
Average validation loss: 0.2790


KeyboardInterrupt: 