<a href="https://colab.research.google.com/github/asiabak/repozytorium1/blob/BERT-training/BERT_with_text_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
import warnings
import re
warnings.filterwarnings('ignore')

class GeoDataset(Dataset):
    """Dataset that tokenizes one text per access and optionally attaches its target.

    Args:
        texts: Indexable collection of texts; each entry is converted with ``str()``.
        coords: Optional indexable collection of per-text targets. When ``None``
            the returned items have no ``'coords'`` entry.
        tokenizer: Callable invoked with the text and the padding/truncation
            options below; its result must provide ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every text is padded or truncated to.
    """

    def __init__(self, texts, coords=None, tokenizer=None, max_length=128):
        self.texts, self.coords = texts, coords
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and, if available, ``coords``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # flatten() hands the DataLoader 1-D tensors per sample
        # (presumably the tokenizer output carries a leading batch axis).
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}

        if self.coords is None:
            return sample

        sample['coords'] = torch.tensor(self.coords[idx], dtype=torch.float)
        return sample

class GeoBERT(nn.Module):
    """BERT encoder followed by a small MLP that regresses two values per text.

    Args:
        bert_model: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, bert_model="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        # Size the head from the loaded encoder instead of assuming 768, so
        # checkpoints with another hidden width (e.g. a "large" variant) work.
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 2)  # 2 outputs for latitude and longitude
        )

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for the encoder's pooled representation."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(self.dropout(pooled_output))

def clean_text(text):
    """Return a lowercased copy of ``text`` without URLs, @mentions or punctuation.

    Hashtags keep their word but lose the ``#``; every remaining character
    that is neither a word character nor whitespace becomes a space, and runs
    of whitespace collapse to a single space.
    """
    # Applied strictly in this order: later patterns rely on earlier removals.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),  # URLs
        (r'@\w+', ''),                     # @mentions
        (r'#(\w+)', r'\1'),                # hashtags: keep the word only
        (r'[^\w\s]', ' '),                 # remaining special characters
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    collapsed = ' '.join(text.split())
    return collapsed.lower().strip()

def load_data(filename):
    """Load a headerless tab-separated file.

    Files with at least three columns are read as two coordinate columns
    followed by a text column. Files with fewer columns (e.g. a blind test
    set) are treated as text-only and their first column is used as the text.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(coords, texts)`` tuple. ``coords`` is ``None`` for text-only
        files. Both are ``None`` when the file is empty or cannot be read, in
        which case the reason is printed.
    """
    # Parse once; the layout is decided from the column count afterwards
    # rather than by catching an indexing error and re-reading the file.
    # NOTE(review): the parser's default quote handling is in effect; confirm
    # the texts never contain unbalanced double quotes.
    try:
        data = pd.read_csv(filename, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        print(f"Error: {filename} is empty")
        return None, None
    except Exception as e:
        # Deliberately broad: callers expect (None, None) for any load failure.
        print(f"Error loading {filename}: {e}")
        return None, None

    if data.shape[1] >= 3:
        coords = data[[0, 1]].values  # latitude and longitude
        texts = data[2].values  # raw text, no cleaning applied
        return coords, texts

    # Fewer than three columns: only texts are available.
    return None, data[0].values

def train_model(model, train_loader, dev_loader, device, num_epochs=5, patience=3,
                checkpoint_path='best_model.pt'):
    """Fine-tune ``model`` with MSE loss, early stopping and best-checkpoint restore.

    After every epoch the model is scored on ``dev_loader``. Whenever the
    average validation loss improves, the weights are written to
    ``checkpoint_path``. Once training ends (all epochs done or early stop),
    the best saved weights are loaded back so the caller continues with the
    best model rather than the last-epoch one.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'coords'`` tensors; must support ``len()``.
        dev_loader: Same batch structure, used for validation.
        device: Device the batch tensors are moved to.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs that triggers
            early stopping.
        checkpoint_path: Where the best weights are stored.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.MSELoss()

    best_val_loss = float('inf')
    patience_counter = 0  # Counter for early stopping
    checkpoint_saved = False  # Guards against loading a file this run never wrote

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coords = batch['coords'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, coords)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation on dev set
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in dev_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                coords = batch['coords'].to(device)

                outputs = model(input_ids, attention_mask)
                val_loss += criterion(outputs, coords).item()

        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = val_loss / len(dev_loader)

        print(f'Epoch {epoch+1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')

        # Check if validation loss improved
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            patience_counter = 0  # Reset patience counter
        else:
            patience_counter += 1

        # Early stopping condition
        if patience_counter >= patience:
            print("Early stopping triggered. Training halted.")
            break

    # Without this the model keeps the weights of the final epoch, which may
    # be worse on the dev set than the checkpoint that was saved.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))


def evaluate_model(model, test_loader, device):
    """Score ``model`` on a labelled loader and return the mean squared error.

    Each batch must provide ``'input_ids'``, ``'attention_mask'`` and
    ``'coords'``. The MSE over all collected rows is printed and returned.
    """
    model.eval()
    predicted_rows, target_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['coords'].to(device)

            # Gather per-sample rows on the CPU for the metric computation.
            predicted_rows += list(model(token_ids, mask).cpu().numpy())
            target_rows += list(targets.cpu().numpy())

    mse = mean_squared_error(target_rows, predicted_rows)
    print(f'Test MSE: {mse:.4f}')
    return mse

def predict_blind(model, test_loader, device):
    """Run ``model`` over an unlabelled loader and return all outputs as one array.

    Each batch must provide ``'input_ids'`` and ``'attention_mask'``. The
    result stacks the per-sample output rows in loader order.
    """
    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for batch in test_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_outputs.append(model(token_ids, mask).cpu().numpy())

    # Flatten batch-wise arrays into a single list of rows before stacking.
    return np.array([row for chunk in batch_outputs for row in chunk])

def main():
    """Train GeoBERT on train/dev, report test MSE and write blind-set predictions.

    Reads ``train.txt``, ``dev.txt``, ``test_gold.txt`` and ``test_blind.txt``
    from the working directory and writes the blind predictions to
    ``blind_test_predictions_ch_with_cleaning.txt``.

    Raises:
        ValueError: If a required file could not be loaded or a labelled file
            lacks coordinate columns.
    """
    # Load datasets
    train_coords, train_texts = load_data('train.txt')
    dev_coords, dev_texts = load_data('dev.txt')
    test_gold_coords, test_gold_texts = load_data('test_gold.txt')
    _, test_blind_texts = load_data('test_blind.txt')

    # load_data signals failure by returning None; stop here with a clear
    # message instead of failing later inside the dataset or data loader.
    labelled = {
        'train.txt': (train_coords, train_texts),
        'dev.txt': (dev_coords, dev_texts),
        'test_gold.txt': (test_gold_coords, test_gold_texts),
    }
    for filename, (coords, texts) in labelled.items():
        if coords is None or texts is None:
            raise ValueError(f"{filename} must provide coordinates and text")
    if test_blind_texts is None:
        raise ValueError("test_blind.txt could not be loaded")

    # Initialize tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Create datasets
    train_dataset = GeoDataset(train_texts, train_coords, tokenizer)
    dev_dataset = GeoDataset(dev_texts, dev_coords, tokenizer)
    test_gold_dataset = GeoDataset(test_gold_texts, test_gold_coords, tokenizer)
    test_blind_dataset = GeoDataset(test_blind_texts, tokenizer=tokenizer)

    # Create data loaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=16)
    test_gold_loader = DataLoader(test_gold_dataset, batch_size=16)
    test_blind_loader = DataLoader(test_blind_dataset, batch_size=16)

    # Initialize model and training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GeoBERT().to(device)

    # Train the model
    train_model(model, train_loader, dev_loader, device)

    # Evaluate on test set with known coordinates (the MSE is printed)
    evaluate_model(model, test_gold_loader, device)

    # Generate predictions for blind test set
    blind_predictions = predict_blind(model, test_blind_loader, device)

    # Save blind test predictions
    np.savetxt('blind_test_predictions_ch_with_cleaning.txt', blind_predictions, delimiter='\t', fmt='%.6f')

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1:
Average training loss: 182.0335
Average validation loss: 0.2783
Epoch 2:
Average training loss: 10.0117
Average validation loss: 0.3394
Epoch 3:
Average training loss: 8.6794
Average validation loss: 0.2634
Epoch 4:
Average training loss: 7.9111
Average validation loss: 0.3261
Epoch 5:
Average training loss: 7.5208
Average validation loss: 0.4209
Test MSE: 0.4173


In [None]:
# Restore previously saved weights into a freshly constructed GeoBERT.
file = "best_model_with_cleaning.pt"
model = GeoBERT()
# The model is still on the CPU here; map_location keeps the load working on
# a CPU-only runtime even if the checkpoint was written from a GPU session.
model.load_state_dict(torch.load(file, map_location="cpu"))

<All keys matched successfully>

In [None]:
# Switch to inference mode (dropout layers become inactive); as the cell's
# last expression this also echoes the module tree.
model.eval()

GeoBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:
import torch
from transformers import BertTokenizer
import numpy as np
from torch.utils.data import DataLoader

# Rebuild the model and restore the checkpoint written by train_model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
file = "best_model.pt"
model = GeoBERT()
model.to(device)  # Move model to GPU if available
# map_location places the stored tensors on the active device, so a
# checkpoint saved from a GPU session also loads on a CPU-only runtime.
model.load_state_dict(torch.load(file, map_location=device))
model.eval()  # Set to evaluation mode

# Load test_blind data (texts are used as loaded; no cleaning step is applied)
_, test_blind_texts = load_data('test_blind.txt')  # Using your existing load_data function

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset and dataloader for test_blind
test_blind_dataset = GeoDataset(test_blind_texts, coords=None, tokenizer=tokenizer)
test_blind_loader = DataLoader(test_blind_dataset, batch_size=16)

# Generate predictions with the shared inference helper instead of repeating
# its loop here; it already returns a numpy array.
predictions = predict_blind(model, test_blind_loader, device)

# Save predictions to file
np.savetxt('blind_test_predictions_best_model1.txt', predictions, delimiter='\t', fmt='%.6f')

print("Predictions saved to 'blind_test_predictions_best_model1.txt'")
print(f"Shape of predictions: {predictions.shape}")

Predictions saved to 'blind_test_predictions_best_model1.txt'
Shape of predictions: (2438, 2)
