<a href="https://colab.research.google.com/github/asiabak/repozytorium1/blob/main/Claude_SVM_two_models_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

class GeoDataset(Dataset):
    def __init__(self, texts, coords=None, tokenizer=None, max_length=128):
        self.texts = texts
        self.coords = coords
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        if self.coords is not None:
            item['coords'] = torch.tensor(self.coords[idx], dtype=torch.float)

        return item

class GeoBERT(nn.Module):
    def __init__(self, bert_model="bert-base-uncased"):
        super(GeoBERT, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 2)  # 2 outputs for latitude and longitude
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(self.dropout(pooled_output))

def load_data(filename):
    """Load data from tab-separated files without headers"""
    try:
        # Load data without headers, using numeric column names
        data = pd.read_csv(filename, sep='\t', header=None)
        # Take first two columns as coordinates and third as text
        coords = data[[0, 1]].values  # latitude and longitude
        texts = data[2].values  # text
        return coords, texts
    except pd.errors.EmptyDataError:
        print(f"Error: {filename} is empty")
        return None, None
    except Exception as e:
        # For test_blind.txt which might only contain text
        try:
            data = pd.read_csv(filename, sep='\t', header=None)
            return None, data[0].values  # return only texts
        except Exception as e:
            print(f"Error loading {filename}: {e}")
            return None, None

def train_model(model, train_loader, dev_loader, device, num_epochs=5, patience=3):
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.MSELoss()

    best_val_loss = float('inf')
    patience_counter = 0  # Counter for early stopping

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coords = batch['coords'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, coords)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation on dev set
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in dev_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                coords = batch['coords'].to(device)

                outputs = model(input_ids, attention_mask)
                val_loss += criterion(outputs, coords).item()

        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = val_loss / len(dev_loader)

        print(f'Epoch {epoch+1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')

        # Check if validation loss improved
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_model.pt')
            patience_counter = 0  # Reset patience counter
        else:
            patience_counter += 1

        # Early stopping condition
        if patience_counter >= patience:
            print("Early stopping triggered. Training halted.")
            break


def evaluate_model(model, test_loader, device):
    """Evaluate model on test set with known coordinates"""
    model.eval()
    all_preds = []
    all_coords = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coords = batch['coords'].to(device)

            outputs = model(input_ids, attention_mask)
            all_preds.extend(outputs.cpu().numpy())
            all_coords.extend(coords.cpu().numpy())

    mse = mean_squared_error(all_coords, all_preds)
    print(f'Test MSE: {mse:.4f}')
    return mse

def predict_blind(model, test_loader, device):
    """Generate predictions for blind test set"""
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            predictions.extend(outputs.cpu().numpy())

    return np.array(predictions)

def main():
    # Load datasets
    train_coords, train_texts = load_data('train.txt')
    dev_coords, dev_texts = load_data('dev.txt')
    test_gold_coords, test_gold_texts = load_data('test_gold.txt')
    _, test_blind_texts = load_data('test_blind.txt')

    # Initialize tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Create datasets
    train_dataset = GeoDataset(train_texts, train_coords, tokenizer)
    dev_dataset = GeoDataset(dev_texts, dev_coords, tokenizer)
    test_gold_dataset = GeoDataset(test_gold_texts, test_gold_coords, tokenizer)
    test_blind_dataset = GeoDataset(test_blind_texts, tokenizer=tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=16)
    test_gold_loader = DataLoader(test_gold_dataset, batch_size=16)
    test_blind_loader = DataLoader(test_blind_dataset, batch_size=16)

    # Initialize model and training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GeoBERT().to(device)

    # Train the model
    train_model(model, train_loader, dev_loader, device)

    # Evaluate on test set with known coordinates
    test_mse = evaluate_model(model, test_gold_loader, device)

    # Generate predictions for blind test set
    blind_predictions = predict_blind(model, test_blind_loader, device)

    # Save blind test predictions
    np.savetxt('blind_test_predictions.txt', blind_predictions, delimiter='\t', fmt='%.6f')

if __name__ == "__main__":
    main()

Epoch 1:
Average training loss: 197.8389
Average validation loss: 0.3499
Epoch 2:
Average training loss: 10.6624
Average validation loss: 0.2858
Epoch 3:
Average training loss: 9.2614
Average validation loss: 0.2925
Epoch 4:
Average training loss: 8.1959
Average validation loss: 0.2693
Epoch 5:
Average training loss: 7.3002
Average validation loss: 0.2879
Test MSE: 0.2847


In [1]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296229 sha256=71c2a16c41a1d0252aa13990777e6e578b771228d81bed7d868e3ff43c43dfab
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
import fasttext
import fasttext.util
from io import BytesIO
import requests
import warnings
warnings.filterwarnings('ignore')

def load_data(filename):
    """Load data from tab-separated files without headers"""
    try:
        data = pd.read_csv(filename, sep='\t', header=None)
        coords = data[[0, 1]].values  # latitude and longitude
        texts = data[2].values  # text
        return coords, texts
    except Exception as e:
        # For test_blind.txt which might only contain text
        try:
            data = pd.read_csv(filename, sep='\t', header=None)
            return None, data[0].values
        except Exception as e:
            print(f"Error loading {filename}: {e}")
            return None, None

class GeoPredictor:
    def __init__(self, use_aligned=False):
        self.use_aligned = use_aligned
        self.ft_model = None
        self.svm_model = None

    def load_fasttext(self):
        """Load FastText model based on configuration"""
        if self.use_aligned:
            # Download aligned vectors directly from URL
            url = "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.de.align.vec"
            response = requests.get(url, stream=True)
            vector_file = BytesIO(response.content)
            self.ft_model = fasttext.load_model(vector_file)
        else:
            # Load standard FastText model
            fasttext.util.download_model('en', if_exists='ignore')
            self.ft_model = fasttext.load_model('cc.en.300.bin')

    def get_text_vector(self, text):
        """Get vector representation for a text using FastText"""
        return self.ft_model.get_sentence_vector(str(text))

    def prepare_features(self, texts):
        """Convert all texts to FastText vectors"""
        return np.array([self.get_text_vector(text) for text in texts])

    def train(self, train_texts, train_coords, dev_texts, dev_coords):
        """Train SVM model"""
        print("Converting texts to vectors...")
        X_train = self.prepare_features(train_texts)
        X_dev = self.prepare_features(dev_texts)

        print("Training SVM model...")
        # Using SVR with RBF kernel for each coordinate
        base_svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
        self.svm_model = MultiOutputRegressor(base_svr)
        self.svm_model.fit(X_train, train_coords)

        # Evaluate on dev set
        dev_pred = self.svm_model.predict(X_dev)
        dev_mse = mean_squared_error(dev_coords, dev_pred)
        print(f"Development MSE: {dev_mse:.4f}")

        return dev_mse

    def evaluate(self, test_texts, test_coords):
        """Evaluate model on test set"""
        X_test = self.prepare_features(test_texts)
        test_pred = self.svm_model.predict(X_test)
        test_mse = mean_squared_error(test_coords, test_pred)
        print(f"Test MSE: {test_mse:.4f}")
        return test_mse

    def predict(self, texts):
        """Generate predictions for new texts"""
        X = self.prepare_features(texts)
        return self.svm_model.predict(X)

def main():
    # Load datasets
    train_coords, train_texts = load_data('train.txt')
    dev_coords, dev_texts = load_data('dev.txt')
    test_gold_coords, test_gold_texts = load_data('test_gold.txt')
    _, test_blind_texts = load_data('test_blind.txt')

    # Train and evaluate standard FastText model
    print("\nTraining model with standard FastText vectors...")
    standard_model = GeoPredictor(use_aligned=False)
    standard_model.load_fasttext()
    standard_dev_mse = standard_model.train(train_texts, train_coords, dev_texts, dev_coords)
    standard_test_mse = standard_model.evaluate(test_gold_texts, test_gold_coords)

    # Save predictions for blind test set
    standard_blind_pred = standard_model.predict(test_blind_texts)
    np.savetxt('blind_test_predictions_standard.txt', standard_blind_pred, delimiter='\t', fmt='%.6f')

    # Train and evaluate aligned FastText model
    print("\nTraining model with aligned FastText vectors...")
    aligned_model = GeoPredictor(use_aligned=True)
    aligned_model.load_fasttext()
    aligned_dev_mse = aligned_model.train(train_texts, train_coords, dev_texts, dev_coords)
    aligned_test_mse = aligned_model.evaluate(test_gold_texts, test_gold_coords)

    # Save predictions for blind test set
    aligned_blind_pred = aligned_model.predict(test_blind_texts)
    np.savetxt('blind_test_predictions_aligned.txt', aligned_blind_pred, delimiter='\t', fmt='%.6f')

    # Print comparison
    print("\nModel Comparison:")
    print(f"Standard FastText - Dev MSE: {standard_dev_mse:.4f}, Test MSE: {standard_test_mse:.4f}")
    print(f"Aligned FastText  - Dev MSE: {aligned_dev_mse:.4f}, Test MSE: {aligned_test_mse:.4f}")

if __name__ == "__main__":
    main()


Training model with standard FastText vectors...
Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
