<a href="https://colab.research.google.com/github/asiabak/repozytorium1/blob/main/SVM_generated_by_GPT_fasttext_aligned_vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296228 sha256=943884b53722a32fa775bf65fce2afb47b647b22207c11698a66bf926fd84430
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [2]:
import fasttext
import fasttext.util
import numpy as np
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Input data files (relative paths, resolved against the working directory)
TRAIN_FILE = 'train.txt'
DEV_FILE = 'dev.txt'
TEST_FILE = 'test_blind.txt'

# Fetch (if not already present) and load the pre-trained German fastText model.
# NOTE: download_model('de') retrieves the Common Crawl model cc.de.300.bin,
# not the cross-lingually *aligned* vectors that the notebook's file name
# mentions.
print("Downloading and loading FastText model...")
# if_exists='ignore' reuses an existing local file instead of downloading again;
# the call returns the model's file name, which is used directly for loading so
# the name is not hard-coded in two places.
FT_MODEL_PATH = fasttext.util.download_model('de', if_exists='ignore')
ft = fasttext.load_model(FT_MODEL_PATH)

# Function to load training and development data (with labels)
def load_labeled_data(file_path):
    """Read a labelled, tab-separated file and embed its text column.

    Each usable line has the form ``latitude<TAB>longitude<TAB>text``. Only the
    first two tabs act as separators, so a text field that itself contains tab
    characters is kept intact rather than causing the record to be discarded.
    Blank lines are ignored; lines with fewer than three fields or with
    non-numeric coordinates are skipped, and the number of skipped lines is
    printed so that data loss does not go unnoticed.

    Args:
        file_path: Path of the UTF-8 encoded input file.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays with one row per kept line: ``X``
        holds the sentence vectors produced by the module-level fastText model
        ``ft`` and ``y`` holds the matching ``(latitude, longitude)`` pairs.
    """
    sentences, labels = [], []
    skipped = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines carry no data; ignore them without counting.
                continue
            # maxsplit=2 keeps any further tabs inside the text field.
            parts = record.split('\t', 2)
            if len(parts) != 3:
                skipped += 1
                continue
            try:
                lat, lon = float(parts[0]), float(parts[1])
            except ValueError:
                # Coordinates are not numeric (e.g. a header row).
                skipped += 1
                continue
            sentences.append(parts[2])
            labels.append((lat, lon))
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {file_path}")
    X = np.array([ft.get_sentence_vector(tweet) for tweet in sentences])
    y = np.array(labels)
    return X, y

# Function to load test data (without labels)
def load_unlabeled_data(file_path):
    """Embed every non-blank line of a text-only file.

    Each line is stripped of surrounding whitespace and treated as one piece
    of text; lines that are empty after stripping are left out.

    Args:
        file_path: Path of the UTF-8 encoded input file.

    Returns:
        A NumPy array with one sentence vector (from the module-level fastText
        model ``ft``) per non-blank line, in file order.
    """
    # NOTE(review): because blank lines are dropped, the rows returned here may
    # not line up one-to-one with the lines of the input file -- confirm the
    # test file contains no empty lines if predictions must align with it.
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        texts = [text for text in stripped_lines if text]
    return np.array([ft.get_sentence_vector(text) for text in texts])

# --- Data: embed the labelled training and development splits ---
print("Loading training and development data...")
X_train, y_train = load_labeled_data(TRAIN_FILE)
X_dev, y_dev = load_labeled_data(DEV_FILE)

# --- Model: MultiOutputRegressor fits one RBF-kernel SVR per target column,
# so latitude and longitude are regressed independently of each other ---
print("Training SVM model...")
svm = MultiOutputRegressor(SVR(kernel='rbf')).fit(X_train, y_train)

# --- Evaluation on the development split ---
print("Evaluating on development set...")
y_dev_pred = svm.predict(X_dev)
# multioutput='raw_values' keeps a separate MSE for each target column
mse = mean_squared_error(y_dev, y_dev_pred, multioutput='raw_values')
print("Mean Squared Error for each label on dev set:", mse)
print("Average Mean Squared Error on dev set:", mse.mean())

# --- Inference on the unlabelled test split ---
print("Loading test data and predicting...")
X_test = load_unlabeled_data(TEST_FILE)
predictions = svm.predict(X_test)

# --- Persist predictions: one "lat<TAB>lon" line per test row ---
output_file = 'test_predictions.txt'
print(f"Saving predictions to {output_file}...")
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{lat}\t{lon}\n" for lat, lon in predictions)
print("Predictions saved successfully.")


Downloading and loading FastText model...
Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz

Loading training and development data...
Training SVM model...
Evaluating on development set...
Mean Squared Error for each label on dev set: [0.04801686 0.29895594]
Average Mean Squared Error on dev set: 0.1734864019354707
Loading test data and predicting...
Saving predictions to test_predictions.txt...
Predictions saved successfully.
