<a href="https://colab.research.google.com/github/asiabak/repozytorium1/blob/main/SVM_on_clean_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [8]:
import re
import pandas as pd

def clean_text(text):
    """Normalize a raw tweet into lowercase, single-space-separated tokens.

    The substitutions run in a fixed order: URLs are dropped, @mentions are
    dropped, hashtags lose their leading '#', and every remaining character
    that is neither a word character nor whitespace becomes a space.
    Whitespace is then collapsed and the result is lowercased.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    # Order matters: hashtags must be unwrapped before the generic
    # punctuation pass, otherwise '#' would be replaced by a space.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),  # URLs
        (r'@\w+', ''),                     # @mentions
        (r'#(\w+)', r'\1'),                # hashtags -> bare word
        (r'[^\w\s]', ' '),                 # leftover punctuation / symbols
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Lowercasing happens last so the case-sensitive URL pattern above sees
    # the text exactly as written.
    tokens = text.split()
    return ' '.join(tokens).lower()

def process_file(input_path, output_path):
    """Clean the tweet text of a tab-separated file and write the result.

    Two line layouts are supported and detected per line:

    * labeled   -- ``latitude<TAB>longitude<TAB>text``: the two coordinate
      fields are copied through unchanged and only the text is cleaned;
    * text-only -- anything else: the whole line is treated as the tweet.

    Forcing three column names onto a text-only file would put the tweet in
    the latitude column and leave the text column empty, so nothing would be
    cleaned; the per-line detection avoids that.

    Every input line yields exactly one output line (a tweet that cleans to
    nothing becomes an empty text field), so line i of the output always
    corresponds to line i of the input.

    Args:
        input_path: Path of the UTF-8 file to read.
        output_path: Path of the UTF-8 file to write. May equal
            ``input_path``; the input is read completely before writing.

    Raises:
        OSError: If either file cannot be opened.
    """

    def _is_number(value):
        # True when the field parses as a float (i.e. looks like a coordinate).
        try:
            float(value)
        except ValueError:
            return False
        return True

    cleaned_lines = []
    with open(input_path, 'r', encoding='utf-8') as src:
        for raw_line in src:
            record = raw_line.rstrip('\r\n')
            # maxsplit=2 keeps any stray tab inside the tweet in the text field.
            fields = record.split('\t', 2)
            if len(fields) == 3 and _is_number(fields[0]) and _is_number(fields[1]):
                lat, lon, text = fields
                cleaned_lines.append(f"{lat}\t{lon}\t{clean_text(text)}")
            else:
                cleaned_lines.append(clean_text(record))

    # Write only after the whole input was read, so a read failure cannot
    # leave a truncated output and in-place cleaning stays possible.
    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.writelines(f"{line}\n" for line in cleaned_lines)

# Example usage
input_file = "test_blind.txt"
output_file = "test_blind_clean.txt"

try:
    process_file(input_file, output_file)
except Exception as e:
    # Report the failure, then re-raise: swallowing it would let later cells
    # run against a missing or stale cleaned file without any traceback.
    print(f"An error occurred: {e}")
    raise
else:
    print(f"Successfully cleaned text and saved to {output_file}")

                                            latitude  longitude text
0  Und epis gfangä? Wo bisch gsi? Schächä? Nid be...        NaN  nan
1  Ich werde niemals Menschen verstehen, die eine...        NaN  nan
2  Immer wieder luschtig Die post woni als modera...        NaN  nan
3  Du denksch, din Job isch sinnlos? Es git lüüt ...        NaN  nan
4  kennt öber e guete Tättovierer? I bi no am Üeb...        NaN  nan
Successfully cleaned text and saved to test_blind_clean.txt


In [10]:
pip install fasttext

Collecting fasttext
  Using cached fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313475 sha256=ba23b778aab5c4e8d486b5db957567e0380a11e7f39f3d5d5eaa782007e3f1d7
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a513fa6b79451473ceb7713017823c3
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.3 pybind11-2.13.6


In [1]:
import fasttext
import fasttext.util
import numpy as np
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# File paths
TRAIN_FILE = 'train_clean.txt'
DEV_FILE = 'dev_clean.txt'
TEST_FILE = 'test_blind_clean.txt'

# Pre-trained German fastText vectors (cc.de.300.bin).
# With if_exists='ignore', download_model() is expected to fetch the file only
# when it is not already in the working directory and to return its name, so a
# fresh kernel can run this cell without a manual download step.
FASTTEXT_MODEL_FILE = fasttext.util.download_model('de', if_exists='ignore')
ft = fasttext.load_model(FASTTEXT_MODEL_FILE)

# Function to load training and development data (with labels)
def load_labeled_data(file_path):
    """Read a labeled file and embed its tweets.

    Each usable line has exactly three tab-separated fields after stripping
    surrounding whitespace: latitude, longitude, text. Lines with a different
    field count, or whose coordinates do not parse as floats, are skipped.

    Args:
        file_path: Path of the UTF-8 encoded, tab-separated file.

    Returns:
        A pair ``(X, y)``: ``X`` holds one sentence vector per kept line
        (from the module-level ``ft`` model) and ``y`` the matching
        ``(latitude, longitude)`` pairs, in file order.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            if len(fields) != 3:
                continue
            lat_field, lon_field, text = fields
            try:
                coords = (float(lat_field), float(lon_field))
            except ValueError:
                # Non-numeric coordinates: drop the whole line.
                continue
            records.append((text, coords))

    # Embedding happens after the file is closed, one vector per kept line.
    X = np.array([ft.get_sentence_vector(text) for text, _ in records])
    y = np.array([coords for _, coords in records])
    return X, y

# Function to load test data (without labels)
def load_unlabeled_data(file_path):
    """Read a text-only file and embed every non-empty line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are left out, so the result can have fewer rows than the
    file has lines.

    Args:
        file_path: Path of the UTF-8 encoded file, one tweet per line.

    Returns:
        An array with one sentence vector (from the module-level ``ft``
        model) per kept line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        texts = [text for text in stripped_lines if text]
    return np.array([ft.get_sentence_vector(text) for text in texts])

# --- Data: embed the labeled training and development splits ---
print("Loading training and development data...")
X_train, y_train = load_labeled_data(TRAIN_FILE)
X_dev, y_dev = load_labeled_data(DEV_FILE)

# --- Model: one polynomial-kernel SVR per target column ---
# SVR predicts a single value, so MultiOutputRegressor fits a separate copy
# for each column of y_train.
print("Training SVM model...")
svm = MultiOutputRegressor(SVR(kernel='poly')).fit(X_train, y_train)

# --- Evaluation: per-target and averaged MSE on the development split ---
print("Evaluating on development set...")
y_dev_pred = svm.predict(X_dev)
mse = mean_squared_error(y_true=y_dev, y_pred=y_dev_pred, multioutput='raw_values')
print("Mean Squared Error for each label on dev set:", mse)
print("Average Mean Squared Error on dev set:", mse.mean())

# --- Inference: predict both targets for the unlabeled test tweets ---
print("Loading test data and predicting...")
X_test = load_unlabeled_data(TEST_FILE)
predictions = svm.predict(X_test)

# --- Output: one tab-separated pair of predicted values per line ---
output_file = 'test_predictions.txt'
print(f"Saving predictions to {output_file}...")
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{first}\t{second}\n" for first, second in predictions)
print("Predictions saved successfully.")

Loading training and development data...
Training SVM model...
Evaluating on development set...
Mean Squared Error for each label on dev set: [0.04692826 0.30103779]
Average Mean Squared Error on dev set: 0.1739830209017215
Loading test data and predicting...
Saving predictions to test_predictions.txt...
Predictions saved successfully.
