<a href="https://colab.research.google.com/github/asiabak/Licencjat-modele/blob/main/SVM_with_fasttext_by_Claude.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# First, verify GPU is available in Colab
import tensorflow as tf
print("GPU Available:", tf.config.list_physical_devices('GPU'))

# Install necessary libraries if not already available
# !pip install gensim scikit-learn nltk pandas

import urllib.request
import pandas as pd
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from gensim.models import FastText
import tensorflow as tf
import time
import os

# Use GPU for tensor operations
with tf.device('/GPU:0'):
    # Download NLTK data if needed
    nltk.download('punkt_tab')

    # Set up paths for KGR10 FastText model
    model_url = "https://huggingface.co/clarin-pl/fasttext-kgr10/resolve/main/kgr10.plain.skipgram.dim100.neg10.bin"
    model_path = "kgr10.plain.skipgram.dim100.neg10.bin"

    # Download model if it doesn't exist
    if not os.path.exists(model_path):
        print(f"Downloading KGR10 FastText model from {model_url}...")
        urllib.request.urlretrieve(model_url, model_path)
        print("Download complete!")
    else:
        print(f"Using existing model at {model_path}")

    # Load the FastText model
    print("Loading KGR10 FastText model...")
    model = FastText.load_fasttext_format(model_path)
    print(f"Model loaded! Vector size: {model.vector_size}")

    # Load datasets
    print("Loading datasets...")
    reviews = pd.read_csv("filmweb_jednolity_sentyment.csv")
    reviews_with_idioms = pd.read_csv('filmweb_i_idiomy.csv')
    print(f"Loaded {len(reviews)} reviews and {len(reviews_with_idioms)} reviews with idioms")

    # Text preprocessing
    def preprocess_text(text):
        # Handle NaN values
        if isinstance(text, float) and np.isnan(text):
            return ""

        text = str(text).lower()
        text = re.sub(r'[^\w\s]', '', text)
        words = text.split()
        return ' '.join(words)

    # Apply text preprocessing
    print("Preprocessing text...")
    reviews['review_processed'] = reviews['review'].apply(preprocess_text)
    reviews_with_idioms['review_processed'] = reviews_with_idioms['review'].apply(preprocess_text)

    # GPU-accelerated vectorization with TensorFlow
    def vectorize_review_batch(texts, model):
        # Function to get embeddings for one text
        def get_embeddings(text):
            if not text:  # Handle empty strings
                return np.zeros(model.vector_size)

            words = word_tokenize(text.lower())
            word_vectors = []
            for word in words:
                try:
                    # Get vector for the word
                    word_vectors.append(model.wv[word])
                except KeyError:
                    # Skip words not in vocabulary
                    continue

            if word_vectors:
                # Convert to tensor and compute mean
                vectors_tensor = tf.convert_to_tensor(word_vectors, dtype=tf.float32)
                return tf.reduce_mean(vectors_tensor, axis=0).numpy()
            else:
                return np.zeros(model.vector_size)

        # Process each text in the batch
        return [get_embeddings(text) for text in texts]

    # Process reviews in batches to utilize GPU efficiently
    def process_in_batches(df, batch_size=128):
        vectors = []
        start_time = time.time()

        for i in range(0, len(df), batch_size):
            batch_texts = df['review_processed'].iloc[i:i+batch_size].values
            batch_vectors = vectorize_review_batch(batch_texts, model)
            vectors.extend(batch_vectors)

            # Print progress
            if (i+batch_size) % 1000 == 0 or i+batch_size >= len(df):
                elapsed = time.time() - start_time
                print(f"Processed {i+len(batch_texts)}/{len(df)} reviews in {elapsed:.2f} seconds")

        return vectors

    # Create vectors with GPU acceleration
    print("Vectorizing reviews using GPU...")
    reviews_vectors = process_in_batches(reviews)
    reviews_with_idioms_vectors = process_in_batches(reviews_with_idioms)

    # Store vectors in the dataframes
    reviews['vector'] = reviews_vectors
    reviews_with_idioms['vector'] = reviews_with_idioms_vectors

# Prepare data for training (back on CPU for scikit-learn)
print("Preparing data for SVM training...")
X = np.vstack(reviews['vector'].values)
y = reviews['sentiment'].values
X2 = np.vstack(reviews_with_idioms['vector'].values)
y2 = reviews_with_idioms['sentiment'].values

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=12)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.4, random_state=12)

# Train model on reviews only
print("Training SVM on reviews dataset...")
start_time = time.time()
model_svc = SVC(kernel='linear')
model_svc.fit(X_train, y_train)
print(f"SVM training completed in {time.time() - start_time:.2f} seconds")

# Predict and evaluate
y_pred = model_svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy for model trained on reviews only: {accuracy}")

# Train model on reviews + idioms
print("Training SVM on reviews + idioms dataset...")
start_time = time.time()
model_svc_idioms = SVC(kernel='linear')
model_svc_idioms.fit(X2_train, y2_train)
print(f"SVM training completed in {time.time() - start_time:.2f} seconds")

# Predict and evaluate
y2_pred = model_svc_idioms.predict(X2_test)
accuracy2 = accuracy_score(y2_test, y2_pred)
print(f"Accuracy for model trained on reviews + idioms: {accuracy2}")

# Compare model performance
print("\nModel Performance Comparison:")
print(f"Reviews Only:        {accuracy:.4f}")
print(f"Reviews with Idioms: {accuracy2:.4f}")
print(f"Improvement:         {(accuracy2-accuracy)*100:.2f}%")

# Save models if needed
import joblib
joblib.dump(model_svc, 'svm_model_reviews_kgr10.pkl')
joblib.dump(model_svc_idioms, 'svm_model_reviews_idioms_kgr10.pkl')

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Downloading KGR10 FastText model from https://huggingface.co/clarin-pl/fasttext-kgr10/resolve/main/kgr10.plain.skipgram.dim100.neg10.bin...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Download complete!
Loading KGR10 FastText model...


  model = FastText.load_fasttext_format(model_path)


Model loaded! Vector size: 100
Loading datasets...
Loaded 3903 reviews and 4403 reviews with idioms
Preprocessing text...
Vectorizing reviews using GPU...
Processed 3903/3903 reviews in 33.63 seconds
Processed 4403/4403 reviews in 34.38 seconds
Preparing data for SVM training...
Training SVM on reviews dataset...
SVM training completed in 0.41 seconds
Accuracy for model trained on reviews only: 0.5627400768245838
Training SVM on reviews + idioms dataset...
SVM training completed in 0.43 seconds
Accuracy for model trained on reviews + idioms: 0.5192962542565267

Model Performance Comparison:
Reviews Only:        0.5627
Reviews with Idioms: 0.5193
Improvement:         -4.34%


['svm_model_reviews_idioms_kgr10.pkl']