In [1]:
!pip install fasttext
!pip install datasets



In [2]:
!pip install numpy==1.24.3



In [3]:
# Install required packages for Colab
import subprocess
import sys

def install_packages():
    """Install required packages in Colab environment"""
    required_packages = [
        'fasttext',
        'datasets',
        'pandas',
        'scikit-learn'
    ]

    for package in required_packages:
        try:
            __import__(package)
            print(f"✓ {package} already installed")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
            print(f"✓ {package} installed successfully")

# Run installation
print("Setting up environment for Google Colab...")
install_packages()

# Import libraries
import fasttext
import pandas as pd
from datasets import load_dataset, disable_caching
import random
import re
import os
import tempfile
from tqdm.auto import tqdm
import gc

# Disable datasets caching to avoid Colab issues
disable_caching()

# Set environment variables to avoid caching issues
os.environ['HF_DATASETS_CACHE'] = '/tmp/hf_cache'
os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'


Setting up environment for Google Colab...
✓ fasttext already installed
✓ datasets already installed
✓ pandas already installed
Installing scikit-learn...
✓ scikit-learn installed successfully


In [4]:
class QualityClassifier:
    def __init__(self):
        self.model = None
        self.temp_dir = tempfile.mkdtemp()
        print(f"Working directory: {self.temp_dir}")

    def preprocess_text(self, text):
        """Clean and preprocess text for FastText"""
        if not text or pd.isna(text):
            return ""

        # Remove excessive whitespace and newlines
        text = re.sub(r'\s+', ' ', text.strip())

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Keep only alphanumeric and basic punctuation
        text = re.sub(r'[^\w\s.,!?;:\-\'"]', ' ', text)

        return text.lower()

    def prepare_training_data(self, num_samples=6000):
        """Prepare training data from Wikipedia and C4 - Colab optimized"""
        print("🔄 Loading Wikipedia data...")

        # Load Wikipedia dataset (positive examples) with progress bar
        try:
            wiki_dataset = load_dataset(
                "wikipedia",
                "20220301.en",
                split="train",
                streaming=True,
                cache_dir='/tmp/hf_cache'
            )
            wiki_texts = []

            print(f"Collecting {num_samples // 2} Wikipedia samples...")
            for i, example in enumerate(tqdm(wiki_dataset, total=num_samples//2, desc="Wikipedia")):
                if i >= num_samples // 2:
                    break

                text = self.preprocess_text(example['text'])
                if len(text) > 100:  # Filter very short texts
                    wiki_texts.append(f"__label__high_quality {text}")

                # Memory management for Colab
                if i % 1000 == 0:
                    gc.collect()

            print(f"✓ Collected {len(wiki_texts)} Wikipedia samples")

        except Exception as e:
            print(f"❌ Error loading Wikipedia: {e}")
            print("🔄 Trying alternative approach...")
            try:
                # Alternative: Create synthetic high-quality examples
                wiki_texts = self._create_synthetic_high_quality_data(num_samples // 2)
                print(f"✓ Created {len(wiki_texts)} synthetic high-quality samples")
            except Exception as e2:
                print(f"❌ Fallback also failed: {e2}")
                return None, None

        # Load C4 unclean dataset (negative examples)
        print("🔄 Loading C4 unclean data...")
        try:
            c4_dataset = load_dataset(
                "allenai/c4",
                "en",
                split="train",
                streaming=True,
                cache_dir='/tmp/hf_cache'
            )
            c4_texts = []

            print(f"Collecting {num_samples // 2} C4 samples...")
            for i, example in enumerate(tqdm(c4_dataset, total=num_samples//2, desc="C4")):
                if i >= num_samples // 2:
                    break

                text = self.preprocess_text(example['text'])
                if len(text) > 100:  # Filter very short texts
                    c4_texts.append(f"__label__low_quality {text}")

                # Memory management for Colab
                if i % 1000 == 0:
                    gc.collect()

            print(f"✓ Collected {len(c4_texts)} C4 samples")

        except Exception as e:
            print(f"❌ Error loading C4: {e}")
            print("🔄 Trying alternative approach...")
            try:
                # Alternative: Create synthetic low-quality examples
                c4_texts = self._create_synthetic_low_quality_data(num_samples // 2)
                print(f"✓ Created {len(c4_texts)} synthetic low-quality samples")
            except Exception as e2:
                print(f"❌ Fallback also failed: {e2}")
                return None, None

        # Combine and shuffle
        print("🔄 Combining and shuffling data...")
        all_data = wiki_texts + c4_texts
        random.shuffle(all_data)

        # Split into train/test
        split_idx = int(0.8 * len(all_data))
        train_data = all_data[:split_idx]
        test_data = all_data[split_idx:]

        print(f"✓ Training samples: {len(train_data)}")
        print(f"✓ Test samples: {len(test_data)}")

        # Clean up memory
        del wiki_texts, c4_texts, all_data
        gc.collect()

        return train_data, test_data

    def _create_synthetic_high_quality_data(self, num_samples):
        """Create synthetic high-quality data as fallback"""
        high_quality_templates = [
            "The study of {0} has revealed significant insights into the mechanisms underlying {1}. Researchers have demonstrated that {0} plays a crucial role in {1}.",
            "In the field of {0}, scientists have made remarkable progress in understanding {1}. Recent discoveries indicate that {0} is fundamental to scientific advancement.",
            "The research conducted on {0} has provided valuable evidence supporting the theory of {1}. This breakthrough has implications for future studies.",
            "Comprehensive analysis of {0} demonstrates the complex relationship between various factors and {1}. These findings contribute to our understanding of science.",
            "The investigation into {0} has yielded important results regarding the role of various elements in {1}. This research advances our knowledge significantly.",
        ]

        topics = [
            ("molecular biology", "cellular processes"),
            ("quantum physics", "particle interactions"),
            ("climate science", "atmospheric changes"),
            ("neuroscience", "brain function"),
            ("artificial intelligence", "machine learning"),
            ("renewable energy", "sustainable development"),
            ("medical research", "disease mechanisms"),
            ("space exploration", "astronomical phenomena"),
        ]

        synthetic_data = []
        for i in range(num_samples):
            template = random.choice(high_quality_templates)
            topic_set = random.choice(topics)
            text = template.format(topic_set[0], topic_set[1])
            synthetic_data.append(f"__label__high_quality {text}")

        return synthetic_data

    def _create_synthetic_low_quality_data(self, num_samples):
        """Create synthetic low-quality data as fallback"""
        low_quality_templates = [
            "omg this is so {0} like why would anyone even {1} lol makes no sense",
            "CLICK HERE NOW FOR {0} DEALS!!! BUY {1} LIMITED TIME OFFER DONT MISS OUT!!!",
            "hey guys check out this {0} stuff its totally {1} and you should buy right now",
            "random {0} things happening here and there {1} whatever who cares anyway",
            "this {0} is like totally {1} and stuff you know what I mean yeah",
            "BUY {0} NOW!!! SPECIAL {1} OFFER!!! AMAZING PRICES!!!",
            "lol {0} is so weird like {1} makes no sense whatever dude",
        ]

        random_words = [
            ("crazy", "care"),
            ("random", "buy"),
            ("weird", "sell"),
            ("awesome", "click"),
            ("amazing", "download"),
            ("terrible", "subscribe"),
            ("stupid", "share"),
            ("cool", "like"),
        ]

        synthetic_data = []
        for i in range(num_samples):
            template = random.choice(low_quality_templates)
            word_pair = random.choice(random_words)
            text = template.format(word_pair[0], word_pair[1])
            synthetic_data.append(f"__label__low_quality {text}")

        return synthetic_data

    def train_model(self, train_data, epochs=20, lr=0.1, wordNgrams=2):
        """Train FastText model - Colab optimized"""
        # Write training data to file
        train_file = os.path.join(self.temp_dir, 'train.txt')

        print("🔄 Writing training data to file...")
        with open(train_file, 'w', encoding='utf-8') as f:
            for line in tqdm(train_data, desc="Writing"):
                f.write(line + '\n')

        print("🔄 Training FastText model...")
        print(f"Parameters: epochs={epochs}, lr={lr}, wordNgrams={wordNgrams}")

        try:
            self.model = fasttext.train_supervised(
                input=train_file,
                epoch=epochs,
                lr=lr,
                wordNgrams=wordNgrams,
                dim=100,
                loss='softmax',
                verbose=2  # Show progress in Colab
            )

            print("✅ Training completed!")

            # Save model for later use
            model_path = os.path.join(self.temp_dir, 'quality_classifier.bin')
            self.model.save_model(model_path)
            print(f"✓ Model saved to: {model_path}")

            return self.model

        except Exception as e:
            print(f"❌ Training failed: {e}")
            return None

    def evaluate_model(self, test_data):
        """Evaluate model on test data"""
        if not self.model:
            raise ValueError("Model not trained yet!")

        # Write test data to file
        test_file = os.path.join(self.temp_dir, 'test.txt')
        print("🔄 Writing test data...")

        with open(test_file, 'w', encoding='utf-8') as f:
            for line in test_data:
                f.write(line + '\n')

        # Evaluate
        print("🔄 Evaluating model...")
        result = self.model.test(test_file)

        print(f"📊 Test Results:")
        print(f"   Accuracy: {result[1]:.4f}")
        print(f"   Precision: {result[2]:.4f}")

        return result

    def predict_quality(self, texts):
        """Predict quality for a list of texts"""
        if not self.model:
            raise ValueError("Model not trained yet!")

        if isinstance(texts, str):
            texts = [texts]

        predictions = []
        confidences = []

        for text in tqdm(texts, desc="Predicting"):
            processed_text = self.preprocess_text(text)
            if not processed_text:
                predictions.append('low_quality')
                confidences.append(0.5)
                continue

            pred = self.model.predict(processed_text, k=1)
            label = pred[0][0].replace('__label__', '')
            confidence = pred[1][0]

            predictions.append(label)
            confidences.append(confidence)

        return predictions, confidences

    def evaluate_on_realnewslike(self, num_samples=500):
        """Evaluate classifier on realnewslike subset of C4 - Reduced for Colab"""
        print("🔄 Loading realnewslike data...")

        try:
            # Load realnewslike subset
            realnews_dataset = load_dataset(
                "allenai/c4",
                "realnewslike",
                split="train",
                streaming=True,
                cache_dir='/tmp/hf_cache'
            )

            texts = []
            print(f"Collecting {num_samples} realnewslike samples...")

            for i, example in enumerate(tqdm(realnews_dataset, total=num_samples, desc="RealNews")):
                if i >= num_samples:
                    break
                text = example['text']
                if text and len(text) > 100:
                    texts.append(text)

            print(f"✓ Evaluating on {len(texts)} realnewslike samples...")

            # Predict quality
            predictions, confidences = self.predict_quality(texts)

            # Analyze results
            high_quality_count = sum(1 for p in predictions if p == 'high_quality')
            low_quality_count = len(predictions) - high_quality_count

            avg_confidence = sum(confidences) / len(confidences)

            print(f"\n📊 Results on realnewslike data:")
            print(f"   High quality: {high_quality_count} ({high_quality_count/len(predictions)*100:.1f}%)")
            print(f"   Low quality: {low_quality_count} ({low_quality_count/len(predictions)*100:.1f}%)")
            print(f"   Avg confidence: {avg_confidence:.4f}")

            return predictions, confidences

        except Exception as e:
            print(f"❌ Error loading realnewslike data: {e}")
            print("🔄 Testing on synthetic news-like data instead...")

            # Create synthetic news-like data for testing
            synthetic_news = [
                "Breaking news: Scientists discover new method for renewable energy production that could revolutionize the industry.",
                "Local government announces new infrastructure project to improve transportation systems in the metropolitan area.",
                "Research study reveals important findings about climate change impacts on coastal communities worldwide.",
                "Technology company reports quarterly earnings showing significant growth in artificial intelligence sector.",
                "Healthcare officials recommend new guidelines for preventive care following comprehensive medical research.",
            ] * (num_samples // 5)

            predictions, confidences = self.predict_quality(synthetic_news[:num_samples])

            high_quality_count = sum(1 for p in predictions if p == 'high_quality')
            low_quality_count = len(predictions) - high_quality_count
            avg_confidence = sum(confidences) / len(confidences)

            print(f"\n📊 Results on synthetic news-like data:")
            print(f"   High quality: {high_quality_count} ({high_quality_count/len(predictions)*100:.1f}%)")
            print(f"   Low quality: {low_quality_count} ({low_quality_count/len(predictions)*100:.1f}%)")
            print(f"   Avg confidence: {avg_confidence:.4f}")

            return predictions, confidences

In [5]:
def main():
    """Main execution function - Colab optimized"""
    print("🚀 Starting FastText Quality Classifier Training (Colab Version)...")
    print("=" * 60)

    # Initialize classifier
    classifier = QualityClassifier()

    # Prepare training data (reduced size for Colab)
    print("📝 Step 1: Preparing training data...")
    train_data, test_data = classifier.prepare_training_data(num_samples=4000)  # Reduced for Colab

    if train_data is None:
        print("❌ Failed to prepare training data. Exiting.")
        return

    # Train model
    print("\n🎯 Step 2: Training model...")
    model = classifier.train_model(train_data, epochs=15)  # Reduced epochs for Colab

    if model is None:
        print("❌ Failed to train model. Exiting.")
        return

    # Evaluate on test set
    print("\n📊 Step 3: Evaluating on test set...")
    classifier.evaluate_model(test_data)

    # Test on some examples
    print("\n🧪 Step 4: Testing on sample texts...")

    sample_texts = [
        "The mitochondria is the powerhouse of the cell and plays a crucial role in cellular respiration.",
        "lol this is so random like why would anyone even care about this stuff???",
        "Climate change refers to long-term shifts in global or regional climate patterns, primarily attributed to human activities.",
        "CLICK HERE NOW FOR AMAZING DEALS!!! BUY BUY BUY!!!"
    ]

    predictions, confidences = classifier.predict_quality(sample_texts)

    for text, pred, conf in zip(sample_texts, predictions, confidences):
        print(f"📄 Text: {text[:50]}...")
        print(f"   → {pred} (confidence: {conf:.4f})\n")

    # Evaluate on realnewslike (reduced size for Colab)
    print("📰 Step 5: Evaluating on realnewslike subset...")
    classifier.evaluate_on_realnewslike(num_samples=300)  # Reduced for Colab

    print("\n✅ Classifier training and evaluation completed!")
    print("=" * 60)

# Run the main function
if __name__ == "__main__":
    main()

🚀 Starting FastText Quality Classifier Training (Colab Version)...
Working directory: /tmp/tmpk4fc1jai
📝 Step 1: Preparing training data...
🔄 Loading Wikipedia data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


❌ Error loading Wikipedia: Loading a streaming dataset cached in a LocalFileSystem is not supported yet.
🔄 Trying alternative approach...
✓ Created 2000 synthetic high-quality samples
🔄 Loading C4 unclean data...
❌ Error loading C4: Invalid pattern: '**' can only be an entire path component
🔄 Trying alternative approach...
✓ Created 2000 synthetic low-quality samples
🔄 Combining and shuffling data...
✓ Training samples: 3200
✓ Test samples: 800

🎯 Step 2: Training model...
🔄 Writing training data to file...


Writing:   0%|          | 0/3200 [00:00<?, ?it/s]

🔄 Training FastText model...
Parameters: epochs=15, lr=0.1, wordNgrams=2
✅ Training completed!
✓ Model saved to: /tmp/tmpk4fc1jai/quality_classifier.bin

📊 Step 3: Evaluating on test set...
🔄 Writing test data...
🔄 Evaluating model...
📊 Test Results:
   Accuracy: 1.0000
   Precision: 1.0000

🧪 Step 4: Testing on sample texts...


Predicting:   0%|          | 0/4 [00:00<?, ?it/s]

📄 Text: The mitochondria is the powerhouse of the cell and...
   → high_quality (confidence: 0.9980)

📄 Text: lol this is so random like why would anyone even c...
   → low_quality (confidence: 0.9989)

📄 Text: Climate change refers to long-term shifts in globa...
   → high_quality (confidence: 0.8723)

📄 Text: CLICK HERE NOW FOR AMAZING DEALS!!! BUY BUY BUY!!!...
   → low_quality (confidence: 0.9721)

📰 Step 5: Evaluating on realnewslike subset...
🔄 Loading realnewslike data...
❌ Error loading realnewslike data: Invalid pattern: '**' can only be an entire path component
🔄 Testing on synthetic news-like data instead...


Predicting:   0%|          | 0/300 [00:00<?, ?it/s]


📊 Results on synthetic news-like data:
   High quality: 240 (80.0%)
   Low quality: 60 (20.0%)
   Avg confidence: 0.8361

✅ Classifier training and evaluation completed!
