In [1]:
!pip install -q "spacy" "numpy<2.0" "scikit-learn" "pandas" "tqdm"
!python -m spacy download en_core_web_sm

print("Libraries installed successfully.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Libraries installed successfully.


In [3]:
import pandas as pd
import numpy as np
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from tqdm.auto import tqdm
import os
import shutil
import time
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')



print("Loading and preparing datasets...")

def load_and_prepare_data(sample_fraction=0.15):
    """Load and prepare data"""
    try:
        # Dataset paths
        path_daigt = '/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv'
        path_llm = '/kaggle/input/llm-detect-ai-generated-text-dataset/Training_Essay_Data.csv' 
        path_ai_human = '/kaggle/input/ai-vs-human-text/AI_Human.csv'
        
        print(f"   Loading {sample_fraction*100}% of data for faster training...")
        
        print("   Loading DAIGT dataset...")
        df_daigt = pd.read_csv(path_daigt, nrows=int(119757 * sample_fraction))
        df_daigt = df_daigt[['text', 'label']].dropna()
        print(f"     DAIGT samples: {len(df_daigt):,}")
        
        print("   Loading LLM dataset...")
        df_llm = pd.read_csv(path_llm, nrows=int(44868 * sample_fraction))
        df_llm = df_llm.rename(columns={'generated': 'label'})[['text', 'label']].dropna()
        print(f"     LLM samples: {len(df_llm):,}")
        
        print("   Loading AI-Human dataset...")
        df_ai_human = pd.read_csv(path_ai_human, nrows=int(200000 * sample_fraction))
        df_ai_human = df_ai_human.rename(columns={'generated': 'label'})
        df_ai_human = df_ai_human[['text', 'label']].dropna()
        df_ai_human['label'] = df_ai_human['label'].astype(int)
        print(f"     AI-Human samples: {len(df_ai_human):,}")
        
        print("   Combining datasets...")
        combined_df = pd.concat([df_daigt, df_llm, df_ai_human], ignore_index=True)
        print(f"     Combined total: {len(combined_df):,}")
        
        # Remove duplicates
        combined_df = combined_df.drop_duplicates(subset='text').reset_index(drop=True)
        print(f"     After deduplication: {len(combined_df):,}")
        
        # Balance dataset for better training
        print("   • Balancing dataset...")
        df_human = combined_df[combined_df.label == 0]  # Human text
        df_ai = combined_df[combined_df.label == 1]     # AI text
        
        print(f"     Before balancing - Human: {len(df_human):,}, AI: {len(df_ai):,}")
        
      
        min_count = min(len(df_human), len(df_ai))
        max_per_class = min_count 
        
        df_human_balanced = df_human.sample(n=max_per_class, random_state=42)
        df_ai_balanced = df_ai.sample(n=max_per_class, random_state=42)
        
        # Combine balanced data
        final_df = pd.concat([df_human_balanced, df_ai_balanced]).reset_index(drop=True)
        final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle
        
        print(f"Final dataset prepared: {len(final_df):,} samples")
        print(f"   Human text: {(final_df.label==0).sum():,}")
        print(f"   AI text: {(final_df.label==1).sum():,}")
        print(f"   Balance ratio: {(final_df.label==1).sum()/(final_df.label==0).sum():.3f}")
        
        return final_df
        
    except FileNotFoundError as e:
        print(f"❌ Dataset file not found: {e}")
        print("   Please ensure all datasets are added to your Kaggle notebook.")
        return None
    except Exception as e:
        print(f"❌ Data loading failed: {e}")
        return None

# Execute data loading
df = load_and_prepare_data(sample_fraction = 0.8)

if df is None:
    raise Exception("❌ Data loading failed! Please check dataset paths.")
    
print(f"\nData loading completed successfully!")
print(f"Ready for training with {len(df):,} balanced samples.")

Loading and preparing datasets...
   Loading 80.0% of data for faster training...
   Loading DAIGT dataset...
     DAIGT samples: 44,868
   Loading LLM dataset...
     LLM samples: 29,145
   Loading AI-Human dataset...
     AI-Human samples: 160,000
   Combining datasets...
     Combined total: 234,013
     After deduplication: 166,678
   • Balancing dataset...
     Before balancing - Human: 91,889, AI: 74,789
Final dataset prepared: 149,578 samples
   Human text: 74,789
   AI text: 74,789
   Balance ratio: 1.000

Data loading completed successfully!
Ready for training with 149,578 balanced samples.


In [4]:
def train_spacy_classifier_FIXED(df, max_samples_per_class=6000):
    """
    SpaCy text classifier
    This version addresses the model collapse issue from your original code
    """
    
    print("SpaCy Training Started")
    print("=" * 50)
    
    # Prepare balanced dataset
    df_human = df[df.label == 0].sample(
        n=min(max_samples_per_class, len(df[df.label == 0])), 
        random_state=42
    )
    df_ai = df[df.label == 1].sample(
        n=min(max_samples_per_class, len(df[df.label == 1])), 
        random_state=42
    )
    
    balanced_df = pd.concat([df_human, df_ai]).sample(frac=1, random_state=42).reset_index(drop=True)
    
    print(f"Training Configuration:")
    print(f"   Total samples: {len(balanced_df):,}")
    print(f"   Human samples: {len(df_human):,}")
    print(f"   AI samples: {len(df_ai):,}")
    print(f"   Perfect balance: {len(df_human) == len(df_ai)}")
    
    # Split data (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(
        balanced_df['text'], balanced_df['label'], 
        test_size=0.2, random_state=42, stratify=balanced_df['label']
    )
    
    print(f"   Training samples: {len(X_train):,}")
    print(f"   Testing samples: {len(X_test):,}")
    
    # Verify train/test balance
    train_balance = pd.Series(y_train).value_counts().sort_index()
    test_balance = pd.Series(y_test).value_counts().sort_index()
    print(f"   Train balance - Human: {train_balance[0]}, AI: {train_balance[1]}")
    print(f"   Test balance - Human: {test_balance[0]}, AI: {test_balance[1]}")
    
    # Create SpaCy model (blank English model)
    print("\n Creating SpaCy model...")
    nlp = spacy.blank("en")
    
    # Add text classifier with FIXED configuration
    textcat = nlp.add_pipe("textcat", config={
        "model": {
            "@architectures": "spacy.TextCatBOW.v3",  # Use v3 (more stable)
            "exclusive_classes": True,  # Either AI or Human, not both
            "ngram_size": 2,  # Use bigrams for better feature extraction
            "no_output_layer": False
        }
    })
    
    # Add labels (CRITICAL: Use string labels, not numeric)
    textcat.add_label("HUMAN")  # Label for human text
    textcat.add_label("AI")     # Label for AI text
    
    print("   • Architecture: TextCatBOW.v3 with exclusive classification")
    print("   • Features: Bigrams (ngram_size=2)")
    print("   • Labels: HUMAN, AI")
    
    # Prepare training examples (FIXED format)
    print("\nPreparing training examples...")
    train_examples = []
    
    for i, (text, label) in enumerate(zip(X_train, y_train)):
        # Create document
        doc = nlp.make_doc(str(text))
        
        # Create categories (CRITICAL: Proper format for exclusive classification)
        cats = {
            "HUMAN": 1.0 if label == 0 else 0.0, 
            "AI": 1.0 if label == 1 else 0.0
        }
        
        # Create training example
        example = Example.from_dict(doc, {"cats": cats})
        train_examples.append(example)
        
        if (i + 1) % 1000 == 0:
            print(f"   • Prepared {i+1:,} examples...")
    
    print(f"{len(train_examples):,} training examples prepared")
    
    # Initialize model (CRITICAL: Must do this before training)
    print("\nInitializing model...")
    nlp.initialize()
    print("   Model initialized successfully")
    
    # FIXED Training loop
    print("\nStarting training loop...")
    
    # Training parameters (optimized for accuracy and speed)
    n_epochs = 10          # Sufficient epochs for convergence
    batch_size = 32        # Smaller batches for better gradient updates
    dropout_rate = 0.3     # Prevent overfitting
    learning_rate = 0.001  # Conservative learning rate
    
    print(f"   • Epochs: {n_epochs}")
    print(f"   • Batch size: {batch_size}")
    print(f"   • Dropout rate: {dropout_rate}")
    print(f"   • Learning rate: {learning_rate}")
    
    # Training loop with progress tracking
    for epoch in range(n_epochs):
        epoch_start = time.time()
        losses = {}
        
        # Shuffle training examples each epoch
        train_examples_shuffled = train_examples.copy()
        np.random.shuffle(train_examples_shuffled)
        
        # Process in batches
        n_batches = len(train_examples_shuffled) // batch_size
        
        for i in range(0, len(train_examples_shuffled), batch_size):
            batch = train_examples_shuffled[i:i+batch_size]
            nlp.update(batch, drop=dropout_rate, losses=losses)
        
        epoch_time = time.time() - epoch_start
        
        # Evaluate every 2 epochs
        if (epoch + 1) % 2 == 0:
            # Quick accuracy check on sample
            correct = 0
            total = 0
            sample_size = min(200, len(X_test))
            sample_indices = np.random.choice(len(X_test), sample_size, replace=False)
            
            for idx in sample_indices:
                doc = nlp(str(X_test.iloc[idx]))
                
                # Get prediction (FIXED: Check which class has higher score)
                ai_score = doc.cats["AI"]
                human_score = doc.cats["HUMAN"]
                predicted = 1 if ai_score > human_score else 0
                actual = y_test.iloc[idx]
                
                if predicted == actual:
                    correct += 1
                total += 1
            
            accuracy = correct / total
            loss_value = losses.get('textcat', 0)
            
            print(f"   📊 Epoch {epoch+1:2d}: Loss={loss_value:.4f}, "
                  f"Accuracy={accuracy:.4f} ({accuracy*100:.1f}%), Time={epoch_time:.1f}s")
        else:
            loss_value = losses.get('textcat', 0)
            print(f"   📊 Epoch {epoch+1:2d}: Loss={loss_value:.4f}, Time={epoch_time:.1f}s")
        
        # Memory cleanup
        gc.collect()
    
    print("Training completed successfully!")
    
    # Full evaluation on test set
    print(f"\nFinal Model Evaluation...")
    print("   • Running full test set evaluation...")
    
    predictions = []
    confidences_ai = []
    confidences_human = []
    
    for text in tqdm(X_test, desc="Evaluating model", leave=False):
        doc = nlp(str(text))
        
        ai_confidence = doc.cats["AI"]
        human_confidence = doc.cats["HUMAN"]
        
        # Prediction based on higher confidence
        predicted = 1 if ai_confidence > human_confidence else 0
        
        predictions.append(predicted)
        confidences_ai.append(ai_confidence)
        confidences_human.append(human_confidence)
    
    # Calculate final metrics
    final_accuracy = accuracy_score(y_test, predictions)
    
    print(f"\nFINAL EVALUATION RESULTS:")
    print(f"=" * 40)
    print(f"Overall Accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
    print(f"Test Samples: {len(X_test):,}")
    
    # Detailed classification report
    print(f"\nDetailed Classification Report:")
    print(classification_report(y_test, predictions, 
                              target_names=['Human', 'AI'], 
                              digits=4))
    
    # Confusion matrix (FIXED: This should now show predictions for both classes)
    cm = confusion_matrix(y_test, predictions)
    print(f"\nConfusion Matrix:")
    print(f"                 Predicted")
    print(f"Actual       Human      AI")
    print(f"Human     {cm[0,0]:8d}  {cm[0,1]:8d}")
    print(f"AI        {cm[1,0]:8d}  {cm[1,1]:8d}")
    
    # Confidence statistics
    print(f"\n📈 Confidence Statistics:")
    print(f"   • Mean AI confidence: {np.mean(confidences_ai):.4f}")
    print(f"   • Mean Human confidence: {np.mean(confidences_human):.4f}")
    print(f"   • Std AI confidence: {np.std(confidences_ai):.4f}")
    print(f"   • Std Human confidence: {np.std(confidences_human):.4f}")
    
    return nlp, final_accuracy, cm, X_test, y_test

print("Training function defined and ready!")

Training function defined and ready!


In [5]:
# Training configuration
MAX_SAMPLES_PER_CLASS = 74789 # from the previous cell
print(f"Training Configuration:")
print(f"   Max samples per class: {MAX_SAMPLES_PER_CLASS:,}")
print(f"   Total training samples: ~{MAX_SAMPLES_PER_CLASS * 2:,}")
print(f"   Estimated training time: 20-30 minutes")

# Execute training
training_start_time = time.time()

try:
    print(f"\nTraining started at: {time.strftime('%H:%M:%S')}")
    
    model, final_accuracy, conf_matrix, X_test, y_test = train_spacy_classifier_FIXED(
        df, 
        max_samples_per_class=MAX_SAMPLES_PER_CLASS
    )
    
    total_training_time = time.time() - training_start_time
    
    print(f"\nSUCCESS! Training completed!")
    print(f"=" * 40)
    print(f"Total training time: {total_training_time/60:.1f} minutes")
    print(f"Final accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
    print(f"Model is ready for use!")
    
    # Save the model
    model_save_path = "./spacy_ai_detector_WORKING"
    model.to_disk(model_save_path)
    print(f"Model saved to: {model_save_path}")
    
    # Mark training as successful
    TRAINING_SUCCESSFUL = True
    TRAINED_MODEL = model
    
except Exception as e:
    print(f"\nTraining failed with error: {e}")
    print(f"Error details:")
    import traceback
    traceback.print_exc()
    TRAINING_SUCCESSFUL = False
    TRAINED_MODEL = None

print(f"\n🏁 Training process completed at: {time.strftime('%H:%M:%S')}")

Training Configuration:
   Max samples per class: 74,789
   Total training samples: ~149,578
   Estimated training time: 20-30 minutes

Training started at: 16:32:37
SpaCy Training Started
Training Configuration:
   Total samples: 149,578
   Human samples: 74,789
   AI samples: 74,789
   Perfect balance: True
   Training samples: 119,662
   Testing samples: 29,916
   Train balance - Human: 59831, AI: 59831
   Test balance - Human: 14958, AI: 14958

 Creating SpaCy model...
   • Architecture: TextCatBOW.v3 with exclusive classification
   • Features: Bigrams (ngram_size=2)
   • Labels: HUMAN, AI

Preparing training examples...
   • Prepared 1,000 examples...
   • Prepared 2,000 examples...
   • Prepared 3,000 examples...
   • Prepared 4,000 examples...
   • Prepared 5,000 examples...
   • Prepared 6,000 examples...
   • Prepared 7,000 examples...
   • Prepared 8,000 examples...
   • Prepared 9,000 examples...
   • Prepared 10,000 examples...
   • Prepared 11,000 examples...
   • Prepare

[2025-08-13 16:35:34,718] [INFO] Created vocabulary
[2025-08-13 16:35:34,719] [INFO] Finished initializing nlp object


119,662 training examples prepared

Initializing model...
   Model initialized successfully

Starting training loop...
   • Epochs: 10
   • Batch size: 32
   • Dropout rate: 0.3
   • Learning rate: 0.001
   📊 Epoch  1: Loss=12.3563, Time=63.0s
   📊 Epoch  2: Loss=3.5135, Accuracy=0.9950 (99.5%), Time=69.4s
   📊 Epoch  3: Loss=2.7931, Time=75.9s
   📊 Epoch  4: Loss=2.3963, Accuracy=1.0000 (100.0%), Time=81.2s
   📊 Epoch  5: Loss=2.0810, Time=84.8s
   📊 Epoch  6: Loss=1.9643, Accuracy=1.0000 (100.0%), Time=89.6s
   📊 Epoch  7: Loss=1.8487, Time=93.3s
   📊 Epoch  8: Loss=1.6160, Accuracy=1.0000 (100.0%), Time=95.9s
   📊 Epoch  9: Loss=1.8500, Time=102.1s
   📊 Epoch 10: Loss=1.7227, Accuracy=1.0000 (100.0%), Time=101.1s
Training completed successfully!

Final Model Evaluation...
   • Running full test set evaluation...


Evaluating model:   0%|          | 0/29916 [00:00<?, ?it/s]


FINAL EVALUATION RESULTS:
Overall Accuracy: 0.9987 (99.87%)
Test Samples: 29,916

Detailed Classification Report:
              precision    recall  f1-score   support

       Human     0.9982    0.9992    0.9987     14958
          AI     0.9992    0.9982    0.9987     14958

    accuracy                         0.9987     29916
   macro avg     0.9987    0.9987    0.9987     29916
weighted avg     0.9987    0.9987    0.9987     29916


Confusion Matrix:
                 Predicted
Actual       Human      AI
Human        14946        12
AI              27     14931

📈 Confidence Statistics:
   • Mean AI confidence: 0.4995
   • Mean Human confidence: 0.5005
   • Std AI confidence: 0.4998
   • Std Human confidence: 0.4998

SUCCESS! Training completed!
Total training time: 18.1 minutes
Final accuracy: 0.9987 (99.87%)
Model is ready for use!
Model saved to: ./spacy_ai_detector_WORKING

🏁 Training process completed at: 16:50:43


In [6]:


if TRAINING_SUCCESSFUL and TRAINED_MODEL is not None:
    print("🎯 Running sample predictions to verify model works correctly...")
    
    # Test with diverse sample texts
    test_texts = [
        # Human-like texts
        "I love going to the beach with my family during summer vacation. The sand feels warm between my toes and the waves are so relaxing.",
        "Yesterday I had the worst day at work. My boss was being unreasonable and I spilled coffee on my shirt. Can't wait for the weekend!",
        "My grandmother makes the best apple pie in the world. She uses a secret ingredient that she won't tell anyone, not even me.",
        
        # AI-like texts  
        "The implementation of machine learning algorithms requires careful consideration of hyperparameters and model architecture to achieve optimal performance metrics.",
        "According to recent studies, artificial intelligence has demonstrated significant advancements in natural language processing tasks, showing remarkable capabilities in text generation and comprehension.",
        "The utilization of deep learning frameworks enables efficient processing of large-scale datasets, facilitating the development of robust predictive models across various domains.",
        
        # Edge cases
        "This is a simple test sentence.",
        "The quick brown fox jumps over the lazy dog and runs through the forest quickly."
    ]
    
    print(f"\n🔍 Sample Predictions:")
    print("-" * 80)
    
    for i, text in enumerate(test_texts, 1):
        doc = TRAINED_MODEL(text)
        
        ai_confidence = doc.cats["AI"]
        human_confidence = doc.cats["HUMAN"]
        prediction = "AI" if ai_confidence > human_confidence else "Human"
        confidence = max(ai_confidence, human_confidence)
        
        print(f"Test {i:2d}: {prediction:5s} (conf: {confidence:.3f}) | {text[:70]}{'...' if len(text) > 70 else ''}")
    
    print(f"\nModel testing completed successfully!")
    print(f"The model is making predictions for both AI and Human classes")
    print(f"Final accuracy on test set: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
    
else:
    print("Cannot test model - training was not successful")
    print("Please re-run the training cell to fix any issues")

print(f"\nSpaCy AI text detector is ready!")
print("=" * 60)

🎯 Running sample predictions to verify model works correctly...

🔍 Sample Predictions:
--------------------------------------------------------------------------------
Test  1: Human (conf: 0.571) | I love going to the beach with my family during summer vacation. The s...
Test  2: Human (conf: 0.740) | Yesterday I had the worst day at work. My boss was being unreasonable ...
Test  3: Human (conf: 0.908) | My grandmother makes the best apple pie in the world. She uses a secre...
Test  4: AI    (conf: 0.962) | The implementation of machine learning algorithms requires careful con...
Test  5: AI    (conf: 0.992) | According to recent studies, artificial intelligence has demonstrated ...
Test  6: AI    (conf: 0.934) | The utilization of deep learning frameworks enables efficient processi...
Test  7: AI    (conf: 0.543) | This is a simple test sentence.
Test  8: AI    (conf: 0.747) | The quick brown fox jumps over the lazy dog and runs through the fores...

Model testing completed successfu

In [7]:
!zip -r output.zip ./

  adding: spacy_ai_detector_WORKING/ (stored 0%)
  adding: spacy_ai_detector_WORKING/config.cfg (deflated 60%)
  adding: spacy_ai_detector_WORKING/vocab/ (stored 0%)
  adding: spacy_ai_detector_WORKING/vocab/key2row (stored 0%)
  adding: spacy_ai_detector_WORKING/vocab/strings.json (deflated 80%)
  adding: spacy_ai_detector_WORKING/vocab/vectors.cfg (stored 0%)
  adding: spacy_ai_detector_WORKING/vocab/lookups.bin (stored 0%)
  adding: spacy_ai_detector_WORKING/vocab/vectors (deflated 45%)
  adding: spacy_ai_detector_WORKING/tokenizer (deflated 81%)
  adding: spacy_ai_detector_WORKING/textcat/ (stored 0%)
  adding: spacy_ai_detector_WORKING/textcat/cfg (deflated 17%)
  adding: spacy_ai_detector_WORKING/textcat/model (deflated 7%)
  adding: spacy_ai_detector_WORKING/meta.json (deflated 49%)
  adding: .virtual_documents/ (stored 0%)
