In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

# Import libraries
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import requests
import json
import time
import re
from tqdm import tqdm
import logging
import os
from google.colab import files, drive
import warnings
import pyarrow # Import pyarrow
#warnings.filterwarnings('ignore')

In [3]:
# Cell 1: Install dependencies and setup
print("🚀 Setting up Climate Mitigation Classification Pipeline")
print("=" * 60)

# Install required packages
# Added pyarrow for reading feather files
!pip install -q requests pandas numpy tqdm groq pyarrow


# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ Setup complete!")

🚀 Setting up Climate Mitigation Classification Pipeline
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Setup complete!


In [4]:

# Cell 2: Configuration and API Setup
print("🔧 Configuration Setup")
print("-" * 25)

# Configuration
CONFIG = {
    'groq_api_key': '',  # Will be set below
    'groq_model': 'llama-3.1-8b-instant',
    'n_per_institution': 10000,  # Total sample size
    'batch_size': 50,           # Reduced for better checkpoint frequency
    'delay': 1.0,               # Increased default delay for rate limiting
    'max_retries': 5,           # Maximum retries for rate limited requests
    'retry_delay': 60,          # Base delay for retries (seconds)
    'backoff_multiplier': 2     # Exponential backoff multiplier
}
# Get Groq API key
print("🔑 Please enter your Groq API key:")
print("(Get it from: https://console.groq.com/keys)")
api_key =

if not api_key:
    print("❌ API key is required!")
    raise ValueError("API key cannot be empty")

CONFIG['groq_api_key'] = api_key
print("✅ API key configured!")

SyntaxError: invalid syntax (ipython-input-4-772916558.py, line 19)

In [None]:
# Cell 3: Mount Google Drive
print("💾 Google Drive Setup")
print("-" * 25)

try:
    drive.mount('/content/drive')
    DRIVE_PATH = '/content/drive/MyDrive/Climate Mitigation Council Sentences/'

    # Create directory if it doesn't exist
    os.makedirs(DRIVE_PATH, exist_ok=True)
    print(f"✅ Google Drive mounted: {DRIVE_PATH}")
except Exception as e:
    print(f"⚠️ Google Drive mount failed: {e}")
    DRIVE_PATH = './'  # Use local directory as fallback
    print("📁 Using local directory for file storage")

In [None]:
# Cell 4: Data Upload
print("📤 Data Upload")
print("-" * 15)

# Option 1: Upload from computer
print("Choose your data source:")
print("1. Upload CSV file from your computer")
print("2. Upload from preselected directory")
print("3. Use sample data for testing")
print("4. Load Feather file from Google Drive")
print("5. Load existing results file for reclassification")
print("6. Load checkpoint file from Google Drive to resume") # New option

data_choice = input("Enter choice (1/2/3/4/5/6): ").strip()

df = None # Initialize df
is_reclassification = False
is_resuming_from_checkpoint = False # New flag

if data_choice == "1":
    print("📁 Please upload your CSV file with political speeches...")
    uploaded = files.upload()
    if uploaded:
        filename = list(uploaded.keys())[0]
        try:
            df = pd.read_csv(filename)
            print(f"✅ Loaded {len(df)} sentences from {filename}")
        except Exception as e:
            print(f"❌ Error loading file {filename}: {e}")
    else:
        print("❌ No file uploaded.")

elif data_choice == "2":
   # filename = input("Enter CSV filename (including path if needed) in your Google Drive: ")
    try:
        df = pd.read_csv('/content/drive/MyDrive/Climate Mitigation Council Sentences/stratified_sample.csv')
        print(f"✅ Loaded {len(df)} sentences from Google Drive")
    except Exception as e:
        print(f"❌ File not found or error loading file. Please check the filename and ensure Google Drive is mounted. Error: {e}")

elif data_choice == "3":
    # Create sample data for testing
    print("🧪 Creating sample dataset for testing...")
    sample_data = {
        'current': [
            'The European Green Deal aims to reduce carbon emissions by 2030.',
            'We need to invest more in renewable energy infrastructure.',
            'The parliament voted on budget allocations for next year.',
            'Climate change mitigation requires immediate policy action.',
            'The economic situation has improved significantly.',
            'Solar and wind energy are key to carbon neutrality.',
            'Education reform is essential for our future.',
            'The EU ETS system helps reduce greenhouse gas emissions.',
            'Healthcare spending increased by 15% this quarter.',
            'Forest restoration enhances natural carbon sinks.'
        ] * 1000,  # Create 10k sample sentences
        'institution': ['European Parliament', 'European Commission'] * 5000,
        'how_many_words_in_climate_mitigation_dictionary': [2, 3, 0, 4, 0, 3, 0, 5, 0, 2] * 1000
    }
    df = pd.DataFrame(sample_data)
    print(f"✅ Created sample dataset with {len(df)} sentences")

elif data_choice == "4":
     filename = input("Enter Feather filename (including path if needed) in your Google Drive: ")
     try:
         df = pd.read_feather(f'/content/drive/MyDrive/{filename}')
         print(f"✅ Loaded {len(df)} sentences from Feather file in Google Drive")
     except Exception as e:
         print(f"❌ File not found or error loading file. Please check the filename and ensure Google Drive is mounted. Error: {e}")

elif data_choice == "5":  # Option for reclassification
    filename = input("Enter results CSV filename (including path if needed) in your Google Drive: ")
    try:
        df = pd.read_csv(f'/content/drive/MyDrive/{filename}')
        print(f"✅ Loaded {len(df)} sentences from existing results file")
        is_reclassification = True

        # Check for failed classifications
        if 'llm_explanation' in df.columns:
            failed_mask = (
                df['llm_explanation'].str.contains('Error: HTTP 429', na=False) |
                df['llm_explanation'].str.contains('Rate limit exceeded', na=False) |
                df['llm_explanation'].str.contains('Error:', na=False)
            )
            failed_count = failed_mask.sum()
            print(f"📊 Found {failed_count} failed classifications that can be retried")

            if failed_count > 0:
                print("Failed error types:")
                error_types = df[failed_mask]['llm_explanation'].value_counts()
                for error, count in error_types.head(5).items():
                    print(f"   • {error}: {count} cases")
        else:
            print("⚠️ No 'llm_explanation' column found in the file")

    except Exception as e:
        print(f"❌ File not found or error loading file. Error: {e}")

elif data_choice == "6": # New option for loading checkpoint
    filename = input("Enter checkpoint CSV filename (including path if needed) in your Google Drive: ")
    try:
        df = pd.read_csv(f'/content/drive/MyDrive/{filename}')
        print(f"✅ Loaded {len(df)} sentences from checkpoint file")
        is_resuming_from_checkpoint = True # Set the flag
        print("🔄 Resuming classification from checkpoint...")

        # Show current status from checkpoint
        if 'processing_status' in df.columns:
            status_counts = df['processing_status'].value_counts()
            print("Current processing status in checkpoint:")
            for status, count in status_counts.items():
                print(f"   {status}: {count}")
        else:
             print("⚠️ 'processing_status' column not found in checkpoint. Assuming all need processing.")


    except Exception as e:
        print(f"❌ File not found or error loading file. Error: {e}")

else:
    print("⚠️ Invalid choice. Please enter 1, 2, 3, 4, 5, or 6.")


# Verify required columns (removed 'institution' from requirement list)
required_cols = ['sentence', 'how_many_words_in_climate_mitigation_dictionary']

if df is not None:
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        print(f"❌ Missing required columns: {missing_cols}")
        print(f"Available columns: {list(df.columns)}")
        print("\n🚨 Please ensure your data file contains the required columns.")
        df = None
    else:
        print("✅ All required columns found!")
        print(f"📊 Dataset shape: {df.shape}")
        if 'institution' in df.columns:
             print(f"🏛️  Institutions: {df['institution'].unique()}")
else:
    print("❌ Data loading failed or cancelled or invalid choice. Cannot proceed with classification.")

In [None]:
# Cell 5: Enhanced Climate Classification Class
class ClimateMitigationClassifier:
    def __init__(self, groq_api_key: str, model: str = 'llama-3.1-8b-instant',
                 max_retries: int = 5, retry_delay: int = 60, backoff_multiplier: float = 2):
        """Initialize the classifier with Groq API and retry configuration"""
        self.groq_api_key = groq_api_key
        self.model = model
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.backoff_multiplier = backoff_multiplier
        self.prompt_template = """**Prompt:**
Determine whether the following sentence is about **climate mitigation**.
A sentence is about *climate mitigation* if it refers to efforts, measures, technologies, or policies aimed at **reducing or preventing greenhouse gas (GHG) emissions** or **enhancing carbon sinks**, with the goal of **limiting climate change** and achieving **climate neutrality**, as targeted by the European Union.

Relevant topics include:
* Reducing emissions in sectors like energy, transport, industry, or agriculture
* Promoting renewable energy, energy efficiency, or low-carbon technologies
* Implementing EU climate policies like the European Green Deal, EU ETS, or Fit for 55
* Supporting carbon removal, sustainable land use, or natural carbon sinks

Now, analyze the sentence below and respond with either **"1"** if it is about climate mitigation in the EU policy context, or **"0"** if it is not. Provide a brief explanation for your answer.

Sentence: "{sentence}"

Response:"""

    def create_stratified_sample(self, df: pd.DataFrame,
                                 dictionary_col: str = 'how_many_words_in_climate_mitigation_dictionary',
                                 institution_col: str = 'institution',
                                 n_total_sample: int = 1000) -> pd.DataFrame:
        """Create a stratified sample, aiming for 50% sentences with dictionary terms."""
        sampled_dfs = []

        print(f"🎯 Creating stratified sample (total target: {n_total_sample} sentences)")

        if institution_col in df.columns:
            groups = df.groupby(institution_col)
            print(f"Grouping by institution ({len(df[institution_col].unique())} unique institutions)")
            n_per_institution = n_total_sample // len(df[institution_col].unique())
            print(f"Aiming for ~{n_per_institution} sentences per institution")
        else:
            groups = [(None, df)]
            print("No institution column found. Sampling from the entire dataset.")

        for group_name, group_df in groups:
            if group_name is not None:
                 print(f"\n📋 Processing group: {group_name}")

            with_terms = group_df[group_df[dictionary_col] > 0]
            without_terms = group_df[group_df[dictionary_col] == 0]

            print(f"   Available: {len(with_terms)} with terms, {len(without_terms)} without terms")

            if institution_col in df.columns:
                 n_with_terms = min(n_per_institution // 2, len(with_terms))
                 n_without_terms = min(n_per_institution // 2, len(without_terms))
                 target_group_sample = n_per_institution
            else:
                 n_with_terms = min(n_total_sample // 2, len(with_terms))
                 n_without_terms = min(n_total_sample // 2, len(without_terms))
                 target_group_sample = n_total_sample

            total_available_in_group = len(with_terms) + len(without_terms)

            if total_available_in_group < target_group_sample:
                 print(f"   ⚠️  Group has only {total_available_in_group} sentences, using all available")
                 n_with_terms = len(with_terms)
                 n_without_terms = len(without_terms)
            else:
                if n_with_terms < target_group_sample // 2:
                    n_without_terms = min(target_group_sample - n_with_terms, len(without_terms))
                elif n_without_terms < target_group_sample // 2:
                    n_with_terms = min(target_group_sample - n_without_terms, len(with_terms))

            sampled_with = with_terms.sample(n=n_with_terms, random_state=42) if n_with_terms > 0 else pd.DataFrame(columns=group_df.columns)
            sampled_without = without_terms.sample(n=n_without_terms, random_state=42) if n_without_terms > 0 else pd.DataFrame(columns=group_df.columns)

            group_sample = pd.concat([sampled_with, sampled_without], ignore_index=True)
            sampled_dfs.append(group_sample)

            if group_name is not None:
                 print(f"   ✅ Sampled {len(group_sample)} sentences for group {group_name}")
            else:
                 print(f"   ✅ Sampled {len(group_sample)} sentences from the dataset")

        final_sample = pd.concat(sampled_dfs, ignore_index=True)
        print(f"\n🎉 Total final sample size: {len(final_sample)} sentences")

        return final_sample

    def query_groq_with_retry(self, sentence: str) -> Tuple[int, str]:
        """Query Groq API with exponential backoff retry logic"""
        for attempt in range(self.max_retries + 1):
            try:
                classification, explanation = self.query_groq(sentence)

                # Check if the response indicates a rate limit error
                if "Error: HTTP 429" in explanation or "Rate limit exceeded" in explanation:
                    if attempt < self.max_retries:
                        wait_time = self.retry_delay * (self.backoff_multiplier ** attempt)
                        print(f"⚠️ Rate limit hit, waiting {wait_time:.0f}s before retry {attempt + 1}/{self.max_retries}")
                        time.sleep(wait_time)
                        continue
                    else:
                        print(f"❌ Max retries reached for rate limiting")
                        return 0, f"Error: Max retries exceeded due to rate limiting"

                # Successful response
                return classification, explanation

            except Exception as e:
                if attempt < self.max_retries:
                    wait_time = self.retry_delay * (self.backoff_multiplier ** attempt)
                    print(f"⚠️ Error occurred, waiting {wait_time:.0f}s before retry {attempt + 1}/{self.max_retries}: {str(e)}")
                    time.sleep(wait_time)
                    continue
                else:
                    return 0, f"Error: Max retries exceeded - {str(e)}"

        return 0, "Error: Unexpected retry loop exit"

    def query_groq(self, sentence: str) -> Tuple[int, str]:
        """Query Groq API for classification"""
        headers = {
            "Authorization": f"Bearer {self.groq_api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": self.prompt_template.format(sentence=sentence)}
            ],
            "temperature": 0.1,
            "max_tokens": 150
        }

        try:
            response = requests.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                generated_text = result['choices'][0]['message']['content']
                return self._parse_llm_response(generated_text)
            else:
                error_msg = f"HTTP {response.status_code}"
                if response.status_code == 401:
                    error_msg += " - Invalid API key"
                elif response.status_code == 429:
                    error_msg += " - Rate limit exceeded"
                elif response.status_code == 503:
                    error_msg += " - Service unavailable"
                return 0, f"Error: {error_msg}"

        except Exception as e:
            return 0, f"Error: {str(e)}"

    def _parse_llm_response(self, response_text: str) -> Tuple[int, str]:
        """Parse LLM response to extract binary classification"""
        response_text = response_text.strip()

        if "**1**" in response_text or response_text.startswith("1"):
            return 1, response_text
        elif "**0**" in response_text or response_text.startswith("0"):
            return 0, response_text
        elif re.search(r'\b1\b', response_text) and not re.search(r'\b0\b', response_text):
            return 1, response_text
        elif re.search(r'\b0\b', response_text) and not re.search(r'\b1\b', response_text):
            return 0, response_text
        else:
            climate_indicators = ['climate', 'emission', 'carbon', 'renewable', 'green deal', 'mitigation', 'ghg']
            if any(indicator.lower() in response_text.lower() for indicator in climate_indicators):
                return 1, response_text
            else:
                return 0, response_text

    def classify_sentences(self, df: pd.DataFrame,
                          sentence_col: str = 'sentence',
                          batch_size: int = 100,
                          delay: float = 0.3,
                          reclassify_failed: bool = False) -> pd.DataFrame:
        """Classify sentences using Groq API with improved error handling"""
        results_df = df.copy()

        # Initialize columns if they don't exist
        if 'is_climate_LLM' not in results_df.columns:
            results_df['is_climate_LLM'] = 0
        if 'llm_explanation' not in results_df.columns:
            results_df['llm_explanation'] = ""
        if 'processing_status' not in results_df.columns:
            results_df['processing_status'] = "pending"

        # Determine which sentences to process
        if reclassify_failed:
            # Only process sentences with errors or specific failed statuses
            failed_mask = (
                results_df['llm_explanation'].str.contains('Error:', na=False) |
                results_df['processing_status'].str.contains('error', na=False) |
                (results_df['processing_status'] == 'pending')
            )
            sentences_to_process = results_df[failed_mask].index.tolist()
            print(f"🔄 Reclassifying {len(sentences_to_process)} failed/pending sentences...")
        else:
            # Process all sentences
            sentences_to_process = df.index.tolist()
            print(f"🤖 Starting classification of {len(sentences_to_process)} sentences...")

        if not sentences_to_process:
            print("✅ No sentences need processing!")
            return results_df

        print(f"⏱️  Estimated time: ~{len(sentences_to_process) * delay / 60:.1f} minutes")

        # Process sentences with progress bar
        processed_count = 0
        for idx in tqdm(sentences_to_process, desc="🔍 Classifying"):
            sentence = results_df.loc[idx, sentence_col]

            # Skip if sentence is too short
            if pd.isna(sentence) or len(sentence.strip()) < 10:
                results_df.at[idx, 'processing_status'] = "skipped_short"
                continue

            try:
                # Query Groq API with retry logic
                classification, explanation = self.query_groq_with_retry(sentence)

                # Store results
                results_df.at[idx, 'is_climate_LLM'] = classification
                results_df.at[idx, 'llm_explanation'] = explanation
                results_df.at[idx, 'processing_status'] = "completed"
                processed_count += 1

                # Rate limiting
                time.sleep(delay)

                # Save checkpoint
                if processed_count % batch_size == 0:
                    save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'
                    checkpoint_file = f"{save_path}checkpoint_processed_{processed_count}.csv"
                    try:
                        results_df.to_csv(checkpoint_file, index=False)
                        completed = (results_df['processing_status'] == 'completed').sum()
                        print(f"\n💾 Checkpoint saved: {completed}/{len(results_df)} total completed at {checkpoint_file}")
                    except Exception as e:
                         print(f"\n⚠️ Could not save checkpoint to {checkpoint_file}: {e}")
                         try:
                             local_checkpoint_file = f"./checkpoint_processed_{processed_count}_local.csv"
                             results_df.to_csv(local_checkpoint_file, index=False)
                             print(f"💾 Checkpoint saved locally: {local_checkpoint_file}")
                         except Exception as local_e:
                             print(f"❌ Could not save checkpoint locally either: {local_e}")

            except Exception as e:
                logger.error(f"Error processing sentence {idx}: {str(e)}")
                results_df.at[idx, 'processing_status'] = f"error: {str(e)}"
                results_df.at[idx, 'llm_explanation'] = str(e)

        print(f"🎉 Processing complete! Processed {processed_count} sentences in this run.")
        return results_df

    def generate_summary_stats(self, results_df: pd.DataFrame) -> Dict:
        """Generate summary statistics"""
        total = len(results_df)
        climate_count = (results_df['is_climate_LLM'] == 1).sum()

        # Count processing statuses
        completed_count = (results_df['processing_status'] == 'completed').sum()
        failed_count = results_df['processing_status'].str.contains('error', na=False).sum()
        pending_count = (results_df['processing_status'] == 'pending').sum()

        stats = {
            'total_sentences': total,
            'climate_mitigation_sentences': climate_count,
            'non_climate_sentences': total - climate_count,
            'climate_percentage': round(climate_count / total * 100, 2) if total > 0 else 0,
            'processing_success_rate': round(completed_count / total * 100, 2) if total > 0 else 0,
            'completed_sentences': completed_count,
            'failed_sentences': failed_count,
            'pending_sentences': pending_count
        }

        # Stats by institution (only if column exists)
        if 'institution' in results_df.columns:
            try:
                inst_stats = results_df.groupby('institution').agg({
                    'is_climate_LLM': ['count', 'sum', lambda x: round(x.mean() * 100, 2) if x.count() > 0 else 0]
                })
                inst_stats.columns = ['Total', 'Climate', 'Climate_%']
                stats['by_institution'] = inst_stats.to_dict(orient='index')
            except Exception as e:
                print(f"⚠️ Could not generate stats by institution: {e}")

        # Stats by dictionary presence
        try:
            dict_stats = results_df.groupby(
                results_df['how_many_words_in_climate_mitigation_dictionary'] > 0
            ).agg({
                'is_climate_LLM': ['count', 'sum', lambda x: round(x.mean() * 100, 2) if x.count() > 0 else 0]
            })
            dict_stats.columns = ['Total', 'Climate', 'Climate_%']
            dict_stats.index = ['Without_Dict_Terms', 'With_Dict_Terms']
            stats['by_dictionary_presence'] = dict_stats.to_dict(orient='index')
        except Exception as e:
             print(f"⚠️ Could not generate stats by dictionary presence: {e}")

        return stats

print("✅ Enhanced Climate Classification Class defined!")

✅ Enhanced Climate Classification Class defined!


In [None]:
# Cell 6: Choose Processing Mode
if df is not None:
    if is_resuming_from_checkpoint: # New check for resuming
        print("🔄 Resuming from Checkpoint")
        print("-" * 25)
        sample_df = df # Use the loaded checkpoint data directly
        reclassify_mode = True # We are effectively reclassifying/completing the remaining
        print("✅ Using loaded checkpoint data.")

        # Show current status from checkpoint
        if 'processing_status' in sample_df.columns:
            status_counts = sample_df['processing_status'].value_counts()
            print("Current processing status in checkpoint:")
            for status, count in status_counts.items():
                print(f"   {status}: {count}")
        else:
             print("⚠️ 'processing_status' column not found in checkpoint. Assuming all need processing.")


    elif is_reclassification:
        print("🔄 Reclassification Mode")
        print("-" * 25)

        # Show current status
        if 'processing_status' in df.columns:
            status_counts = df['processing_status'].value_counts()
            print("Current processing status:")
            for status, count in status_counts.items():
                print(f"   {status}: {count}")

        # Ask what to reclassify
        print("\nWhat would you like to reclassify?")
        print("1. Only failed/error sentences")
        print("2. All sentences (complete reclassification)")

        reclassify_choice = input("Enter choice (1/2): ").strip()

        if reclassify_choice == "1":
            reclassify_mode = True
            sample_df = df  # Use the full loaded dataset
            print("✅ Will reclassify only failed sentences")
        elif reclassify_choice == "2":
            reclassify_mode = False
            sample_df = df
            print("✅ Will reclassify all sentences")
        else:
            print("❌ Invalid choice. Defaulting to failed sentences only.")
            reclassify_mode = True
            sample_df = df
    else:
        print("🎯 Creating Stratified Sample")
        print("-" * 30)

        # Initialize classifier for sampling
        classifier = ClimateMitigationClassifier(
            CONFIG['groq_api_key'],
            CONFIG['groq_model'],
            CONFIG['max_retries'],
            CONFIG['retry_delay'],
            CONFIG['backoff_multiplier']
        )

        # Create stratified sample
        sample_df = classifier.create_stratified_sample(
            df,
            n_total_sample=CONFIG['n_per_institution']
        )

        print(f"📈 Sample Statistics:")
        print(f"   Total sentences: {len(sample_df)}")
        if 'how_many_words_in_climate_mitigation_dictionary' in sample_df.columns:
            print(f"   With dict terms: {(sample_df['how_many_words_in_climate_mitigation_dictionary'] > 0).sum()}")
            print(f"   Without dict terms: {(sample_df['how_many_words_in_climate_mitigation_dictionary'] == 0).sum()}")

        # Save sample for backup
        save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'
        sample_file = f"{save_path}stratified_sample.csv"
        try:
            sample_df.to_csv(sample_file, index=False)
            print(f"💾 Sample saved: {sample_file}")
        except Exception as e:
            print(f"⚠️ Could not save sample file to {sample_file}: {e}")
            try:
                 local_sample_file = f"./stratified_sample_local.csv"
                 sample_df.to_csv(local_sample_file, index=False)
                 print(f"💾 Sample saved locally: {local_sample_file}")
            except Exception as local_e:
                 print(f"❌ Could not save sample file locally either: {local_e}")

        reclassify_mode = False

🔄 Resuming from Checkpoint
-------------------------
✅ Using loaded checkpoint data.
Current processing status in checkpoint:
   completed: 2200
   skipped_short: 13


In [None]:
# Cell 7: Run Classification
if 'sample_df' in locals() and sample_df is not None:
    print("🚀 Starting LLM Classification")
    print("-" * 35)

    # Initialize classifier with retry configuration
    classifier = ClimateMitigationClassifier(
        CONFIG['groq_api_key'],
        CONFIG['groq_model'],
        CONFIG['max_retries'],
        CONFIG['retry_delay'],
        CONFIG['backoff_multiplier']
    )

    # Determine sentences to process based on mode
    if is_resuming_from_checkpoint or (is_reclassification and reclassify_mode):
        # Filter for sentences that are not 'completed' when resuming or reclassifying failed
        unprocessed_mask = (
            sample_df['processing_status'] != 'completed'
        )
        sentences_to_process_df = sample_df[unprocessed_mask]
        sentences_to_process_indices = sentences_to_process_df.index.tolist()
        sentences_count = len(sentences_to_process_indices)

        if is_resuming_from_checkpoint:
            print(f"🔄 Resuming classification. Found {sentences_count} sentences not yet completed.")
        else: # Must be reclassification mode (option 5, choice 1)
             print(f"🔄 Reclassifying failed/pending sentences. Found {sentences_count} to reprocess.")


    else: # Normal classification (new data or reclassify all)
        sentences_to_process_df = sample_df
        sentences_to_process_indices = sample_df.index.tolist()
        sentences_count = len(sentences_to_process_indices)
        print(f"🤖 Starting fresh classification of {sentences_count} sentences...")


    if not sentences_count:
        print("✅ No sentences need processing!")
        results_df = sample_df # Ensure results_df is set even if no processing occurs
    else:
        print(f"About to classify {sentences_count} sentences")
        print(f"Estimated cost: FREE (Groq)")
        print(f"Estimated time: ~{sentences_count * CONFIG['delay'] / 60:.1f} minutes")
        print(f"Max retries per sentence: {CONFIG['max_retries']}")
        print(f"Base retry delay: {CONFIG['retry_delay']} seconds")

        proceed = input("Proceed with classification? (y/n): ").lower().strip()

        if proceed == 'y':
            print("🤖 Starting classification...")

            # Use the original sample_df as the base for results_df to keep all rows
            results_df = sample_df.copy()

            # Process sentences with progress bar, iterating only over selected indices
            processed_count = (results_df['processing_status'] == 'completed').sum() if 'processing_status' in results_df.columns else 0 # Start count from existing completions
            total_to_process_in_this_run = sentences_count # Total number of sentences that will enter the loop

            print(f"Starting processing loop for {total_to_process_in_this_run} sentences...")

            for idx in tqdm(sentences_to_process_indices, desc="🔍 Classifying"):
                sentence = results_df.loc[idx, 'sentence'] # Use results_df to get the sentence

                # Skip if sentence is too short (already handled in the method, but good practice)
                if pd.isna(sentence) or len(sentence.strip()) < 10:
                    results_df.at[idx, 'processing_status'] = "skipped_short"
                    continue

                try:
                    # Query Groq API with retry logic
                    classification, explanation = classifier.query_groq_with_retry(sentence)

                    # Store results
                    results_df.at[idx, 'is_climate_LLM'] = classification
                    results_df.at[idx, 'llm_explanation'] = explanation
                    results_df.at[idx, 'processing_status'] = "completed"

                    # Rate limiting
                    time.sleep(CONFIG['delay'])

                    # Save checkpoint - Use total completed count for filename
                    current_completed_count = (results_df['processing_status'] == 'completed').sum()
                    if current_completed_count % CONFIG['batch_size'] == 0 and current_completed_count > processed_count:
                         save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'
                         checkpoint_file = f"{save_path}checkpoint_processed_{current_completed_count}.csv"
                         try:
                             results_df.to_csv(checkpoint_file, index=False)
                             print(f"\n💾 Checkpoint saved: {current_completed_count}/{len(results_df)} total completed at {checkpoint_file}")
                         except Exception as e:
                              print(f"\n⚠️ Could not save checkpoint to {checkpoint_file}: {e}")
                              try:
                                  local_checkpoint_file = f"./checkpoint_processed_{current_completed_count}_local.csv"
                                  results_df.to_csv(local_checkpoint_file, index=False)
                                  print(f"💾 Checkpoint saved locally: {local_checkpoint_file}")
                              except Exception as local_e:
                                  print(f"❌ Could not save checkpoint locally either: {local_e}")

                    processed_count = current_completed_count # Update processed_count after potential save

                except Exception as e:
                    logger.error(f"Error processing sentence {idx}: {str(e)}")
                    results_df.at[idx, 'processing_status'] = f"error: {str(e)}"
                    results_df.at[idx, 'llm_explanation'] = str(e)

            print(f"🎉 Processing complete! Processed {total_to_process_in_this_run} sentences in this run.")

            # Save final results
            save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'
            results_file = f"{save_path}climate_classification_results.csv"
            try:
                results_df.to_csv(results_file, index=False)
                print(f"💾 Results saved: {results_file}")
            except Exception as e:
                print(f"⚠️ Could not save to drive: {e}")
                local_results_file = "./climate_classification_results_local.csv"
                results_df.to_csv(local_results_file, index=False)
                print(f"💾 Results saved locally: {local_results_file}")

        else:
            print("❌ Classification cancelled")
            # Ensure results_df is set to the initial sample_df if cancelled
            results_df = sample_df.copy()


🚀 Starting LLM Classification
-----------------------------------
🔄 Resuming classification. Found 7800 sentences not yet completed.
About to classify 7800 sentences
Estimated cost: FREE (Groq)
Estimated time: ~130.0 minutes
Max retries per sentence: 5
Base retry delay: 60 seconds
Proceed with classification? (y/n): y
🤖 Starting classification...
Starting processing loop for 7800 sentences...


🔍 Classifying:   0%|          | 39/7800 [00:34<2:50:33,  1.32s/it]

⚠️ Rate limit hit, waiting 60s before retry 1/5


🔍 Classifying:   1%|          | 49/7800 [01:47<4:24:36,  2.05s/it]

In [None]:
# Cell 8: Generate Summary Statistics
if 'results_df' in locals():
    print("📊 Generating Summary Statistics")
    print("-" * 35)

    # Generate stats
    stats = classifier.generate_summary_stats(results_df)

    # Display results
    print("🎯 CLASSIFICATION SUMMARY")
    print("=" * 40)
    print(f"Total sentences processed: {stats['total_sentences']:,}")
    print(f"Climate mitigation sentences: {stats['climate_mitigation_sentences']:,} ({stats['climate_percentage']}%)")
    print(f"Non-climate sentences: {stats['non_climate_sentences']:,}")
    print(f"Processing success rate: {stats['processing_success_rate']}%")

    # Processing status breakdown
    print(f"\n📈 PROCESSING STATUS:")
    print(f"✅ Completed: {stats['completed_sentences']:,}")
    print(f"❌ Failed: {stats['failed_sentences']:,}")
    print(f"⏳ Pending: {stats['pending_sentences']:,}")

    print("\n🏛️  BY INSTITUTION:")
    if 'by_institution' in stats:
        for institution, data in stats['by_institution']['Climate_%'].items():
            total = stats['by_institution']['Total'][institution]
            climate = stats['by_institution']['Climate'][institution]
            print(f"   {institution}: {climate}/{total} ({data}%)")

    print("\n📚 BY DICTIONARY TERMS:")
    if 'by_dictionary_presence' in stats:
        for category, data in stats['by_dictionary_presence']['Climate_%'].items():
            total = stats['by_dictionary_presence']['Total'][category]
            climate = stats['by_dictionary_presence']['Climate'][category]
            print(f"   {category}: {climate}/{total} ({data}%)")

    # Show error breakdown if there are failed sentences
    if stats['failed_sentences'] > 0:
        print(f"\n❌ ERROR BREAKDOWN:")
        error_mask = results_df['processing_status'].str.contains('error', na=False)
        if error_mask.any():
            error_types = results_df[error_mask]['llm_explanation'].value_counts()
            for error, count in error_types.head(5).items():
                print(f"   • {error}: {count} cases")

    # Save stats
    save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'
    stats_file = f"{save_path}classification_summary.json"
    try:
        with open(stats_file, 'w') as f:
            json.dump(stats, f, indent=2, default=str)
        print(f"\n💾 Statistics saved: {stats_file}")
    except Exception as e:
        print(f"\n⚠️ Could not save statistics to drive: {e}")
        try:
            local_stats_file = "./classification_summary_local.json"
            with open(local_stats_file, 'w') as f:
                json.dump(stats, f, indent=2, default=str)
            print(f"💾 Statistics saved locally: {local_stats_file}")
        except Exception as local_e:
            print(f"❌ Could not save statistics locally either: {local_e}")

In [None]:
# Cell 9: Sample Results Preview
if 'results_df' in locals():
    print("\n🔍 Sample Results Preview")
    print("-" * 30)

    # Show some climate mitigation examples
    climate_examples = results_df[results_df['is_climate_LLM'] == 1].head(3)
    non_climate_examples = results_df[results_df['is_climate_LLM'] == 0].head(3)

    if len(climate_examples) > 0:
        print("✅ Climate Mitigation Examples:")
        for idx, row in climate_examples.iterrows():
            print(f"   • {row['current'][:80]}...")
            if len(row['llm_explanation']) > 60:
                print(f"     → {row['llm_explanation'][:60]}...\n")
            else:
                print(f"     → {row['llm_explanation']}\n")

    if len(non_climate_examples) > 0:
        print("❌ Non-Climate Examples:")
        for idx, row in non_climate_examples.iterrows():
            print(f"   • {row['current'][:80]}...")
            if len(row['llm_explanation']) > 60:
                print(f"     → {row['llm_explanation'][:60]}...\n")
            else:
                print(f"     → {row['llm_explanation']}\n")

    # Show failed examples if any
    failed_examples = results_df[results_df['processing_status'].str.contains('error', na=False)].head(3)
    if len(failed_examples) > 0:
        print("⚠️ Failed Classification Examples:")
        for idx, row in failed_examples.iterrows():
            print(f"   • {row['current'][:80]}...")
            print(f"     → Status: {row['processing_status']}")
            print(f"     → Error: {row['llm_explanation'][:60]}...\n")

In [None]:
# Cell 10: Retry Failed Classifications
if 'results_df' in locals():
    print("🔄 Retry Failed Classifications")
    print("-" * 35)

    # Check for failed classifications
    failed_mask = (
        results_df['llm_explanation'].str.contains('Error:', na=False) |
        results_df['processing_status'].str.contains('error', na=False) |
        (results_df['processing_status'] == 'pending')
    )
    failed_count = failed_mask.sum()

    if failed_count > 0:
        print(f"Found {failed_count} failed/pending classifications")

        # Show error types
        print("Error breakdown:")
        error_types = results_df[failed_mask]['llm_explanation'].value_counts()
        for error, count in error_types.head(10).items():
            print(f"   • {error}: {count} cases")

        retry_choice = input(f"\nRetry these {failed_count} failed classifications? (y/n): ").lower().strip()

        if retry_choice == 'y':
            print("🔄 Starting retry process...")

            # Ask for modified retry settings
            print(f"Current retry settings:")
            print(f"   Max retries: {CONFIG['max_retries']}")
            print(f"   Base delay: {CONFIG['retry_delay']} seconds")
            print(f"   Request delay: {CONFIG['delay']} seconds")

            modify_settings = input("Modify retry settings? (y/n): ").lower().strip()

            if modify_settings == 'y':
                try:
                    new_delay = float(input(f"Request delay ({CONFIG['delay']}): ") or CONFIG['delay'])
                    new_retry_delay = int(input(f"Base retry delay ({CONFIG['retry_delay']}): ") or CONFIG['retry_delay'])
                    new_max_retries = int(input(f"Max retries ({CONFIG['max_retries']}): ") or CONFIG['max_retries'])

                    CONFIG['delay'] = new_delay
                    CONFIG['retry_delay'] = new_retry_delay
                    CONFIG['max_retries'] = new_max_retries

                    print(f"✅ Updated settings: delay={new_delay}s, retry_delay={new_retry_delay}s, max_retries={new_max_retries}")
                except ValueError:
                    print("⚠️ Invalid input, using current settings")

            # Create new classifier with updated settings
            retry_classifier = ClimateMitigationClassifier(
                CONFIG['groq_api_key'],
                CONFIG['groq_model'],
                CONFIG['max_retries'],
                CONFIG['retry_delay'],
                CONFIG['backoff_multiplier']
            )

            # Retry failed classifications
            retry_results_df = retry_classifier.classify_sentences(
                results_df,
                sentence_col='current',
                batch_size=CONFIG['batch_size'],
                delay=CONFIG['delay'],
                reclassify_failed=True
            )

            # Update results
            results_df = retry_results_df

            # Show retry results
            new_failed_count = (
                results_df['llm_explanation'].str.contains('Error:', na=False) |
                results_df['processing_status'].str.contains('error', na=False) |
                (results_df['processing_status'] == 'pending')
            ).sum()

            print(f"\n🎉 Retry complete!")
            print(f"   Previous failed: {failed_count}")
            print(f"   Still failed: {new_failed_count}")
            print(f"   Successfully retried: {failed_count - new_failed_count}")

            # Save updated results
            save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'
            retry_results_file = f"{save_path}climate_classification_results_retried.csv"
            try:
                results_df.to_csv(retry_results_file, index=False)
                print(f"💾 Updated results saved: {retry_results_file}")
            except Exception as e:
                print(f"⚠️ Could not save to drive: {e}")
                local_retry_file = "./climate_classification_results_retried_local.csv"
                results_df.to_csv(local_retry_file, index=False)
                print(f"💾 Updated results saved locally: {local_retry_file}")

        else:
            print("❌ Retry cancelled")
    else:
        print("✅ No failed classifications found!")

In [None]:
# Cell 11: Download Results
print("\n📥 Download Results")
print("-" * 20)

if 'results_df' in locals():
    print("Your results are ready for download!")
    print("Files available:")
    print("1. 📊 climate_classification_results.csv - Full results with classifications")
    print("2. 📈 classification_summary.json - Summary statistics")
    print("3. 🎯 stratified_sample.csv - Original sample used (if applicable)")

    # Show final statistics
    completed_count = (results_df['processing_status'] == 'completed').sum()
    total_count = len(results_df)
    success_rate = round(completed_count / total_count * 100, 2) if total_count > 0 else 0

    print(f"\n📊 Final Statistics:")
    print(f"   Total sentences: {total_count:,}")
    print(f"   Successfully processed: {completed_count:,} ({success_rate}%)")
    print(f"   Climate mitigation: {(results_df['is_climate_LLM'] == 1).sum():,}")

    # Option to download directly
    download_choice = input("\nDownload results now? (y/n): ").lower().strip()

    if download_choice == 'y':
        save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'

        # Download main results
        try:
            files.download(f"{save_path}climate_classification_results.csv")
            print("✅ Results downloaded!")
        except:
            try:
                files.download("./climate_classification_results_local.csv")
                print("✅ Local results downloaded!")
            except Exception as e:
                print(f"❌ Could not download results: {e}")

        # Option to download summary
        download_summary = input("Download summary stats? (y/n): ").lower().strip()
        if download_summary == 'y':
            try:
                files.download(f"{save_path}classification_summary.json")
                print("✅ Summary downloaded!")
            except:
                try:
                    files.download("./classification_summary_local.json")
                    print("✅ Local summary downloaded!")
                except Exception as e:
                    print(f"❌ Could not download summary: {e}")

    print(f"\n📁 All files are also saved in your Google Drive: {DRIVE_PATH}")
else:
    print("❌ No results available. Please run the classification first.")

print("\n🎉 Climate Mitigation Classification Pipeline Complete!")
print("=" * 60)

In [None]:
# Cell 12: Quick Status Check Function
def check_classification_status(df):
    """Quick function to check the status of classifications"""
    if df is None:
        print("❌ No data loaded")
        return

    print("📊 Classification Status Check")
    print("-" * 30)

    # Basic counts
    total = len(df)
    print(f"Total sentences: {total:,}")

    if 'processing_status' in df.columns:
        status_counts = df['processing_status'].value_counts()
        print("\nProcessing Status:")
        for status, count in status_counts.items():
            percentage = round(count / total * 100, 2)
            print(f"   {status}: {count:,} ({percentage}%)")

    if 'is_climate_LLM' in df.columns:
        climate_count = (df['is_climate_LLM'] == 1).sum()
        print(f"\nClassification Results:")
        print(f"   Climate mitigation: {climate_count:,} ({round(climate_count/total*100, 2)}%)")
        print(f"   Non-climate: {total - climate_count:,} ({round((total-climate_count)/total*100, 2)}%)")

    if 'llm_explanation' in df.columns:
        # Check for errors
        error_mask = df['llm_explanation'].str.contains('Error:', na=False)
        error_count = error_mask.sum()
        if error_count > 0:
            print(f"\nErrors found: {error_count:,}")
            print("Top error types:")
            error_types = df[error_mask]['llm_explanation'].value_counts()
            for error, count in error_types.head(5).items():
                print(f"   • {error}: {count}")

# Usage example:
# check_classification_status(results_df)

In [None]:
# Cell 13: Randomly Sample Classified Sentences
if 'results_df' in locals() and results_df is not None:
    print("\n📋 Randomly Sampling Classified Sentences")
    print("-" * 45)

    sample_size = 1000

    if len(results_df) >= sample_size:
        random_sample_df = results_df.sample(n=sample_size, random_state=42)
        print(f"✅ Successfully sampled {sample_size} sentences from the classified results.")

        # Display the head of the sampled data
        print("\nSampled Data Head:")
        display(random_sample_df.head())

        # Optionally save the random sample
        save_path = DRIVE_PATH if 'DRIVE_PATH' in globals() and DRIVE_PATH is not None else './'
        random_sample_file = f"{save_path}random_classified_sample_{sample_size}.csv"
        try:
            random_sample_df.to_csv(random_sample_file, index=False)
            print(f"\n💾 Random sample saved: {random_sample_file}")
        except Exception as e:
            print(f"\n⚠️ Could not save random sample to drive: {e}")
            try:
                local_random_sample_file = f"./random_classified_sample_{sample_size}_local.csv"
                random_sample_df.to_csv(local_random_sample_file, index=False)
                print(f"💾 Random sample saved locally: {local_random_sample_file}")
            except Exception as local_e:
                print(f"❌ Could not save random sample locally either: {local_e}")

    else:
        print(f"⚠️ The total number of classified sentences ({len(results_df)}) is less than the requested sample size ({sample_size}). Skipping random sampling.")

else:
    print("❌ No classified results available (results_df not found). Skipping random sampling.")

AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
df = pd.read_feather('/content/drive/MyDrive/Climate Mitigation Council Sentences/Council_sentences_climdict_filtered.feather')
df.head()

  cast_date_col = pd.to_datetime(column, errors="coerce")


Unnamed: 0,new_id,sentence_id,total_sentences_in_doc,sentence,id_within,filename,notes,venue,date,start_date,end_date,year,type_document,title,how_many_words_in_climate_mitigation_dictionary
0,1,1,13,The Heads of State or of Government and the Fo...,1,1961_02.pdf,,Luxembourg,,2/10/61,2/11/61,1961,Meeting statement,Conference of the Heads of State or of Governm...,0
1,1,2,13,Special links already unite the six countries ...,1,1961_02.pdf,,Luxembourg,,2/10/61,2/11/61,1961,Meeting statement,Conference of the Heads of State or of Governm...,0
2,1,3,13,"The six Governments are anx¬ious to seek, in a...",1,1961_02.pdf,,Luxembourg,,2/10/61,2/11/61,1961,Meeting statement,Conference of the Heads of State or of Governm...,0
3,1,4,13,"They will attempt, in the same spirit, to find...",1,1961_02.pdf,,Luxembourg,,2/10/61,2/11/61,1961,Meeting statement,Conference of the Heads of State or of Governm...,0
4,1,5,13,It was the purpose of the Conference to seek t...,1,1961_02.pdf,,Luxembourg,,2/10/61,2/11/61,1961,Meeting statement,Conference of the Heads of State or of Governm...,0
