# Submissão zero-shot com Claude

In [7]:
import pandas as pd
import anthropic
import os
import time
import json
from tqdm import tqdm

# Shared Anthropic API client; the key is read from the ANTHROPIC_API_KEY
# environment variable (None is passed through when it is unset).
_anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
client = anthropic.Anthropic(api_key=_anthropic_api_key)

def _parse_label_csv(csv_text, batch_len):
    """Parse the model's ``ID,LABEL`` reply into ``{zero_based_offset: label}``.

    Markdown code fences, the header row and any line whose first field is
    not an integer in ``1..batch_len`` are ignored. Labels are stripped and
    matched case-insensitively to the canonical ``'Human'`` / ``'AI'``
    spellings; any other label is kept verbatim (stripped).
    """
    import csv
    import io

    canonical = {"human": "Human", "ai": "AI"}

    # The model sometimes wraps the CSV in ``` fences; drop those lines.
    payload = "\n".join(
        line for line in csv_text.splitlines()
        if not line.strip().startswith("```")
    )

    parsed = {}
    for fields in csv.reader(io.StringIO(payload)):
        if len(fields) < 2:
            continue
        try:
            relative_idx = int(fields[0].strip())
        except ValueError:
            # Header row ("ID,LABEL") or stray commentary.
            continue
        # Reject IDs outside this batch so they can never touch other rows.
        if not 1 <= relative_idx <= batch_len:
            continue
        raw_label = fields[1].strip()
        parsed[relative_idx - 1] = canonical.get(raw_label.casefold(), raw_label)
    return parsed


def classify_texts_batched(df: pd.DataFrame, output_csv_path, batch_size: int = 10,
                           model: str = "claude-3-7-sonnet-20250219") -> pd.DataFrame:
    """Classify every row of ``df`` as 'Human' or 'AI' with a zero-shot prompt.

    The rows are sent to the Anthropic Messages API in batches; each reply is
    expected to be a small ``ID,LABEL`` CSV whose IDs are 1-based positions
    inside the batch.

    Args:
        df: Input frame with a ``'Text'`` column (the samples) and an ``'ID'``
            column (used as the index of the saved file). A ``'Label'`` column
            is added to / overwritten on this frame in place.
        output_csv_path: Destination of the tab-separated result file.
        batch_size: Number of texts per API request (must be >= 1).
        model: Anthropic model identifier.

    Returns:
        A new frame indexed by ``'ID'`` without the ``'Text'`` column, holding
        the ``'Label'`` values. Rows of a batch that failed after all retries
        are labelled ``'error'``; rows the model omitted stay ``''``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    df['Label'] = ''
    # All writes below are positional (iloc) so that a non-default index
    # (filtered, shuffled or string-labelled frame) cannot misplace labels.
    label_col = df.columns.get_loc('Label')

    # The example rows use the exact spellings requested above ('Human'/'AI');
    # the summary at the end counts those spellings.
    system_prompt = """
    You are an expert at identifying AI-generated text versus human-written text.
    
    Analyze each numbered text sample carefully and classify it as either 'Human' or 'AI' based on these criteria:
    
    Human-written text often:
    - Contains personal anecdotes or emotional nuance
    - Has natural irregularities, varying sentence structures
    - May include idioms, slang, or colloquialisms 
    - Can have slight grammatical errors or typos
    - Often has a distinctive voice or style
    
    AI-generated text often:
    - Has more uniform sentence structures
    - Uses more formal or academic language consistently
    - Organizes information very systematically
    - Rarely contains spelling errors or typos
    - May have repetitive patterns or phrasing
    
    IMPORTANT: Return your analysis as a CSV format with two columns (ID,LABEL) where classification is ONLY 'Human' or 'AI'.
    Do not include any other text in your response besides the CSV data.
    Example output format:
    ID,LABEL
    1,Human
    2,AI
    3,Human
    """

    num_samples = len(df)
    num_batches = (num_samples + batch_size - 1) // batch_size  # Ceiling division
    max_retries = 3

    print(f"Processing {num_samples} text samples in {num_batches} batches of size {batch_size}...")

    for batch_idx in tqdm(range(num_batches)):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_samples)
        batch_len = end_idx - start_idx

        # Number the texts 1..batch_len; the model echoes these numbers as IDs.
        batch_text = "".join(
            f"Text {relative_idx}: {text}\n\n"
            for relative_idx, text in enumerate(df['Text'].iloc[start_idx:end_idx], start=1)
        )
        user_message = f"Please classify each of the following texts as either 'Human' or 'AI':\n\n{batch_text}\n\nReturn your analysis in CSV format with columns 'ID' and 'Label'."

        # Leave room for one short CSV line per text so larger batches are
        # not truncated; never go below the original budget of 100 tokens.
        max_tokens = max(100, 10 * batch_len + 20)

        retry_delay = 2
        for attempt in range(max_retries):
            try:
                response = client.messages.create(
                    model=model,
                    system=system_prompt,
                    max_tokens=max_tokens,
                    messages=[
                        {"role": "user", "content": user_message}
                    ]
                )
                csv_response = response.content[0].text.strip()
                labels = _parse_label_csv(csv_response, batch_len)
                if not labels:
                    # An unusable reply is retried like any other failure
                    # instead of silently leaving the batch unlabeled.
                    raise ValueError(f"no ID,LABEL rows in response: {csv_response!r}")
            except Exception as e:
                # Broad on purpose: network, API and malformed-reply errors
                # are all handled by retrying the whole batch.
                print(f"Error on attempt {attempt+1}: {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    df.iloc[start_idx:end_idx, label_col] = 'error'
                    print(f"Failed to classify batch after {max_retries} attempts.")
            else:
                for offset, label in labels.items():
                    df.iloc[start_idx + offset, label_col] = label

                missing = batch_len - len(labels)
                if missing:
                    print(f"Warning: batch {batch_idx} is missing labels for {missing} text(s)")

                print(f"Processed sucessfully batch {batch_idx}")
                break

        # Small delay between batches to respect rate limits; pointless
        # after the final batch.
        if batch_idx < num_batches - 1:
            time.sleep(5)

    result = df.drop(['Text'], axis=1).set_index('ID')
    result.to_csv(output_csv_path, index=True, sep='\t')
    print(f"Classification complete. Results saved to {output_csv_path}")

    # Summary statistics
    human_count = (result['Label'] == 'Human').sum()
    ai_count = (result['Label'] == 'AI').sum()
    other_count = len(result) - human_count - ai_count

    print(f"Summary:\n- Human texts: {human_count}\n- AI texts: {ai_count}\n- Other/errors: {other_count}")

    return result

In [None]:
import pandas as pd

# Load the semicolon-separated input samples and classify them in batches
# of 10, writing the labels to the submission file. The call is left as the
# cell's final expression so the returned frame is still displayed.
input_csv_path = 'dataset3_inputs.csv'
submission_csv_path = "submissao2-grupo008-s1.csv"

df_input = pd.read_csv(input_csv_path, sep=';')

classify_texts_batched(df_input, submission_csv_path, batch_size=10)

Processing 100 text samples in 10 batches of size 10...


  0%|          | 0/10 [00:00<?, ?it/s]

Processed sucessfully batch 0


 10%|█         | 1/10 [00:06<01:00,  6.69s/it]

Processed sucessfully batch 1


 20%|██        | 2/10 [00:13<00:53,  6.75s/it]

Processed sucessfully batch 2


 30%|███       | 3/10 [00:20<00:47,  6.76s/it]

Processed sucessfully batch 3


 40%|████      | 4/10 [00:26<00:38,  6.46s/it]

Processed sucessfully batch 4


 50%|█████     | 5/10 [00:33<00:32,  6.59s/it]

Processed sucessfully batch 5


 60%|██████    | 6/10 [00:39<00:26,  6.68s/it]

Processed sucessfully batch 6


 70%|███████   | 7/10 [00:46<00:19,  6.66s/it]

Processed sucessfully batch 7


 80%|████████  | 8/10 [00:53<00:13,  6.72s/it]

Processed sucessfully batch 8


 90%|█████████ | 9/10 [00:59<00:06,  6.50s/it]

Processed sucessfully batch 9


100%|██████████| 10/10 [01:06<00:00,  6.61s/it]

Classification complete. Results saved to submissao1-grupo008-s1.csv
Summary:
- Human texts: 39
- AI texts: 61
- Other/errors: 0





Unnamed: 0_level_0,Label
ID,Unnamed: 1_level_1
D3-1,AI
D3-2,AI
D3-3,AI
D3-4,Human
D3-5,Human
...,...
D2-94,AI
D2-96,Human
D2-97,AI
D2-98,Human
