# Submissão zero-shot

## Utils de API 

In [178]:
import pandas as pd
import anthropic
import os
import time
import json
from tqdm import tqdm
from google import genai
from google.genai import types


# Anthropic client; the API key is read from the ANTHROPIC_API_KEY environment variable.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# gemini-embedding-exp
# Google Gen AI client; the API key is read from the GEMINI_API_KEY environment variable.
client_google = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Provider identifiers accepted by classify_texts_batched(provider=...).
PROVIDER_ANTHROPIC = "anthropic"
PROVIDER_GOOGLE = "google"

def _strip_code_fence(text):
    """Return *text* with any surrounding Markdown code fence removed."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the whole opening fence line ("```" or "```csv").
        cleaned = cleaned.partition("\n")[2]
    # Strip before removing the closing fence so that a trailing newline
    # after the fence does not hide it from removesuffix.
    return cleaned.strip().removesuffix("```").strip()


def _parse_label_csv(csv_text, batch_len):
    """Parse an ``ID,LABEL`` CSV reply into ``{relative_id: label}``.

    Rows whose first cell is not an integer (header line, stray fence,
    commentary), whose ID falls outside ``1..batch_len`` or whose label is
    empty are skipped. Labels are stripped and upper-cased, with ``HUMAN``
    mapped to ``Human``; any other value is kept upper-cased so unexpected
    answers stay visible in the results.
    """
    import csv
    import io

    labels = {}
    for record in csv.reader(io.StringIO(csv_text)):
        if len(record) < 2:
            continue
        try:
            relative_idx = int(record[0].strip())
        except ValueError:
            continue
        # IDs are 1-based within the batch; anything else would spill into
        # rows that belong to another batch.
        if not 1 <= relative_idx <= batch_len:
            continue
        label = record[1].strip().upper()
        if not label:
            continue
        labels[relative_idx] = 'Human' if label == 'HUMAN' else label
    return labels


def classify_texts_batched(df_input, output_csv_path, batch_size=10, model="claude-3-7-sonnet-20250219", provider="anthropic"):
    """Classify every row of *df_input* as 'Human' or 'AI' with an LLM.

    The texts are sent in batches of ``batch_size``; the model is asked to
    answer with an ``ID,LABEL`` CSV, which is parsed and written into a
    ``Label`` column. Each batch is attempted up to three times with
    exponential backoff. A reply that contains no usable row counts as a
    failed attempt. A batch that fails every attempt is labelled ``'error'``;
    rows the model simply omitted keep an empty label.

    Args:
        df_input: DataFrame with a ``Text`` column (sent to the model) and an
            ``ID`` column (used as the index of the result). Not modified.
        output_csv_path: Destination of the tab-separated result file.
        batch_size: Number of texts per request; must be at least 1.
        model: Model identifier passed to the provider's API.
        provider: ``PROVIDER_ANTHROPIC`` selects the Anthropic client; any
            other value is routed to the Google client.

    Returns:
        DataFrame indexed by ``ID`` with the single column ``Label``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``df_input`` lacks the ``Text`` or ``ID`` column.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    df = df_input.copy()
    df['Label'] = ''
    # Batches are addressed by position, so labels are written positionally
    # too; label-based access would hit the wrong rows whenever the input
    # index is not 0..n-1 (e.g. a filtered frame).
    label_col = df.columns.get_loc('Label')
    
    system_prompt = """
    You are an expert at identifying AI-generated text versus human-written text.
    
    Analyze each numbered text sample carefully and classify it as either 'Human' or 'AI' based on these criteria:
    
    Human-written text often:
    - Contains personal anecdotes or emotional nuance
    - Has natural irregularities, varying sentence structures
    - May include idioms, slang, or colloquialisms 
    - Can have slight grammatical errors or typos
    - Often has a distinctive voice or style
    
    AI-generated text often:
    - Has more uniform sentence structures
    - Uses more formal or academic language consistently
    - Organizes information very systematically
    - Rarely contains spelling errors or typos
    - May have repetitive patterns or phrasing
    
    IMPORTANT: Return your analysis as a CSV format with two columns (ID,LABEL) where classification is ONLY 'Human' or 'AI'.
    Do not include any other text in your response besides the CSV data.
    Do not add a prefix such as csv.
    Example output format:
    ID,LABEL
    1,Human
    2,AI
    3,Human
    """

    # Process dataframe in batches
    num_samples = len(df)
    num_batches = (num_samples + batch_size - 1) // batch_size  # Ceiling division

    print(f"Processing {num_samples} text samples in {num_batches} batches of size {batch_size}...")

    max_retries = 3

    for batch_idx in tqdm(range(num_batches)):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_samples)
        batch_len = end_idx - start_idx

        # Number the texts 1..batch_len; the model echoes these IDs back.
        batch_text = "".join(
            f"Text {relative_idx}: {text}\n\n"
            for relative_idx, text in enumerate(df['Text'].iloc[start_idx:end_idx], start=1)
        )

        user_message = f"Please classify each of the following texts as either 'Human' or 'AI':\n\n{batch_text}\n\nReturn your analysis in CSV format with columns 'ID' and 'Label'."

        retry_delay = 2  # seconds; doubled after every failed attempt

        for attempt in range(max_retries):
            try:
                if provider == PROVIDER_ANTHROPIC:
                    response = client.messages.create(
                        model=model,
                        system=system_prompt,
                        max_tokens=1000,
                        messages=[
                            {"role": "user", "content": user_message}
                        ]
                    )
                    raw_text = response.content[0].text
                else:  # google
                    response = client_google.models.generate_content(
                        model=model,
                        contents=user_message,
                        config=types.GenerateContentConfig(
                            system_instruction=system_prompt
                        )
                    )
                    # response.text may be missing; treat that as an empty reply.
                    raw_text = response.text or ""

                # Some models wrap the CSV in a ```csv fence despite the prompt.
                labels = _parse_label_csv(_strip_code_fence(raw_text), batch_len)
                if not labels:
                    raise ValueError(f"no valid 'ID,LABEL' rows in response: {raw_text!r}")

            # Boundary with two third-party SDKs: any failure (network, API,
            # malformed reply) is handled the same way, by retrying.
            except Exception as e:
                print(f"Error on attempt {attempt+1}: {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    df.iloc[start_idx:end_idx, label_col] = 'error'
                    print(f"Failed to classify batch after {max_retries} attempts.")
            else:
                # Labels are applied only after a fully parsed reply, so a
                # failed attempt never leaves a half-written batch behind.
                for relative_idx, label in labels.items():
                    df.iloc[start_idx + relative_idx - 1, label_col] = label
                break

        # Add a small delay between batches to respect rate limits
        time.sleep(1)

    # Build the result without in-place mutation of a column slice.
    df = df[['ID', 'Label']].set_index('ID')
    df.to_csv(output_csv_path, index=True, sep='\t')
    print(f"Classification complete. Results saved to {output_csv_path}")

    # Return summary statistics
    human_count = (df['Label'] == 'Human').sum()
    ai_count = (df['Label'] == 'AI').sum()
    other_count = len(df) - human_count - ai_count

    print(f"Summary:\n- Human texts: {human_count}\n- AI texts: {ai_count}\n- Other/errors: {other_count}")

    return df

In [None]:
import pandas as pd
# Load the disclosed, labelled dataset (semicolon-separated) and normalise the
# label spelling so that 'Ai' and 'AI' count as the same class.
df_test = pd.read_csv('dataset2_disclosed_complete.csv', sep=';')
df_test['Label'] = df_test['Label'].astype(str).replace({'Ai': 'AI'})

from sklearn.metrics import classification_report, confusion_matrix

def test_model(y_pred):
    """Print a classification report and a confusion matrix for *y_pred*.

    The predictions are cast to ``str`` and compared against the
    module-level ``df_test['Label']`` column.
    """
    predicted = y_pred.astype(str)
    expected = df_test['Label']
    for metric in (classification_report, confusion_matrix):
        print(metric(expected, predicted))

# Teste dos novos modelos da Google

### O gemini mais pequeno 

In [184]:
# Zero-shot run of gemini-1.5-flash-8b on df_test, followed by the metrics report.
test_flash_8b = classify_texts_batched(
    df_test,
    "test-flash-8b-zeroshot.csv",
    batch_size=10,
    model="gemini-1.5-flash-8b",
    provider=PROVIDER_GOOGLE,
)
test_model(test_flash_8b['Label'])

Processing 100 text samples in 10 batches of size 10...


100%|██████████| 10/10 [00:16<00:00,  1.62s/it]

Classification complete. Results saved to test-flash-8b-zeroshot.csv
Summary:
- Human texts: 23
- AI texts: 77
- Other/errors: 0
              precision    recall  f1-score   support

          AI       0.53      0.84      0.65        49
       Human       0.65      0.29      0.41        51

    accuracy                           0.56       100
   macro avg       0.59      0.57      0.53       100
weighted avg       0.59      0.56      0.53       100

[[41  8]
 [36 15]]





### O Gemini Flash (modelo mais equilibrado)

In [185]:
# Zero-shot run of gemini-2.0-flash on df_test, followed by the metrics report.
test_flash = classify_texts_batched(
    df_test,
    "test-flash-normal-zeroshot.csv",
    batch_size=10,
    model="gemini-2.0-flash",
    provider=PROVIDER_GOOGLE,
)
test_model(test_flash['Label'])

Processing 100 text samples in 10 batches of size 10...


100%|██████████| 10/10 [00:16<00:00,  1.62s/it]

Classification complete. Results saved to test-flash-normal-zeroshot.csv
Summary:
- Human texts: 18
- AI texts: 82
- Other/errors: 0
              precision    recall  f1-score   support

          AI       0.59      0.98      0.73        49
       Human       0.94      0.33      0.49        51

    accuracy                           0.65       100
   macro avg       0.76      0.66      0.61       100
weighted avg       0.77      0.65      0.61       100

[[48  1]
 [34 17]]





### Gemini pro 2.5, modelo para tarefas mais complexas

In [None]:
# Zero-shot run of gemini-2.5-pro-exp-03-25 on df_test, followed by the metrics report.
test_gemini_pro = classify_texts_batched(
    df_test,
    "test-gemini-pro-zeroshot.csv",
    batch_size=10,
    model="gemini-2.5-pro-exp-03-25",
    provider=PROVIDER_GOOGLE,
)
test_model(test_gemini_pro['Label'])


              precision    recall  f1-score   support

          AI       0.55      0.98      0.70        49
       Human       0.92      0.22      0.35        51

    accuracy                           0.59       100
   macro avg       0.73      0.60      0.52       100
weighted avg       0.73      0.59      0.52       100

[[48  1]
 [40 11]]


## Anthropic Test 

# Claude sonnet, o modelo mais equilibrado. Usado na submissão anterior.

In [188]:
# Zero-shot run of claude-3-7-sonnet-20250219 on df_test, followed by the
# metrics report.
# 6 cents cost
test_claude = classify_texts_batched(
    df_test,
    "test-claude-zeroshot.csv",
    batch_size=10,
    model="claude-3-7-sonnet-20250219",
    provider=PROVIDER_ANTHROPIC,
)
test_model(test_claude['Label'])

Processing 100 text samples in 10 batches of size 10...


100%|██████████| 10/10 [00:24<00:00,  2.46s/it]

Classification complete. Results saved to test-claude-zeroshot.csv
Summary:
- Human texts: 42
- AI texts: 58
- Other/errors: 0
              precision    recall  f1-score   support

          AI       0.81      0.96      0.88        49
       Human       0.95      0.78      0.86        51

    accuracy                           0.87       100
   macro avg       0.88      0.87      0.87       100
weighted avg       0.88      0.87      0.87       100

[[47  2]
 [11 40]]





### Modelo mais eficiente da anthropic

In [None]:
# Zero-shot run of claude-3-haiku-20240307 on df_test, followed by the metrics report.
test_haiku = classify_texts_batched(
    df_test,
    "test-haiku-zeroshot.csv",
    batch_size=10,
    model="claude-3-haiku-20240307",
    provider=PROVIDER_ANTHROPIC,
)
test_model(test_haiku['Label'])

              precision    recall  f1-score   support

          AI       0.49      0.37      0.42        49
       Human       0.51      0.63      0.56        51

    accuracy                           0.50       100
   macro avg       0.50      0.50      0.49       100
weighted avg       0.50      0.50      0.49       100

[[18 31]
 [19 32]]


### Modelo mais complexo da anthropic 

O Opus foi testado anteriormente e teve resultados significativamente piores do que o Claude Sonnet: quase 30% de diferença na accuracy.


# Submissão do melhor modelo em zeroshot

In [195]:
import pandas as pd

# Read the semicolon-separated submission inputs and classify them with
# claude-3-7-sonnet-20250219; predictions are written to
# submissao3-grupo008-s1.csv. The call is kept as the last expression so the
# notebook displays the returned frame.
df_input = pd.read_csv('submission3_inputs.csv', sep=';')

classify_texts_batched(
    df_input,
    "submissao3-grupo008-s1.csv",
    batch_size=10,
    model="claude-3-7-sonnet-20250219",
    provider=PROVIDER_ANTHROPIC,
)

Processing 100 text samples in 10 batches of size 10...


100%|██████████| 10/10 [00:26<00:00,  2.62s/it]

Classification complete. Results saved to submissao3-grupo008-s1.csv
Summary:
- Human texts: 41
- AI texts: 59
- Other/errors: 0





Unnamed: 0_level_0,Label
ID,Unnamed: 1_level_1
D3-1,Human
D3-2,AI
D3-3,AI
D3-4,Human
D3-5,Human
...,...
D3-96,AI
D3-97,Human
D3-98,AI
D3-99,AI
