# Submissão few-shot

## Utils da pesquisa semântica

In [16]:
import numpy as np
import pandas as pd
import os

def load_dataset(dataset_version ):
    """Load every available split of one dataset version as a single DataFrame.

    Files are read from ``../datasets/<dataset_version>/`` and are named
    ``Dataset<dataset_version>_<split>_clean.csv`` (comma separated).
    The train and test splits are always read; the validation split is
    appended only when its file exists.

    Returns:
        The splits concatenated in the order train, test, validation.
        Each split keeps its own row index (no re-indexing is done).
    """
    folder = f'../datasets/{dataset_version}/'

    def split_path(split):
        # Full path of the CSV that holds the given split.
        return os.path.join(folder, f'Dataset{dataset_version}_{split}_clean.csv')

    frames = [pd.read_csv(split_path(split), sep=',') for split in ('train', 'test')]

    # The validation split is optional
    optional_path = split_path('validation')
    if os.path.exists(optional_path):
        frames.append(pd.read_csv(optional_path, sep=','))

    return pd.concat(frames)

from langchain_core.documents import Document

def fill_db(vector_store, dataset_list_id ):
    """Insert the rows of each listed dataset into ``vector_store``.

    For every id in ``dataset_list_id`` the dataset is loaded with
    ``load_dataset`` and each row becomes one ``Document`` whose content is
    the row's ``text`` column and whose metadata is ``{"label": <Label>}``.

    Args:
        vector_store: object exposing ``add_documents(list_of_documents)``.
        dataset_list_id: iterable of dataset versions accepted by ``load_dataset``.
    """
    for dataset_id in dataset_list_id:
        df = load_dataset(dataset_id)
        print(f"Loading dataset {dataset_id}\n")

        # Walk both columns once instead of doing two .iloc lookups per row.
        documents = [
            Document(page_content=text, metadata={"label": label})
            for text, label in zip(df['text'], df['Label'])
        ]

        # Nothing to insert for an empty dataset; do not hand an empty batch to the store.
        if not documents:
            continue

        # Original author's timing note: 4min for 20k rows
        vector_store.add_documents(documents)

# Formatar em csv os exemplos obtidos na query da db
def formatExamples(results):
    """Turn vector-store search results into a two-column DataFrame.

    Args:
        results: iterable of documents, each exposing ``metadata['label']``
            and ``page_content``.

    Returns:
        DataFrame with columns ``Label`` and ``Text``, one row per document
        in the order received. An empty input gives an empty frame that still
        has both columns.
    """
    # Collect all rows first and build the frame once; assigning df.loc[i]
    # row by row re-allocates the frame on every insertion.
    records = [(doc.metadata['label'], doc.page_content) for doc in results]
    return pd.DataFrame(records, columns=['Label', 'Text'])


# vector_store.similarity_search(query=e1,k=2,filter={"label": "AI"})
# 

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Sentence-embedding model used both to index the examples and to embed queries.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

vector_store = Chroma(
    collection_name="human_vs_ai",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

# The collection is persisted under ./chroma_langchain_db and fill_db() calls
# add_documents() without explicit ids, so re-running this cell is expected to
# append the same documents again (duplicated few-shot examples).
# Only populate the store when it is still empty.
# NOTE(review): a store left partially filled by an interrupted run is treated
# as populated; delete the persist directory to rebuild it from scratch.
if vector_store.get(limit=1)["ids"]:
    print("Vector store already populated; skipping fill_db")
else:
    fill_db(vector_store,[3,5,6])

Loading dataset 3

Loading dataset 5

Loading dataset 6



## Utils de API 

In [22]:
import pandas as pd
import anthropic
import os
import time
import json
from tqdm import tqdm
from google import genai
from google.genai import types


# Identifiers accepted as the `provider` argument when choosing which API to call.
PROVIDER_ANTHROPIC = "anthropic"
PROVIDER_GOOGLE = "google"

# Both clients take their key from the environment; an unset variable is passed on as None.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Original author's note: gemini-embedding-exp
client_google = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def _strip_code_fence(raw_text):
    """Return ``raw_text`` without a surrounding Markdown code fence, if it has one."""
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        # Drop the whole opening fence line, including an optional language tag such as "csv".
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rstrip().removesuffix("```")
    return cleaned.strip()


def _collect_examples(vector_store, text, shots, shotsPerLabel):
    """Return the example DataFrames retrieved for one text (AI first, then Human, when per label)."""
    if shotsPerLabel:
        return [
            formatExamples(vector_store.similarity_search(query=text, k=shots, filter={"label": tag}))
            for tag in ("AI", "Human")
        ]
    return [formatExamples(vector_store.similarity_search(query=text, k=shots))]


def _request_csv(user_message, system_prompt, model, provider):
    """Send one prompt to the selected provider and return the reply text without code fences."""
    if provider == PROVIDER_ANTHROPIC:
        response = client.messages.create(
            model=model,
            system=system_prompt,
            max_tokens=2000,
            messages=[
                {"role": "user", "content": user_message}
            ]
        )
        raw_text = response.content[0].text
    else: # google
        response = client_google.models.generate_content(
            model=model,
            contents=user_message,
            config=types.GenerateContentConfig(
                system_instruction=system_prompt
            )
        )
        raw_text = response.text
    # Models sometimes wrap the CSV in a fenced block; strip it for every provider.
    return _strip_code_fence(raw_text)


def _parse_batch_labels(csv_response, batch_len):
    """Parse a CSV reply into ``({1-based position: label}, skipped_row_count)``.

    Rows are skipped when the first column is not an integer in
    ``1..batch_len`` or when the label is missing. Labels are stripped and
    upper-cased; ``HUMAN`` is stored as ``Human``.

    Raises:
        ValueError: the reply has fewer than two columns.
        Exception: whatever ``pd.read_csv`` raises on malformed or empty text.
    """
    import io  # local import: the notebook's import cell does not import io

    parsed = pd.read_csv(io.StringIO(csv_response))
    if parsed.shape[1] < 2:
        raise ValueError("expected two columns (ID, LABEL) in the model response")

    labels = {}
    skipped = 0
    for raw_id, raw_label in zip(parsed.iloc[:, 0], parsed.iloc[:, 1]):
        try:
            position = int(raw_id)
        except (TypeError, ValueError):
            skipped += 1
            continue
        # An id outside the batch would otherwise overwrite a row of another batch.
        if pd.isna(raw_label) or not 1 <= position <= batch_len:
            skipped += 1
            continue
        label = str(raw_label).strip().upper()
        labels[position] = 'Human' if label == 'HUMAN' else label
    return labels, skipped


def classify_texts_with_shots(vector_store,shots,shotsPerLabel,df_input, output_csv_path, batch_size=10, model="claude-3-7-sonnet-20250219", provider=PROVIDER_ANTHROPIC):
    """Classify texts as 'Human' or 'AI' with an LLM, using retrieved examples as few-shot context.

    The input is processed in batches. For every text in a batch, similar
    examples are fetched from ``vector_store``; all examples of the batch are
    sent as CSV together with the numbered texts, and the model's CSV reply is
    parsed back into labels.

    Args:
        vector_store: store exposing ``similarity_search(query=, k=, filter=)``.
        shots: number of examples retrieved per query.
        shotsPerLabel: when truthy, retrieve ``shots`` examples for label "AI"
            and ``shots`` for label "Human" per text; otherwise ``shots``
            examples regardless of label.
        df_input: DataFrame with at least the columns ``Text`` and ``ID``. It is
            copied, not modified; an existing ``Label`` column is ignored.
        output_csv_path: path of the tab-separated result file.
        batch_size: number of texts per request (must be >= 1).
        model: model name passed to the provider.
        provider: ``PROVIDER_ANTHROPIC`` selects the Anthropic client; any other
            value selects the Google client.

    Returns:
        DataFrame indexed by ``ID`` with a single ``Label`` column. A row holds
        ``'error'`` when every request attempt of its batch failed, and ``''``
        when the reply could not be parsed or did not mention that row.

    Raises:
        ValueError: ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    df = df_input.copy()
    df['Label'] = ''
    # Results are written by position so the function also works when df_input
    # does not carry the default 0..n-1 index.
    label_col = df.columns.get_loc('Label')
    
    system_prompt = """
    You are an expert at identifying AI-generated text versus human-written text.
    
    Analyze each numbered text sample carefully and classify it as either 'Human' or 'AI' based on these criteria:
    
    Human-written text often:
    - Contains personal anecdotes or emotional nuance
    - Has natural irregularities, varying sentence structures
    - May include idioms, slang, or colloquialisms 
    - Can have slight grammatical errors or typos
    - Often has a distinctive voice or style
    
    AI-generated text often:
    - Has more uniform sentence structures
    - Uses more formal or academic language consistently
    - Organizes information very systematically
    - Rarely contains spelling errors or typos
    - May have repetitive patterns or phrasing
    
    IMPORTANT: Return your analysis as a CSV format with two columns (ID,LABEL) where classification is ONLY 'Human' or 'AI'.
    Do not include any other text in your response besides the CSV data.
    Do not add a prefix such as csv.
    Example output format:
    ID,LABEL
    1,Human
    2,AI
    3,Human
    """
    
    # Process dataframe in batches
    num_samples = len(df)
    num_batches = (num_samples + batch_size - 1) // batch_size  # Ceiling division
    
    print(f"Processing {num_samples} text samples in {num_batches} batches of size {batch_size}...")
    
    for batch_idx in tqdm(range(num_batches)):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_samples)
        batch_len = end_idx - start_idx
        
        # Number the texts of the batch and gather the examples retrieved for each of them
        examples_df = []
        batch_text = ""
        for relative_idx, text in enumerate(df['Text'].iloc[start_idx:end_idx], start=1):
            batch_text += f"Text {relative_idx}: {text}\n\n"
            examples_df.extend(_collect_examples(vector_store, text, shots, shotsPerLabel))
        
        shots_text = pd.concat(examples_df).to_csv(index=False)

        # Prepare the user message
        user_message = f"First, I am providing some examples that may be helpful in classyfing texts as either 'Human' or 'AI', the examples are in csv and are the following:\n\n {shots_text}\n\n Please classify each of the following texts as either 'Human' or 'AI':\n\n{batch_text}\n\nReturn your analysis in CSV format with columns 'ID' and 'Label'."
        
        max_retries = 3
        retry_delay = 2
        
        for attempt in range(max_retries):
            try:
                csv_response = _request_csv(user_message, system_prompt, model, provider)
            except Exception as e:
                print(f"Error on attempt {attempt+1}: {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    df.iloc[start_idx:end_idx, label_col] = 'error'
                    print(f"Failed to classify batch after {max_retries} attempts.")
                continue

            # Only request failures are retried; an unparseable reply is reported
            # and the affected rows keep their empty label.
            try:
                labels, skipped = _parse_batch_labels(csv_response, batch_len)
                for position, label in labels.items():
                    df.iloc[start_idx + position - 1, label_col] = label
                if skipped or len(labels) < batch_len:
                    print(f"Warning: batch {batch_idx+1}: {skipped} unusable response row(s), {batch_len - len(labels)} text(s) left unlabelled")
            except Exception as e:
                print(f"Error parsing CSV response: {e}")
                print(f"Raw response: {csv_response}")

            break
        
        # Add a small delay between batches to respect rate limits
        time.sleep(1)
    
    df = df[['ID','Label']].set_index('ID')
    df.to_csv(output_csv_path, index=True, sep='\t')
    print(f"Classification complete. Results saved to {output_csv_path}")
    
    # Return summary statistics
    human_count = (df['Label'] == 'Human').sum()
    ai_count = (df['Label'] == 'AI').sum()
    other_count = len(df) - human_count - ai_count
    
    print(f"Summary:\n- Human texts: {human_count}\n- AI texts: {ai_count}\n- Other/errors: {other_count}")
    
    return df
    


    

### Utils de teste

In [12]:
import pandas as pd
# Labelled evaluation set (semicolon separated).
df_test = pd.read_csv('dataset2_disclosed_complete.csv', sep=';')
# Labels are compared as strings; the variant spelling 'Ai' is unified to 'AI'.
df_test['Label'] = df_test['Label'].astype(str).replace('Ai', "AI")

from sklearn.metrics import classification_report, confusion_matrix

def test_model(y_pred):
    """Print the classification report and the confusion matrix of ``y_pred``.

    Predictions are cast to strings and compared against ``df_test['Label']``.
    """
    predicted = y_pred.astype(str)
    expected = df_test['Label']
    for metric in (classification_report, confusion_matrix):
        print(metric(expected, predicted))

# Testes para escolher o melhor número de shots

## k = 1 , one shot

### K = 1 exemplos mais parecidos com cada row

In [None]:
# One example per text, retrieved regardless of label.
# Original author's note: .12 (presumably the API cost of this run — confirm unit)
test_claude_k1_m0 = classify_texts_with_shots(
    vector_store,
    shots=1,
    shotsPerLabel=False,
    df_input=df_test,
    output_csv_path="test_claude_k1_m0.csv",
    batch_size=10,
)
test_model(test_claude_k1_m0['Label'])

Processing 100 text samples in 10 batches of size 10...


100%|██████████| 10/10 [01:06<00:00,  6.68s/it]

Classification complete. Results saved to test_claude_k1_m0.csv
Summary:
- Human texts: 41
- AI texts: 59
- Other/errors: 0
              precision    recall  f1-score   support

          AI       0.80      0.96      0.87        49
       Human       0.95      0.76      0.85        51

    accuracy                           0.86       100
   macro avg       0.87      0.86      0.86       100
weighted avg       0.88      0.86      0.86       100

[[47  2]
 [12 39]]





### K=1 exemplos mais parecidos com cada row e label 

In [None]:
#def classify_texts_with_shots(vector_store,shots,shotsPerLabel,df_input, output_csv_path, batch_size=10, model="claude-3-7-sonnet-20250219", provider="anthropic"):

# One example per text and per label.
# Original author's note: .20 cents (presumably the API cost of this run — confirm unit)
test_claude_k1_m2 = classify_texts_with_shots(
    vector_store,
    shots=1,
    shotsPerLabel=True,
    df_input=df_test,
    output_csv_path="test_claude_k1_m2.csv",
    batch_size=10,
)
test_model(test_claude_k1_m2['Label'])

Processing 100 text samples in 10 batches of size 10...


100%|██████████| 10/10 [03:39<00:00, 21.96s/it]

Classification complete. Results saved to test_claude_k1_m2.csv
Summary:
- Human texts: 47
- AI texts: 53
- Other/errors: 0
              precision    recall  f1-score   support

          AI       0.87      0.94      0.90        49
       Human       0.94      0.86      0.90        51

    accuracy                           0.90       100
   macro avg       0.90      0.90      0.90       100
weighted avg       0.90      0.90      0.90       100

[[46  3]
 [ 7 44]]





## k = 3

### K=3 exemplos mais parecidos com cada row

In [None]:
# Three examples per text, retrieved regardless of label.
# Original author's note: 0.30 (presumably the API cost of this run — confirm unit)
test_claude_k3_m0 = classify_texts_with_shots(
    vector_store,
    shots=3,
    shotsPerLabel=False,
    df_input=df_test,
    output_csv_path="test_claude_k3_m0.csv",
    batch_size=5,
)
test_model(test_claude_k3_m0['Label'])

Processing 100 text samples in 20 batches of size 5...


100%|██████████| 20/20 [04:27<00:00, 13.36s/it]

Classification complete. Results saved to test_claude_k3_m0.csv
Summary:
- Human texts: 39
- AI texts: 61
- Other/errors: 0
              precision    recall  f1-score   support

          AI       0.79      0.98      0.87        49
       Human       0.97      0.75      0.84        51

    accuracy                           0.86       100
   macro avg       0.88      0.86      0.86       100
weighted avg       0.88      0.86      0.86       100

[[48  1]
 [13 38]]





### K=3 exemplos mais parecidos com cada row e label 

In [None]:
# Three examples per text and per label.
# Original author's notes: 60cents (presumably the API cost of this run — confirm unit);
# an alternative left commented out was model = "claude-3-haiku-20240307"
test_claude_k3_m2 = classify_texts_with_shots(
    vector_store,
    shots=3,
    shotsPerLabel=True,
    df_input=df_test,
    output_csv_path="test_claude_k3_m2.csv",
    batch_size=5,
)
test_model(test_claude_k3_m2['Label'])

Processing 100 text samples in 20 batches of size 5...


100%|██████████| 20/20 [07:15<00:00, 21.78s/it]

Classification complete. Results saved to test_claude_k3_m2.csv
Summary:
- Human texts: 42
- AI texts: 58
- Other/errors: 0
              precision    recall  f1-score   support

          AI       0.81      0.96      0.88        49
       Human       0.95      0.78      0.86        51

    accuracy                           0.87       100
   macro avg       0.88      0.87      0.87       100
weighted avg       0.88      0.87      0.87       100

[[47  2]
 [11 40]]





## k = 10

# Submissão final

k = 1, o exemplo mais parecido para cada row e label

In [29]:
import pandas as pd

# Texts of the final submission (semicolon separated).
df_input = pd.read_csv('submission3_inputs.csv', sep=';')

# Final run: one example per text and per label. The call is the cell's last
# expression, so the returned frame is displayed.
classify_texts_with_shots(
    vector_store,
    shots=1,
    shotsPerLabel=True,
    df_input=df_input,
    output_csv_path="submissao3-grupo008-s2.csv",
    batch_size=10,
)

Processing 100 text samples in 10 batches of size 10...


100%|██████████| 10/10 [02:27<00:00, 14.73s/it]

Classification complete. Results saved to submissao3-grupo008-s2.csv
Summary:
- Human texts: 42
- AI texts: 58
- Other/errors: 0





Unnamed: 0_level_0,Label
ID,Unnamed: 1_level_1
D3-1,AI
D3-2,AI
D3-3,AI
D3-4,Human
D3-5,Human
...,...
D3-96,AI
D3-97,Human
D3-98,AI
D3-99,Human
