# An√°lise Simples de Coment√°rios

Objetivo: Descobrir quais amenities os usu√°rios realmente mencionam e se falam bem ou mal delas.

In [None]:
import pandas as pd
import json
from anthropic import Anthropic
import os
import re

In [24]:
# Setup Claude API
from dotenv import load_dotenv
load_dotenv()  # Carrega .env file

client = Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY")
)

# Verificar se API key foi carregada
if not os.environ.get("ANTHROPIC_API_KEY"):
    print("‚ùå API Key n√£o encontrada!")
    print("Op√ß√µes:")
    print("1. Crie arquivo .env com: ANTHROPIC_API_KEY=sua-chave")
    print("2. Ou defina: export ANTHROPIC_API_KEY='sua-chave'")
    print("3. Ou cole diretamente no c√≥digo (menos seguro)")
else:
    print("‚úÖ API Key carregada com sucesso")

‚úÖ API Key carregada com sucesso


In [25]:
# Carregar dados - ESTRAT√âGIA OTIMIZADA
reviews = pd.read_csv('data/processed/reviews.csv')
print(f"Total de reviews: {len(reviews)}")

# Amostra grande mas process√°vel (respeitando limite de API)
sample_size = 5000  # Muito maior que antes
sample_reviews = reviews.sample(n=sample_size, random_state=42)
print(f"Amostra para an√°lise: {len(sample_reviews)} coment√°rios")
print(f"Estimativa de tempo: ~{sample_size/50:.1f} minutos (50 req/min limit)")

Total de reviews: 22376
Amostra para an√°lise: 5000 coment√°rios
Estimativa de tempo: ~100.0 minutos (50 req/min limit)


In [26]:
def clean_comment_for_api(comment):
    """Limpa e trunca coment√°rio para efici√™ncia m√°xima"""
    comment = re.sub(r'[\n\r\t]', ' ', str(comment))
    comment = comment.replace('"', "'")
    # Truncar para 100 caracteres (mais eficiente)
    return comment[:100]

def extract_amenities_batch(comments_batch):
    """Extrai amenities de M√öLTIPLOS coment√°rios em uma √∫nica chamada API"""
    
    # Preparar prompt com m√∫ltiplos coment√°rios
    comments_text = ""
    for i, comment in enumerate(comments_batch, 1):
        clean_comment = clean_comment_for_api(comment['comments'])
        comments_text += f"{i}. {clean_comment}\n"
    
    # Prompt ultra compacto para batch
    prompt = f"""Extract amenities from these comments:
{comments_text}

Return only valid JSON array:
[{{"comment":1,"amenity":"wifi","sentiment":"positive"}},{{"comment":1,"amenity":"kitchen","sentiment":"negative"}}]

Only extract physical amenities: wifi, kitchen, pool, AC, TV, washer, dryer, balcony, etc.
Ignore: location, host, apartment, building, neighborhood, price, service."""
    
    try:
        response = client.messages.create(
            model="claude-3-haiku-20240307",
            max_tokens=150,  # Aumentei um pouco para garantir resposta completa
            messages=[{"role": "user", "content": prompt}]
        )
        
        response_text = response.content[0].text.strip()
        
        # Tentar extrair JSON da resposta
        json_start = response_text.find('[')
        json_end = response_text.rfind(']') + 1
        
        if json_start != -1 and json_end > json_start:
            json_text = response_text[json_start:json_end]
            results = json.loads(json_text)
        else:
            print(f"N√£o foi poss√≠vel encontrar JSON v√°lido: {response_text[:100]}...")
            return []
        
        # Processar resultados
        processed_results = []
        for result in results:
            if isinstance(result, dict) and 'comment' in result and 'amenity' in result:
                comment_idx = result['comment'] - 1  # Converter para √≠ndice 0-based
                
                if 0 <= comment_idx < len(comments_batch):
                    original_comment = comments_batch[comment_idx]
                    
                    processed_results.append({
                        'item': result['amenity'],
                        'sentiment': result.get('sentiment', 'neutral'),
                        'listing_id': str(original_comment['listing_id']),
                        'review_id': str(original_comment['id'])
                    })
        
        return processed_results
        
    except json.JSONDecodeError as e:
        print(f"Erro JSON: {e}")
        print(f"Resposta recebida: {response_text[:200]}...")
        return []
    except Exception as e:
        print(f"Erro API: {e}")
        return []

# Teste com dados reais
print("Testando fun√ß√£o corrigida...")
test_batch = sample_reviews.head(2).to_dict('records')
test_result = extract_amenities_batch(test_batch)
print(f"Resultado: {len(test_result)} amenities extra√≠das")
for r in test_result:
    print(f"  - {r['item']}: {r['sentiment']}")

Testando fun√ß√£o corrigida...
Resultado: 1 amenities extra√≠das
  - kitchen: positive


In [27]:
# Processar coment√°rios em LOTES - VERS√ÉO OTIMIZADA
import time
from datetime import datetime

all_amenities = []
errors_count = 0
processed_count = 0
batch_size = 10  # 10 coment√°rios por requisi√ß√£o

# Converter para lista de dicion√°rios para batch processing
comments_list = sample_reviews.to_dict('records')

print(f"Iniciando processamento em lotes de {batch_size} coment√°rios...")
print(f"Total de lotes: {len(comments_list)//batch_size}")
print(f"Estimativa: ~{len(comments_list)//batch_size/50:.1f} minutos\n")

start_time = time.time()

# Processar em batches
for i in range(0, len(comments_list), batch_size):
    batch = comments_list[i:i+batch_size]
    batch_num = i//batch_size + 1
    
    print(f"Lote {batch_num}/{len(comments_list)//batch_size} ({len(batch)} coment√°rios)...", end=' ')
    
    try:
        # Timing preciso: 1.2 segundos entre requisi√ß√µes (50 req/min)
        if batch_num > 1:  # N√£o esperar no primeiro lote
            time.sleep(1.2)
        
        batch_amenities = extract_amenities_batch(batch)
        all_amenities.extend(batch_amenities)
        processed_count += len(batch)
        
        print(f"‚úÖ {len(batch_amenities)} amenities extra√≠das")
        
        # Checkpoint a cada 50 requisi√ß√µes (500 coment√°rios)
        if batch_num % 50 == 0:
            checkpoint_file = f'data/processed/checkpoint_batch_{batch_num}.json'
            with open(checkpoint_file, 'w', encoding='utf-8') as f:
                json.dump(all_amenities, f, ensure_ascii=False, indent=2)
            
            elapsed = time.time() - start_time
            rate = processed_count / elapsed * 60  # coment√°rios por minuto
            print(f"\nüìä Checkpoint {batch_num}: {len(all_amenities)} amenities | {rate:.0f} coment√°rios/min")
    
    except Exception as e:
        errors_count += 1
        print(f"‚ùå Erro: {e}")
        continue

# Estat√≠sticas finais
elapsed_time = time.time() - start_time
rate = processed_count / elapsed_time * 60

print(f"\n{'='*60}")
print(f"PROCESSAMENTO CONCLU√çDO!")
print(f"{'='*60}")
print(f"üìä Processados: {processed_count} coment√°rios em {elapsed_time:.1f}s")
print(f"‚ö° Taxa: {rate:.0f} coment√°rios/minuto")
print(f"üéØ Amenities extra√≠das: {len(all_amenities)}")
print(f"‚ùå Erros: {errors_count}")
print(f"‚úÖ Taxa de sucesso: {(1-errors_count/(len(comments_list)//batch_size))*100:.1f}%")

# Salvar resultados finais
try:
    with open('data/processed/amenities_from_comments.json', 'w', encoding='utf-8') as f:
        json.dump(all_amenities, f, ensure_ascii=False, indent=2)
    
    if all_amenities:
        df_amenities = pd.DataFrame(all_amenities)
        df_amenities.to_csv('data/processed/amenities_from_comments.csv', index=False, encoding='utf-8')
        print(f"üíæ Dados salvos: amenities_from_comments.csv ({len(df_amenities)} linhas)")
    
except Exception as e:
    print(f"‚ùå Erro ao salvar: {e}")

Iniciando processamento em lotes de 10 coment√°rios...
Total de lotes: 500
Estimativa: ~10.0 minutos

Lote 1/500 (10 coment√°rios)... ‚úÖ 2 amenities extra√≠das
Lote 2/500 (10 coment√°rios)... N√£o foi poss√≠vel encontrar JSON v√°lido: [
  {
    "comment": 1,
    "amenity": "apartment",
    "sentiment": "positive"
  },
  {
    "commen...
‚úÖ 0 amenities extra√≠das
Lote 3/500 (10 coment√°rios)... ‚úÖ 2 amenities extra√≠das
Lote 4/500 (10 coment√°rios)... ‚úÖ 1 amenities extra√≠das
Lote 5/500 (10 coment√°rios)... N√£o foi poss√≠vel encontrar JSON v√°lido: [
  {
    "comment": 1,
    "amenity": "restaurants",
    "sentiment": "positive"
  },
  {
    "comm...
‚úÖ 0 amenities extra√≠das
Lote 6/500 (10 coment√°rios)... ‚úÖ 5 amenities extra√≠das
Lote 7/500 (10 coment√°rios)... ‚úÖ 1 amenities extra√≠das
Lote 8/500 (10 coment√°rios)... ‚úÖ 4 amenities extra√≠das
Lote 9/500 (10 coment√°rios)... ‚úÖ 1 amenities extra√≠das
Lote 10/500 (10 coment√°rios)... N√£o foi poss√≠vel encontrar JSON v√°lid