### **1. Bibliotecas**

In [1]:
import os
import json
import numpy as np
import pandas as pd
from scipy import stats
from joblib import dump, load
from openai import OpenAI, AsyncOpenAI
import nest_asyncio
import asyncio
nest_asyncio.apply()
from tqdm.notebook import tqdm
tqdm.pandas()
from warnings import filterwarnings
filterwarnings('ignore')

---
### **2. Dataset**

In [2]:
url = os.getenv("URL_DATASET")
df = pd.read_excel(url)
df.head(3)

Unnamed: 0,Text,Text_PT,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Confusion(1-7),Urgency(1-7),CourseType,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,Interessante! Quantas vezes dizemos essas cois...,1,0,0,6.5,2.0,1.5,Education,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0
1,"What is \Algebra as a Math Game\"" or are you j...",O que é \ Álgebra como um jogo de matemática \...,0,1,0,4.0,5.0,3.5,Education,5207d0e9935dfc0e0000005e,Education/EDUC115N/How_to_Learn_Math,37D8FAEE7D0B94B6CFC57D98FD3D0BA5,2013-08-11 17:59:05,Comment,0.0,0.0,0.0,520663839df35b0a00000043,55.0
2,I like the idea of my kids principal who says ...,Gosto da ideia do diretor dos meus filhos que ...,1,0,0,5.5,3.0,2.5,Education,52052c82d01fec0a00000071,Education/EDUC115N/How_to_Learn_Math,CC11480215042B3EB6E5905EAB13B733,2013-08-09 17:53:06,Comment,0.0,0.0,0.0,51e59415e339d716000001a6,25.0


In [9]:
dump(df, "data/df_stanford.z")

['data/df_stanford.z']

---
### **3. Dataframe salvo**

In [2]:
df = load('data/df_stanford.z')
df.head(2)

Unnamed: 0,Text,Text_PT,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Confusion(1-7),Urgency(1-7),CourseType,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,Interessante! Quantas vezes dizemos essas cois...,1,0,0,6.5,2.0,1.5,Education,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0
1,"What is \Algebra as a Math Game\"" or are you j...",O que é \ Álgebra como um jogo de matemática \...,0,1,0,4.0,5.0,3.5,Education,5207d0e9935dfc0e0000005e,Education/EDUC115N/How_to_Learn_Math,37D8FAEE7D0B94B6CFC57D98FD3D0BA5,2013-08-11 17:59:05,Comment,0.0,0.0,0.0,520663839df35b0a00000043,55.0


---
### **4. Rótulos de Sentimentos**

In [4]:
np.sort(df['Sentiment(1-7)'].unique())

array([1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ])

---
### **5. Rotulando um post com GPT**

In [5]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def gpt_label(text):
    try:
        response = client.chat.completions.create(
            # model="gpt-3.5-turbo",
            # model="gpt-4o",
            model="gpt-4o-mini",
            max_tokens=3,
            store=True,
            messages=[
                {
                    "role": "user",
                    "content": f"You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 7 being very positive sentiment. Answer only one number and label the following text: {text}",
                },
            ],
        )
        return response.choices[0].message
    except Exception as e:
        print(f'Erro ao rotular: {str(e)}')
        return 0

In [6]:
indice_escolhido = 0
rotulo = float(gpt_label(df["Text"][indice_escolhido]).content)
print(f'Rótulo escolhido pelo GPT: {rotulo}')
print('-'*50)
print('Post:')
print(df[["Text", "Sentiment(1-7)"]].iloc[indice_escolhido]['Text'])
print('Rótulo real: ', df[["Text", "Sentiment(1-7)"]].iloc[indice_escolhido]["Sentiment(1-7)"])

Rótulo escolhido pelo GPT: 6.5
--------------------------------------------------
Post:
Interesting! How often we say those things to others without really understanding what we are saying. That must have been a powerful experience! Excellent!
Rótulo real:  6.5


In [8]:
df[["Text", "Sentiment(1-7)"]]

Unnamed: 0,Text,Sentiment(1-7)
0,Interesting! How often we say those things to ...,6.5
1,"What is \Algebra as a Math Game\"" or are you j...",4.0
2,I like the idea of my kids principal who says ...,5.5
3,"From their responses, it seems the students re...",6.0
4,"The boys loved math, because \there is freedom...",7.0
...,...,...
29599,The p value tells us the probability of observ...,4.0
29600,given the null hypothesis is considered true,4.0
29601,"> Hello Josh,_x0007__x0007_Is this hypothesis ...",4.0
29602,"Hi Josh,_x0007__x0007__x0007__x0007_Looking at...",3.5


---
### **6. Rotulando dataset completo com GPT**

In [9]:
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

async def gpt_label_async(text):
    try:
        response = await client.chat.completions.create(    
            # model="gpt-3.5-turbo",
            # model="gpt-4o",
            model="gpt-4o-mini",
            max_tokens=3,
            store=True,
            messages=[
                {
                    "role": "user",
                    "content": f"You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 7 being very positive sentiment. Answer only one number and label the following text: {text}",
                },
            ],
        )
        return response.choices[0].message
    except Exception as e:
        print(f'Erro ao rotular: {str(e)}')
        return 0

In [None]:
intervalo = range(0, 10)
rotulos = df["Text"][intervalo].progress_apply(lambda x: asyncio.run(gpt_label_async(x)))
rotulos = rotulos.map(lambda x: float(x.content))

  0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
# Realizar o teste t de Student
t_stat, p_value = stats.ttest_ind(df["Sentiment(1-7)"][intervalo], rotulos)

print(f"Estatística t: {t_stat}")
print(f"Valor p: {p_value}")

# Verificar a significância
print(f"{'Há' if p_value < 0.05 else 'Não há'} uma diferença significativa entre os rótulos reais e preditos.")

Estatística t: 0.30738931174713624
Valor p: 0.7620751630156286
Não há uma diferença significativa entre os rótulos reais e preditos.


In [None]:
dump(rotulos, 'data/rotulos.z')

### **7. Rotulando dataset completo com Batch API GPT**

In [3]:
# criando lista para arquivo jsonl com os prompts para cada post
prompts = []
def create_jsonl(idx, text):
    prompts.append(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {
                        "role": "user",
                        "content": f"You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 7 being very positive sentiment. Answer only one number and label the following text: {text}",
                    },
                ],
                "max_tokens": 3,
            },
        }
    )

intervalo, arquivo = range(0, 10000), "data/prompts_intervalo1.jsonl"
# intervalo, arquivo = range(10000, 15000), "data/prompts_intervalo2.jsonl"
# intervalo, arquivo = range(15000, 20000), "data/prompts_intervalo3.jsonl"
# intervalo, arquivo = range(20000, 25000), "data/prompts_intervalo4.jsonl"
# intervalo, arquivo = range(25000, 29604), "data/prompts_intervalo5.jsonl"

for idx, text in zip(intervalo,df['Text'][intervalo]):
    create_jsonl(idx, text)

In [4]:
prompts

[{'custom_id': 'request-0',
  'method': 'POST',
  'url': '/v1/chat/completions',
  'body': {'model': 'gpt-4o-mini',
   'messages': [{'role': 'user',
     'content': 'You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 7 being very positive sentiment. Answer only one number and label the following text: Interesting! How often we say those things to others without really understanding what we are saying. That must have been a powerful experience! Excellent!'}],
   'max_tokens': 3}},
 {'custom_id': 'request-1',
  'method': 'POST',
  'url': '/v1/chat/completions',
  'body': {'model': 'gpt-4o-mini',
   'messages': [{'role': 'user',
     'content': 'You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 

In [5]:
# salvando arquivo jsonl com os prompts
with open(arquivo, "w", encoding="utf-8") as file:
    for entry in prompts:
        file.write(json.dumps(entry) + "\n")

In [6]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

batch_input_file = client.files.create(
    file=open(arquivo, "rb"), purpose="batch"
)

print(batch_input_file)

FileObject(id='file-LdFKkf22LYB35fE4diDoYq', bytes=8381566, created_at=1743812099, filename='prompts_intervalo1.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


In [7]:
batch_input_file_id = batch_input_file.id
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": f"gpt label dataset {arquivo}"},
)

Batch(id='batch_67f0760703908190bfda1a0792bb77b8', completion_window='24h', created_at=1743812103, endpoint='/v1/chat/completions', input_file_id='file-LdFKkf22LYB35fE4diDoYq', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1743898503, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'gpt label dataset data/prompts_intervalo1.jsonl'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))