### **1. Bibliotecas**

In [1]:
import os
import json
import numpy as np
import pandas as pd
from scipy import stats
from joblib import dump, load
from openai import OpenAI, AsyncOpenAI
import nest_asyncio
import asyncio
nest_asyncio.apply()
from tqdm.notebook import tqdm
tqdm.pandas()
from warnings import filterwarnings
filterwarnings('ignore')

---
### **2. Dataset**

In [2]:
url = os.getenv("URL_DATASET")
df = pd.read_excel(url)
df.head(3)

Unnamed: 0,Text,Text_PT,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Confusion(1-7),Urgency(1-7),CourseType,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,Interessante! Quantas vezes dizemos essas cois...,1,0,0,6.5,2.0,1.5,Education,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0
1,"What is \Algebra as a Math Game\"" or are you j...",O que é \ Álgebra como um jogo de matemática \...,0,1,0,4.0,5.0,3.5,Education,5207d0e9935dfc0e0000005e,Education/EDUC115N/How_to_Learn_Math,37D8FAEE7D0B94B6CFC57D98FD3D0BA5,2013-08-11 17:59:05,Comment,0.0,0.0,0.0,520663839df35b0a00000043,55.0
2,I like the idea of my kids principal who says ...,Gosto da ideia do diretor dos meus filhos que ...,1,0,0,5.5,3.0,2.5,Education,52052c82d01fec0a00000071,Education/EDUC115N/How_to_Learn_Math,CC11480215042B3EB6E5905EAB13B733,2013-08-09 17:53:06,Comment,0.0,0.0,0.0,51e59415e339d716000001a6,25.0


In [218]:
df.insert(6, column='Sentiment', value=df["Sentiment(1-7)"].apply(lambda x: 0 if x < 4 else 1))
df.insert(8, column='Confusion', value=df["Confusion(1-7)"].apply(lambda x: 0 if x < 4 else 1))
df.insert(10, column='Urgency', value=df["Urgency(1-7)"].apply(lambda x: 0 if x < 4 else 1))
df.head(2)

Unnamed: 0,Text,Text_PT,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Sentiment,Confusion(1-7),Confusion,Urgency(1-7),...,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,Interessante! Quantas vezes dizemos essas cois...,1,0,0,6.5,1,2.0,0,1.5,...,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0
1,"What is \Algebra as a Math Game\"" or are you j...",O que é \ Álgebra como um jogo de matemática \...,0,1,0,4.0,1,5.0,1,3.5,...,5207d0e9935dfc0e0000005e,Education/EDUC115N/How_to_Learn_Math,37D8FAEE7D0B94B6CFC57D98FD3D0BA5,2013-08-11 17:59:05,Comment,0.0,0.0,0.0,520663839df35b0a00000043,55.0


In [219]:
dump(df, "data/df_stanford.z")

['data/df_stanford.z']

---
### **3. Dataframe salvo**

In [2]:
df = load('data/df_stanford.z')
df.head(2)

Unnamed: 0,Text,Text_PT,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Sentiment,Confusion(1-7),Confusion,Urgency(1-7),...,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,Interessante! Quantas vezes dizemos essas cois...,1,0,0,6.5,1,2.0,0,1.5,...,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0
1,"What is \Algebra as a Math Game\"" or are you j...",O que é \ Álgebra como um jogo de matemática \...,0,1,0,4.0,1,5.0,1,3.5,...,5207d0e9935dfc0e0000005e,Education/EDUC115N/How_to_Learn_Math,37D8FAEE7D0B94B6CFC57D98FD3D0BA5,2013-08-11 17:59:05,Comment,0.0,0.0,0.0,520663839df35b0a00000043,55.0


---
### **4. Rótulos de Sentimentos**

In [4]:
np.sort(df['Sentiment(1-7)'].unique())

array([1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ])

---
### **5. Rotulando um post com GPT**

In [5]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def gpt_label(text):
    try:
        response = client.chat.completions.create(
            # model="gpt-3.5-turbo",
            # model="gpt-4o",
            model="gpt-4o-mini",
            max_tokens=3,
            store=True,
            messages=[
                {
                    "role": "user",
                    "content": f"You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 7 being very positive sentiment. Answer only one number and label the following text: {text}",
                },
            ],
        )
        return response.choices[0].message
    except Exception as e:
        print(f'Erro ao rotular: {str(e)}')
        return 0

In [6]:
indice_escolhido = 0
rotulo = float(gpt_label(df["Text"][indice_escolhido]).content)
print(f'Rótulo escolhido pelo GPT: {rotulo}')
print('-'*50)
print('Post:')
print(df[["Text", "Sentiment(1-7)"]].iloc[indice_escolhido]['Text'])
print('Rótulo real: ', df[["Text", "Sentiment(1-7)"]].iloc[indice_escolhido]["Sentiment(1-7)"])

Rótulo escolhido pelo GPT: 6.5
--------------------------------------------------
Post:
Interesting! How often we say those things to others without really understanding what we are saying. That must have been a powerful experience! Excellent!
Rótulo real:  6.5


In [8]:
df[["Text", "Sentiment(1-7)"]]

Unnamed: 0,Text,Sentiment(1-7)
0,Interesting! How often we say those things to ...,6.5
1,"What is \Algebra as a Math Game\"" or are you j...",4.0
2,I like the idea of my kids principal who says ...,5.5
3,"From their responses, it seems the students re...",6.0
4,"The boys loved math, because \there is freedom...",7.0
...,...,...
29599,The p value tells us the probability of observ...,4.0
29600,given the null hypothesis is considered true,4.0
29601,"> Hello Josh,_x0007__x0007_Is this hypothesis ...",4.0
29602,"Hi Josh,_x0007__x0007__x0007__x0007_Looking at...",3.5


---
### **6. Rotulando dataset completo com GPT**

In [9]:
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

async def gpt_label_async(text):
    try:
        response = await client.chat.completions.create(    
            # model="gpt-3.5-turbo",
            # model="gpt-4o",
            model="gpt-4o-mini",
            max_tokens=3,
            store=True,
            messages=[
                {
                    "role": "user",
                    "content": f"You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 7 being very positive sentiment. Answer only one number and label the following text: {text}",
                },
            ],
        )
        return response.choices[0].message
    except Exception as e:
        print(f'Erro ao rotular: {str(e)}')
        return 0

In [None]:
intervalo = range(0, 10)
rotulos = df["Text"][intervalo].progress_apply(lambda x: asyncio.run(gpt_label_async(x)))
rotulos = rotulos.map(lambda x: float(x.content))

  0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
# Realizar o teste t de Student
t_stat, p_value = stats.ttest_ind(df["Sentiment(1-7)"][intervalo], rotulos)

print(f"Estatística t: {t_stat}")
print(f"Valor p: {p_value}")

# Verificar a significância
print(f"{'Há' if p_value < 0.05 else 'Não há'} uma diferença significativa entre os rótulos reais e preditos.")

Estatística t: 0.30738931174713624
Valor p: 0.7620751630156286
Não há uma diferença significativa entre os rótulos reais e preditos.


In [None]:
dump(rotulos, 'data/rotulos.z')

### **7. Rotulando dataset completo com Batch API GPT**

In [68]:
# criando lista para arquivo jsonl com os prompts para cada post
prompts = []
def create_jsonl(idx, text):
    prompts.append(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {
                        "role": "user",
                        "content": f"You are a data scientist who labels texts regarding sentiment with one of the options [1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ], with 1 being very negative sentiment and up to 7 being very positive sentiment. Answer only one number and label the following text: {text}",
                    },
                ],
                "max_tokens": 3,
            },
        }
    )

# intervalo, arquivo, resultado = range(0, 10000), "data/prompts1.jsonl", "data/resultados1.jsonl"
# intervalo, arquivo, resultado = range(10000, 15000), "data/prompts2.jsonl", "data/resultados2.jsonl"
# intervalo, arquivo, resultado = range(15000, 20000), "data/prompts3.jsonl", "data/resultados3.jsonl"
# intervalo, arquivo, resultado = range(20000, 25000), "data/prompts4.jsonl", "data/resultados4.jsonl"
intervalo, arquivo, resultado = range(25000, 29604), "data/prompts5.jsonl", "data/resultados5.jsonl"

for idx, text in zip(intervalo,df['Text'][intervalo]):
    create_jsonl(idx, text)

In [69]:
# salvando arquivo jsonl com os prompts
with open(arquivo, "w", encoding="utf-8") as file:
    for entry in prompts:
        file.write(json.dumps(entry) + "\n")

In [70]:
# enviando arquivo jsonl para API
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
batch_input_file = client.files.create(
    file=open(arquivo, "rb"), purpose="batch"
)

batch_input_file.to_dict()

{'id': 'file-WcCu1nXH188WeTka4koWzf',
 'bytes': 3370173,
 'created_at': 1743988283,
 'filename': 'prompts5.jsonl',
 'object': 'file',
 'purpose': 'batch',
 'status': 'processed',
 'expires_at': None,
 'status_details': None}

In [71]:
# criando o lote do arquivo jsonl
batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": f"gpt label dataset {arquivo}"},
)

batch.to_dict()

{'id': 'batch_67f3264d34388190ab673981627e0dd4',
 'completion_window': '24h',
 'created_at': 1743988301,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-WcCu1nXH188WeTka4koWzf',
 'object': 'batch',
 'status': 'validating',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': None,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1744074701,
 'failed_at': None,
 'finalizing_at': None,
 'in_progress_at': None,
 'metadata': {'description': 'gpt label dataset data/prompts5.jsonl'},
 'output_file_id': None,
 'request_counts': {'completed': 0, 'failed': 0, 'total': 0}}

In [176]:
# listar último lote
client.batches.list(limit=1).to_dict()['data'][0]

{'id': 'batch_67f3264d34388190ab673981627e0dd4',
 'completion_window': '24h',
 'created_at': 1743988301,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-WcCu1nXH188WeTka4koWzf',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1743989458,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1744074701,
 'failed_at': None,
 'finalizing_at': 1743989172,
 'in_progress_at': 1743988303,
 'metadata': {'description': 'gpt label dataset data/prompts5.jsonl'},
 'output_file_id': 'file-9nPLL1DtMhHZAaDGDLA2uC',
 'request_counts': {'completed': 4604, 'failed': 0, 'total': 4604}}

In [82]:
# verificar situação do lote - outra opção se souber o id do lote
client.batches.retrieve(batch.to_dict()["id"]).to_dict()

{'id': 'batch_67f3264d34388190ab673981627e0dd4',
 'completion_window': '24h',
 'created_at': 1743988301,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-WcCu1nXH188WeTka4koWzf',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1743989458,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1744074701,
 'failed_at': None,
 'finalizing_at': 1743989172,
 'in_progress_at': 1743988303,
 'metadata': {'description': 'gpt label dataset data/prompts5.jsonl'},
 'output_file_id': 'file-9nPLL1DtMhHZAaDGDLA2uC',
 'request_counts': {'completed': 4604, 'failed': 0, 'total': 4604}}

In [None]:
# pegar resultado do lote
output_file_id = client.batches.retrieve(batch.to_dict()["id"]).to_dict()["output_file_id"]

file_response = client.files.content(output_file_id)
file_response.write_to_file(resultado)

### **8. Tratamento dos resultados**

In [3]:
def get_labels(caminho_arquivo):        
    dados = []
    with open(caminho_arquivo, "r", encoding="utf-8") as file:
        for linha in file:
            dados.append(json.loads(linha))

    rotulos = []
    for i in range(len(dados)):
        try:
            rotulos.append(float(dados[i]["response"]["body"]["choices"][0]["message"]["content"]))           
        except ValueError:
            rotulos.append(4.0)

    return rotulos

In [9]:
r1 = get_labels('data/resultados1.jsonl')
r2 = get_labels('data/resultados2.jsonl')
r3 = get_labels('data/resultados3.jsonl')
r4 = get_labels('data/resultados4.jsonl')
r5 = get_labels('data/resultados5.jsonl')

# labels = pd.Series(r1 + r2 + r3 + r4 + r5).apply(lambda x: 0 if x < 4 else 1)
labels = pd.Series(r1 + r2 + r3 + r4 + r5)
df.insert(6, column="Sentiment(1-7)GPT", value=labels)
df.head(1)

Unnamed: 0,Text,Text_PT,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Sentiment(1-7)GPT,Sentiment,Confusion(1-7),Confusion,...,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,Interessante! Quantas vezes dizemos essas cois...,1,0,0,6.5,6.5,1,2.0,0,...,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0


In [10]:
df.columns

Index(['Text', 'Text_PT', 'Opinion(1/0)', 'Question(1/0)', 'Answer(1/0)',
       'Sentiment(1-7)', 'Sentiment(1-7)GPT', 'Sentiment', 'Confusion(1-7)',
       'Confusion', 'Urgency(1-7)', 'Urgency', 'CourseType', 'forum_post_id',
       'course_display_name', 'forum_uid', 'created_at', 'post_type',
       'anonymous', 'anonymous_to_peers', 'up_count', 'comment_thread_id',
       'reads'],
      dtype='object')

In [47]:
df

Unnamed: 0,Text,Text_PT,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Sentiment(1-7)GPT,Sentiment,Confusion(1-7),Confusion,...,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,Interessante! Quantas vezes dizemos essas cois...,1,0,0,6.5,6.5,1,2.0,0,...,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0
1,"What is \Algebra as a Math Game\"" or are you j...",O que é \ Álgebra como um jogo de matemática \...,0,1,0,4.0,3.5,1,5.0,1,...,5207d0e9935dfc0e0000005e,Education/EDUC115N/How_to_Learn_Math,37D8FAEE7D0B94B6CFC57D98FD3D0BA5,2013-08-11 17:59:05,Comment,0.0,0.0,0.0,520663839df35b0a00000043,55.0
2,I like the idea of my kids principal who says ...,Gosto da ideia do diretor dos meus filhos que ...,1,0,0,5.5,5.5,1,3.0,0,...,52052c82d01fec0a00000071,Education/EDUC115N/How_to_Learn_Math,CC11480215042B3EB6E5905EAB13B733,2013-08-09 17:53:06,Comment,0.0,0.0,0.0,51e59415e339d716000001a6,25.0
3,"From their responses, it seems the students re...","A partir de suas respostas, parece que os alun...",1,0,0,6.0,6.5,1,3.0,0,...,5240a45e067ebf1200000008,Education/EDUC115N/How_to_Learn_Math,C717F838D10E8256D7C88B33C43623F1,2013-09-23 20:28:14,CommentThread,0.0,0.0,0.0,,0.0
4,"The boys loved math, because \there is freedom...","Os meninos adoraram matemática, porque há libe...",1,0,0,7.0,6.5,1,2.0,0,...,5212c5e2dd10251500000062,Education/EDUC115N/How_to_Learn_Math,F83887D68EA48964687C6441782CDD0E,2013-08-20 01:26:58,CommentThread,0.0,0.0,0.0,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29599,The p value tells us the probability of observ...,O valor de P nos diz a probabilidade de observ...,0,0,1,4.0,3.5,1,3.5,0,...,53e44042bf0e2c074e000039,Medicine/MedStats/Summer2014,83C5EC9DD9319435989AB52FA7E580BC,2014-08-08 03:13:06,Comment,0.0,0.0,0.0,53e4193fbce97d56a9000026,144.0
29600,given the null hypothesis is considered true,dada a hipótese nula é considerada verdadeira,0,0,1,4.0,3.5,1,3.5,0,...,53e442dfbf0e2c8d66000034,Medicine/MedStats/Summer2014,83C5EC9DD9319435989AB52FA7E580BC,2014-08-08 03:24:15,Comment,0.0,0.0,0.0,53e4193fbce97d56a9000026,144.0
29601,"> Hello Josh,_x0007__x0007_Is this hypothesis ...","> Olá Josh, _x0007__x0007_ é esta formulação d...",0,1,0,4.0,5.5,1,5.0,1,...,53e447cbbce97d56a9000032,Medicine/MedStats/Summer2014,83C5EC9DD9319435989AB52FA7E580BC,2014-08-08 03:45:15,Comment,0.0,0.0,0.0,53e4193fbce97d56a9000026,144.0
29602,"Hi Josh,_x0007__x0007__x0007__x0007_Looking at...","Oi Josh, _x0007__x0007__X0007__X0007_S, Lookin...",0,1,0,3.5,3.5,0,5.0,1,...,53e46e1cbce97d5d4300003c,Medicine/MedStats/Summer2014,673E487F9CE5343B8F32E7C7D49B6098,2014-08-08 06:28:44,Comment,0.0,0.0,0.0,53dfe280a8638d3f7a00002f,203.0


In [11]:
# Realizar o teste t de Student
t_stat, p_value = stats.ttest_ind(df["Sentiment(1-7)"], df["Sentiment(1-7)GPT"])

print(f"Estatística t: {t_stat}")
print(f"Valor p: {p_value}")

# Verificar a significância
print(
    f"{'Há' if p_value < 0.05 else 'Não há'} uma diferença significativa entre os rótulos reais e preditos."
)

Estatística t: -16.569212642279556
Valor p: 1.6014792354202478e-61
Há uma diferença significativa entre os rótulos reais e preditos.


In [22]:
# df['Sentiment(1-7)'].value_counts().sort_index().plot(kind='bar', figsize=(12, 6), title='Distribuição de rótulos reais')
df['Sentiment(1-7)'].value_counts().sort_index()

Sentiment(1-7)
1.0       14
1.5       46
2.0       97
2.5      353
3.0     1165
3.5     2712
4.0    16303
4.5     4261
5.0     2391
5.5     1184
6.0      686
6.5      310
7.0       82
Name: count, dtype: int64

In [31]:
df[df['Sentiment(1-7)GPT'] == 5.2]['Sentiment(1-7)GPT']

15628    5.2
Name: Sentiment(1-7)GPT, dtype: float64

In [45]:
df.iloc[15628]['Sentiment(1-7)GPT']

5.2

In [None]:
df["Sentiment(1-7)GPT"].value_counts().sort_index() #ola

Sentiment(1-7)GPT
0.5       1
1.0     229
1.5     302
2.0      72
2.5    4230
3.0     279
3.5    7503
4.0     923
4.5    4973
5.0    1713
5.2       1
5.5    6168
5.6       1
6.0     101
6.5    3093
7.0      15
Name: count, dtype: int64