## Estimate Cost

In [84]:
import openai
import pandas as pd
import tiktoken

In [85]:
data = pd.read_csv("clean.csv")

In [95]:
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(encoding.encode("tiktoken is great!"))

[83, 1609, 5963, 374, 2294, 0]


In [96]:
def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [97]:
data['token_count'] = data.text.apply(lambda x: num_tokens_from_string(x))

In [98]:
token_count = data['token_count'].sum()
token_count

56630

In [99]:
# Pricing
standard_cost_per_1000_tokens = 0.00500
batch_api_cost_per_1000_tokens = 0.00250

# Calculate costs
standard_cost = (token_count / 1000) * standard_cost_per_1000_tokens
batch_api_cost = (token_count / 1000) * batch_api_cost_per_1000_tokens

print(f"Standard API Cost: ${standard_cost:.4f} USD")
print(f"Batch API Cost: ${batch_api_cost:.4f} USD")


Standard API Cost: $0.2832 USD
Batch API Cost: $0.1416 USD


In [100]:
from openai import OpenAI
from dotenv import load_dotenv
import json
import time
import os
load_dotenv()
client = OpenAI()

In [274]:
tasks = []
system_prompt = "You are an AI language model trained to analyze and detect the sentiment of tweet."

for idx in range(len(data)):
    row = data.iloc[idx]
    text = row.text
    task = {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4-turbo",
            "temperature": 0,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": f"Analyze the following tweet and determine if the sentiment is: positive, negative or neutral. Return only a single word, either POSITIVE, NEGATIVE or NEUTRAL: {text}"
                }
            ],
        }
    }

    tasks.append(task)

In [275]:
tasks[0]

{'custom_id': 'task-0',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-3.5-turbo-0125',
  'temperature': 0.1,
  'messages': [{'role': 'system',
    'content': 'You are an AI language model trained to analyze and detect the sentiment of tweet.'},
   {'role': 'user',
    'content': 'Analyze the following tweet and determine if the sentiment is: positive, negative or neutral. Return only a single word, either POSITIVE, NEGATIVE or NEUTRAL: يارب لقد تجَبّرُوا؛ فأرهم جبروتك.. يارب لقد تجَبّرُوا؛ فأرهم جبروتك.. يارب لقد تجَبّرُوا؛ فأرهم جبروتك.. يارب لقد تجَبّرُوا؛ فأرهم جبروتك.. يارب لقد تجَبّرُوا؛ فأرهم جبروتك.. يارب لقد تجَبّرُوا؛ فأرهم جبروتك.. يارب لقد تجَبّرُوا؛ فأرهم جبروتك..'}]}}

In [368]:
len(tasks)

500

In [370]:
file_name = "batch_tasks.jsonl"
with open(file_name, 'w') as file:
    for task in tasks:
        file.write(json.dumps(task) + "\n")

In [299]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)

In [317]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [350]:
import time 

batch_job = client.batches.retrieve(batch_job.id)
batch_job.status

while batch_job.status not in ["completed","failed","expired"]:
    batch_job = client.batches.retrieve(batch_job.id)
    print(batch_job.status)
    time.sleep(30)
else:
    successed_precentage = (batch_job.request_counts.completed + batch_job.request_counts.failed) / batch_job.request_counts.total
    print(successed_precentage)

'completed'

In [355]:
result_file_id = batch_job.output_file_id
results = client.files.content(result_file_id).text
results = results.split("\n")

In [373]:
data_results = {"id":[], "sentiment":[]}
for result in results:
    json_result = json.loads(result)
    tweet_id = json_result['custom_id']
    sentiment = json_result['response']['body']['choices'][0]['message']['content'].lower()
    data_results['id'].append(tweet_id)
    data_results['sentiment'].append(sentiment)
df = pd.DataFrame(data_results, orient='index')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [382]:
data_results['id'][1] = 1824046452932329533

In [383]:
df = pd.DataFrame(data_results)

In [384]:
df

Unnamed: 0,id,sentiment
0,1824058774505603421,negative
1,1824046452932329533,positive


In [379]:
test_df = pd.read_csv("../test.csv")
test_df

Unnamed: 0.1,Unnamed: 0,url,twitterUrl,id,text,retweetCount,replyCount,likeCount,quoteCount,createdAt,bookmarkCount,isRetweet,isQuote
0,0,https://x.com/sherinhelal555/status/1824058774...,https://twitter.com/sherinhelal555/status/1824...,1824058774505603421,أسئلة مشروعة: ليه متجمعين عند البريد؟ ليه مسمو...,60,25,143,8,Thu Aug 15 12:21:15 +0000 2024,4,False,False
1,2,https://x.com/ammaralihassan/status/1824046452...,https://twitter.com/ammaralihassan/status/1824...,1824046452932329533,البروفسور ميرشايمر صاحب رؤية عمرها ربع قرن تبي...,82,7,243,1,Thu Aug 15 11:32:18 +0000 2024,20,False,False
2,3,https://x.com/EmaarW/status/1824043207702814942,https://twitter.com/EmaarW/status/182404320770...,1824043207702814942,تتجه لتدريس اكثر من ٥ علوم حديثه واضافت اللغه ...,79,62,769,4,Thu Aug 15 11:19:24 +0000 2024,13,False,False


In [386]:
pd.merge(test_df, df, on='id')

Unnamed: 0.1,Unnamed: 0,url,twitterUrl,id,text,retweetCount,replyCount,likeCount,quoteCount,createdAt,bookmarkCount,isRetweet,isQuote,sentiment
0,0,https://x.com/sherinhelal555/status/1824058774...,https://twitter.com/sherinhelal555/status/1824...,1824058774505603421,أسئلة مشروعة: ليه متجمعين عند البريد؟ ليه مسمو...,60,25,143,8,Thu Aug 15 12:21:15 +0000 2024,4,False,False,negative
1,2,https://x.com/ammaralihassan/status/1824046452...,https://twitter.com/ammaralihassan/status/1824...,1824046452932329533,البروفسور ميرشايمر صاحب رؤية عمرها ربع قرن تبي...,82,7,243,1,Thu Aug 15 11:32:18 +0000 2024,20,False,False,positive
