# Theme classification using LLMs

Previous release of the tool used a ML approach to classify sentences according a type of musical meetup theme.
In this realease we take advantage of Generative AI and LLM tools and follow a zero-classification approach to perform the classification task.

We use the OpenAI API, and follow these steps:
- Design a prompt
    - Provide context
    - Describe the expected output
    - Describe the expected results


The results are returned in JSON format

Results are then processed and stored in CSV format

In [1]:
import os
import pandas as pd
import re
import numpy as np
import sys
import json
import pandas as pd
# import config
import time
from unidecode import unidecode
import tiktoken

In [2]:
# !pip install openai
# !pip install unidecode
# !pip install tiktoken

In [3]:
# LLM configuration using OpenAI
# import openai
# new
from openai import OpenAI
# Zero TEMPERATURE value to obtain the best answer
DEFAULT_TEMPERATURE = 0
DEFAULT_MAX_TOKENS = 500
DEFAULT_MODEL = "gpt-3.5-turbo"

In [4]:
# GETOPENAIKEY
def getOpenaiKey():
    return "OPENAI-KEY"

# openai.api_key = getOpenaiKey()
openai = OpenAI(
  api_key=getOpenaiKey()
)

In [5]:
# send prompt using API
def send_prompt(messages):
    resp = openai.chat.completions.create(
        messages=messages,
        model=DEFAULT_MODEL,
        temperature=DEFAULT_TEMPERATURE,
        max_tokens=DEFAULT_MAX_TOKENS
    )
    # answer = resp.choices[0].message["content"]
    answer = resp.choices[0].message.content
    # print(resp.choices[0].message.content)
    # answer = resp.choices[0].text
    print(dict(resp).get('usage'))
    return answer

def send_prompt_ranking(message):
    system_msg = """You are a knowledge classification system that annotates sentences according to their main topic.
            Respond in json format using the following keys: thm_type_1, thm_type_2, thm_explanation_1 and thm_explanation_2.
            The value for thm_type_1 is the first most probable topic, use only one of the following topics: ['Music making', 'Business meeting', 'Personal life', 'Coincidence', 'Public celebration', 'Education' ].
            The value for thm_type_2 is the second most probable topic, unse only one of the following keys: ['Music making', 'Business meeting', 'Personal life', 'Coincidence', 'Public celebration', 'Education' ].
            The value for thm_explanation_1 should be a short explanation for the topic in thm_type_1. Less than 100 characters.
            The value for thm_explanation_2 should be a short explanation for the topic in thm_type_2. Less than 100 characters.
            """
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": f"Sentence: {message}"},
    ]
    return send_prompt(messages)

def getThemeByRanking(sent):
    try:
        answer = send_prompt_ranking(snt)
        data = json.loads(answer)
        print(data)
        results_list.append(data)
    except Exception as e:
        print(e, file=sys.stdout)
        # raise
    finally:
        count +=1
        if (count % 20) == 0:
            print("waiting...")
            time.sleep(30)

    res_temp_df = pd.json_normalize(results_list)
    print(res_temp_df.info())
    df = results_df.join(res_temp_df)
    print(df.head())
    df.to_csv('results-theme-ranking/'+file, index=False)
    
def sentenceFormating(text):
    sent_string = re.sub(r'([.?!])\1+', r'\1',re.sub(r'\.{3,}', '...',unidecode(str(text).strip())))
    return text

In [6]:
def writeJSONFile(data,fileName):
    # You can also store the data in a file, for example, as a JSON file
    with open(fileName, "w") as json_file:
        json.dump(data, json_file)
# print(completion.model_dump_json(indent=2))
def readJSONFile(fileName):
    # To read the stored JSON data back into Python:
    with open(fileName, "r") as json_file:
        stored_data = json.load(json_file)
        
    return stored_data

In [39]:
results_list = []
text = "He was playing in Italy at the time and returned to Argentina to meet her under the pretense of a legal problem. In early 2013 Jimena left the cast of Sos mi hombre and went to live in Italy with her boyfriend, the footballer Daniel Osvaldo and he left his wife Elena Braccini and his two daughters."
print("1. Try send prompt")
answer = send_prompt_ranking(text)
print("2. Response prompt")
# Parse the JSON response into a Python dictionary
data = json.loads(answer)
results_list.append(data)
writeJSONFile(results_list,'responseLLM.json')
dataJSON = readJSONFile('responseLLM.json')
res_temp_df = pd.json_normalize(dataJSON)
res_temp_df.info()
res_temp_df.head()

1. Try send prompt
{"thm_type_1": "Personal life", "thm_type_2": "Music making", "thm_explanation_1": "Relationship and family matters", "thm_explanation_2": "Involvement of a footballer"}
CompletionUsage(completion_tokens=53, prompt_tokens=299, total_tokens=352)
2. Response prompt
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   thm_type_1         1 non-null      object
 1   thm_type_2         1 non-null      object
 2   thm_explanation_1  1 non-null      object
 3   thm_explanation_2  1 non-null      object
dtypes: object(4)
memory usage: 160.0+ bytes


Unnamed: 0,thm_type_1,thm_type_2,thm_explanation_1,thm_explanation_2
0,Personal life,Music making,Relationship and family matters,Involvement of a footballer


# Execute Classification for all sentences

In [6]:
for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=50):
# for chunk in pd.read_csv('totalBiographiesEntities.csv', chunksize=50):
# for chunk in pd.read_csv('totalBiographiesBenchmark.csv', chunksize=50):
# for chunk in pd.read_csv('totalTest.csv', chunksize=10):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    for file_name_item in df_file_name.itertuples():
        print(file_name_item.file_name.replace(".txt",".csv"))
        if os.path.isfile('cacheThemeClassificationLLMResponse/'+file_name_item.file_name.replace(".csv",".json")):
            continue
        file_exists = os.path.isfile('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
        # file_exists = os.path.isfile(file_name_item.file_name.replace(".txt",".csv"))
        
        if file_exists:
            sentences_df = pd.read_csv('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
            # sentences_df = pd.read_csv(file_name_item.file_name.replace(".txt",".csv"))
            
            # obtain list of paragraphs
            print("Sentences to process: ",len(sentences_df))
            results_list = []
            errors_list = []
            # response_dict = {}
            count = 0
            for snt in sentences_df.itertuples():
                if len(snt.sentences)<=25:
                    continue
                try:
                    print("1. Try send prompt")
                    answer = send_prompt_ranking(str(snt.sentences).strip())
                    print("2. Response prompt")
                    # Parse the JSON response into a Python dictionary
                    data = json.loads(answer)
                    data["sentenceIndex"] = snt.sentenceIndex
                    data["paragraphIndex"] = snt.paragraphIndex
                    data["section"] = snt.section
                    data["sentences"] = snt.sentences
                    # print(data)
                    # response_dict.append(data)
                    results_list.append(data)
                except Exception as e:
                    print("Identifier: ",str(snt.sentenceIndex)," ",str(snt.paragraphIndex))
                    print(e, file=sys.stdout)
                    # save errors in list
                    errors_list.append({"sentenceIndex":str(snt.sentenceIndex),"paragraphIndex":str(snt.paragraphIndex),
                                       "section":snt.section,"sentences": str(snt.sentences)})
                    print("waiting...")
                    time.sleep(30)
                    # raise
                finally:
                    count +=1
                    if (count % 20) == 0:
                        print("waiting...")
                        time.sleep(15)
            
            # check for response errors and run LLM read again
            errors_df = pd.json_normalize(errors_list)
            if len(errors_df) > 0:
                for row in errors_df.itertuples():
                    try:
                        print("1. Try send prompt")
                        answer = send_prompt_ranking(str(row.sentences).strip())
                        print("2. Response prompt")
                        # Parse the JSON response into a Python dictionary
                        data = json.loads(answer)
                        data["sentenceIndex"] = row.sentenceIndex
                        data["paragraphIndex"] = row.paragraphIndex
                        data["section"] = row.section
                        data["sentences"] = row.sentences
                        # print(data)
                        # response_dict.append(data)
                        results_list.append(data)
                    except Exception as e:
                        print("Identifier: "+str(row.sentenceIndex)+" "+str(row.paragraphIndex))
                        print(e, file=sys.stdout)
                        time.sleep(30)
                        # raise
                    finally:
                        count +=1
                        if (count % 20) == 0:
                            print("waiting...")
                            time.sleep(15)
            # save final 
            writeJSONFile(results_list,'cacheThemeClassificationLLMResponse/'+file_name_item.file_name.replace(".csv",".json"))
    
    print("waiting at file level...")
    time.sleep(120)

10085.csv
9039.csv
21511.csv
45181.csv
49644.csv
50350.csv
57520.csv
99636.csv
180714.csv
312443.csv
2444917.csv
608845.csv
1048151.csv
2232977.csv
1913885.csv
1790990.csv
409969.csv
1709886.csv
1551347.csv
50782750.csv
181946.csv
579599.csv
50963136.csv
226142.csv
1422240.csv
1174545.csv
70020.csv
752694.csv
312781.csv
2898019.csv
2253021.csv
3450382.csv
2553865.csv
3263983.csv
2320846.csv
144624.csv
827409.csv
50902387.csv
671637.csv
562392.csv
1396921.csv
2815597.csv
858538.csv
2269540.csv
1205991.csv
701860.csv
2334176.csv
252147.csv
652114.csv
320685.csv
waiting at file level...
8716.csv
3606266.csv
576282.csv
1232492.csv
113049.csv
43165.csv
3770842.csv
221191.csv
1491559.csv
1790137.csv
529161.csv
167975.csv
3126224.csv
78231.csv
1022191.csv
154038.csv
223497.csv
356414.csv
2625004.csv
1359335.csv
3081864.csv
2450365.csv
63747.csv
739770.csv
1566844.csv
50230.csv
439467.csv
181985.csv
2296293.csv
2520364.csv
2682932.csv
3302723.csv
51560453.csv
891378.csv
3236079.csv
1914666.csv

# Execute only HM annotated sentences

In [8]:
# for chunk in pd.read_csv('toAnnotatePart6.csv', chunksize=100):
# for chunk in pd.read_csv('totalBiographiesEntities.csv', chunksize=50):
# for chunk in pd.read_csv('totalBiographiesBenchmark.csv', chunksize=50):
for chunk in pd.read_csv('totalBiosToProcessAdditionalInfo.csv', chunksize=10):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    count = 0
    total_num_tokens = 0
    for file_name_item in df_file_name.itertuples():
        print(file_name_item.file_name)
        if os.path.isfile('cacheThemeClassificationResponse/'+file_name_item.file_name.replace(".csv",".json")):
            continue
        # file_exists = os.path.isfile('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
        # file_exists = os.path.isfile(file_name_item.file_name.replace(".txt",".csv"))
        # Filter only the HM annotations
        meetupsAnnotations_df = pd.read_csv('meetupsAnnotations/'+file_name_item.file_name)
        # Filter HM only
        meetupsAnnotations_df = meetupsAnnotations_df.query('annotation == "HM"')
        if len(meetupsAnnotations_df)==0:
            continue
        sentences_df = pd.read_csv('indexedSentences/'+file_name_item.file_name,usecols=['sentences','sentenceIndex','paragraphIndex'])
           
        
        results_list = []
        errors_list = []
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        
        for meetup in meetupsAnnotations_df.itertuples():
            # print(pd.isnull(meetup.before))
            text = ''
            if pd.isnull(meetup.before):
                sent_df = sentences_df.loc[(sentences_df.paragraphIndex == int(meetup.paragraphIndex))&(sentences_df.sentenceIndex == int(meetup.after))].copy()
                text = sentenceFormating(str(sent_df.iloc[0]['sentences']))
            else:
                sent_df = sentences_df[(sentences_df['paragraphIndex'] == meetup.paragraphIndex) & (sentences_df['sentenceIndex'].isin([int(meetup.before), int(meetup.after)]))].copy()
                text = sentenceFormating(str(sent_df.iloc[0]['sentences']) + " " + str(sent_df.iloc[1]['sentences']))
            num_tokens = len(encoding.encode(text))
            # print(sent_df)
            print("after: ", meetup.after, ". paragraph: "+ str(meetup.paragraphIndex))
            print(text)
            
            # response_dict = {}
            try:
                print("1. Try send prompt")
                answer = send_prompt_ranking(text)
                
                print("2. Response prompt")
                # Parse the JSON response into a Python dictionary
                data = json.loads(answer)
                data["sentenceIndex"] = meetup.after
                data["paragraphIndex"] = meetup.paragraphIndex
                # data["section"] = snt.section
                # data["sentences"] = snt.sentences
                # print(data)
                # response_dict.append(data)
                data["sentenceIndexPrev"] = meetup.before
                results_list.append(data)
            except Exception as e:
                print("Identifier: ",str(meetup.after)," ",str(meetup.paragraphIndex))
                print(e, file=sys.stdout)
                # save errors in list
                errors_list.append({"sentenceIndex":str(meetup.after),"paragraphIndex":str(meetup.paragraphIndex),
                                    "sentences": str(text),"sentenceIndexPrev":meetup.before})
                                   # "section":snt.section,"sentences": str(snt.sentences)})
                print("waiting...")
                time.sleep(30)
                # raise
            finally:
                total_num_tokens +=num_tokens
                print("total_num_tokens: ", total_num_tokens)
                if total_num_tokens > 2500:
                    time.sleep(20)
                    total_num_tokens = 0
                count +=1
                if (count >=4000):
                    print("waiting...")
                    time.sleep(30)
                    count = 0
            
        # check for response errors and run LLM read again
        errors_df = pd.json_normalize(errors_list)
        if len(errors_df) > 0:
            for row in errors_df.itertuples():
                try:
                    print("1. Try send prompt")
                    answer = send_prompt_ranking(str(row.sentences).strip())
                    print("2. Response prompt")
                    # Parse the JSON response into a Python dictionary
                    data = json.loads(answer)
                    data["sentenceIndex"] = row.sentenceIndex
                    data["paragraphIndex"] = row.paragraphIndex
                    # data["section"] = row.section
                    # data["sentences"] = row.sentences
                    data["sentenceIndexPrev"] = row.before
                    # print(data)
                    # response_dict.append(data)
                    results_list.append(data)
                except Exception as e:
                    print("Identifier: "+str(row.sentenceIndex)+" "+str(row.paragraphIndex))
                    print(e, file=sys.stdout)
                    time.sleep(30)
                    # raise
                finally:
                    count +=1
                    if (count % 20) == 0:
                        print("waiting...")
                        time.sleep(15)
        # save final 
        writeJSONFile(results_list,'cacheThemeClassificationResponse/'+file_name_item.file_name.replace(".csv",".json"))
        print("Waiting at file level...")
        time.sleep(2)
    print("waiting at batch level...")
    time.sleep(30)

1965784.csv
32264675.csv
2934315.csv
28312254.csv
8040240.csv
22459257.csv
3362986.csv
19474675.csv
51596657.csv
484364.csv
4241227.csv
46385848.csv
52279001.csv
8064528.csv
5143989.csv
15182641.csv
6975970.csv
1949333.csv
2470630.csv
18713679.csv
23394028.csv
1862380.csv
14565496.csv
21438871.csv
43618783.csv
37029067.csv
8623467.csv
65518206.csv
80664.csv
36018341.csv
37882107.csv
27043626.csv
2887963.csv
34546475.csv
4636486.csv
9342621.csv
757045.csv
22840223.csv
54322005.csv
56753343.csv
1471378.csv
33727377.csv
13259839.csv
4032265.csv
11728784.csv
6008101.csv
4412606.csv
46935921.csv
4118252.csv
2568814.csv
2742461.csv
39989000.csv
2177859.csv
67794551.csv
27539399.csv
33599262.csv
241976.csv
20614582.csv
3639488.csv
27672035.csv
45303952.csv
68265086.csv
41025333.csv
33429767.csv
1428081.csv
944408.csv
38206522.csv
57010402.csv
45719525.csv
26179378.csv
144108.csv
7074351.csv
46330765.csv
58687892.csv
28527621.csv
25184131.csv
22911033.csv
14323839.csv
47122240.csv
49408567.csv

# Process LLM response cache

Reading every JSON file in cache

Return a list object of files in the given folder

In [8]:
def get_top_two_columns(row, start_col, end_col):
    # Slice the row to consider only the specified range of columns
    row_slice = row[start_col:end_col + 1]
    
    # Get column names sorted by values in descending order
    sorted_columns = sorted(row_slice.items(), key=lambda x: x[1], reverse=True)
    
    # Extract the names of the top two columns
    top_column1, top_value1 = sorted_columns[0]
    top_column2, top_value2 = sorted_columns[1]

    return top_column1, top_value1, top_column2, top_value2

In [9]:
for chunk in pd.read_csv('totalBiosToProcessAdditionalInfo.csv', chunksize=50):
# for chunk in pd.read_csv('toAnnotatePart5.csv', chunksize=50):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    themes_df = pd.DataFrame()
    for file_name_item in df_file_name.itertuples():
        themes_df = pd.DataFrame()
        print(file_name_item.file_name)

        try:
            dataJSON = readJSONFile('cacheThemeClassificationLLMResponse/'+file_name_item.file_name.replace('.csv','.json'))
            res_temp_df = pd.json_normalize(dataJSON)
            res_temp_df['source'] = 'LLM'
            # 'thm_type_1','thm_type_2','thm_explanation_1','thm_explanation_2','sentenceIndex','paragraphIndex'
            # new_linkedEntities_df.drop(columns=['entity','offset','entType_y','tokenIndex','token'], inplace=True)
        except pd.errors.EmptyDataError:    
            print("EmptyDataError: Theme extraction with LLM. No results, no file")
            # What happends when there is no TIME? We still could have people, places and themes, therefor a HT? Shall continue?
            res_temp_df = pd.DataFrame(columns=['thm_type_1','thm_type_2','thm_explanation_1','thm_explanation_2','sentenceIndex','paragraphIndex','source'])
        except FileNotFoundError:
            print("FileNotFoundError: No theme using LLM.")
            res_temp_df = pd.DataFrame(columns=['thm_type_1','thm_type_2','thm_explanation_1','thm_explanation_2','sentenceIndex','paragraphIndex','source'])
        except Exception as e:
            print("Identifier: ",str(row['sentenceIndex'])," ",str(row['paragraphIndex']))
            traceback.print_exc()
            print(e, file=sys.stdout)
        
        # before,after,paragraphIndex,annotation,sentences
        meetupsAnnotations_df = pd.read_csv('meetupsAnnotations/'+file_name_item.file_name,usecols=['after','paragraphIndex','annotation'])
        meetupsAnnotations_df.rename(columns={'after': 'sentenceIndex'}, inplace=True)
        
        if len(res_temp_df)>0:
            meetups_merge_llm = pd.merge(meetupsAnnotations_df,res_temp_df,how='inner', on=['sentenceIndex','paragraphIndex'])
            new_llm = meetups_merge_llm[['thm_type_1','thm_type_2','thm_explanation_1','thm_explanation_2','sentenceIndex','paragraphIndex','source']]
        else:
            new_llm = res_temp_df.copy()

        # Filter HM and HT only
        meetupsAnnotations_df = meetupsAnnotations_df[~meetupsAnnotations_df['annotation'].isin(['N'])]
        if len(meetupsAnnotations_df)==0:
            continue
            
        try:
            meetupType_ml_df = pd.read_csv('cacheThemeClassificationMLResponse/'+file_name_item.file_name)
            # meetupType_ml_df['source'] = 'ML'
            meetups_merge_main = pd.merge(meetupsAnnotations_df,meetupType_ml_df,how='inner', on=['sentenceIndex','paragraphIndex'])
            # print(meetups_merge_main)
            # Iterate through each row and print the top two columns within the specified range
            new_columns = []
            for index, row in meetups_merge_main.iterrows():
                # candidate = meetups_merge_llm[(meetups_merge_llm['sentenceIndex'] == row['sentenceIndex'])&(meetups_merge_llm['paragraphIndex'] == row['paragraphIndex'])]
                # if len(candidate) > 1:
                #     # use the rows
                # else:
                top_column1, top_value1, top_column2, top_value2 = get_top_two_columns(row, 4, 9)
                new_columns.append([top_column1, "Confidence value: "+str(top_value1), top_column2, "Confidence value: "+str(top_value2),
                                    row['sentenceIndex'],row['paragraphIndex'],'ML'])
                # print(f"Row {index + 1}: Top columns are {top_column1} and {top_column2}")
                    # create a row 
            new_ml = pd.DataFrame(new_columns, columns=['thm_type_1','thm_type_2','thm_explanation_1','thm_explanation_2','sentenceIndex','paragraphIndex','source'])
        except Exception as e:
            print("Identifier: ",str(row['sentenceIndex'])," ",str(row['paragraphIndex']))
            traceback.print_exc()
            print(e, file=sys.stdout)

        themes_df = pd.concat([new_llm,new_ml],ignore_index = True)
        # print(themes_df)
        themes_df.sort_values(by=['paragraphIndex','sentenceIndex','source'],inplace=True)
        themes_df.drop_duplicates(subset=['paragraphIndex','sentenceIndex'],inplace=True)
        # print(themes_df)
        themes_df.to_csv('extractedMeetupTypes/'+file_name_item.file_name.replace(".json",".csv"), index=False)

2706137.json
597665.json
1272132.json
277524.json
3547359.json
18995.json
2320846.json
2553132.json
827624.json
1271437.json
918090.json
192024.json
671637.json
50734876.json
170021.json
3757644.json
186175.json
827409.json
2900106.json
1065581.json
144196.json
90698.json
152557.json
245879.json
259159.json
205508.json
647545.json
371878.json
1263963.json
1413284.json
2281945.json
1517721.json
3197212.json
625857.json
50798971.json
413078.json
1477038.json
762314.json
696261.json
3583583.json
2833181.json
761492.json
51940478.json
866345.json
152097.json
2444917.json
3450382.json
2704521.json
766300.json
874840.json
1049483.json
3301109.json
1181499.json
575924.json
312443.json
1787782.json
701860.json
950070.json
1024347.json
1277042.json
512497.json
759988.json
59216.json
3521750.json
647820.json
562392.json
51033485.json
855002.json
1174545.json
576282.json
984106.json
736528.json
356414.json
1709886.json
1077508.json
330103.json
99636.json
60907.json
589925.json
3740.json
1322881.j

In [6]:
# calculate tokens by biography sample
import tiktoken
from unidecode import unidecode

# def num_tokens_from_string(string: str, encoding_name: str) -> int:
#     encoding = tiktoken.get_encoding(encoding_name)
#     num_tokens = len(encoding.encode(string))
#     return num_tokens

# total tokens
total_tokens_benchmark = 0
# for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=50):
# for chunk in pd.read_csv('totalBiographiesEntities.csv', chunksize=50):
for chunk in pd.read_csv('totalBiographiesBenchmark.csv', chunksize=50):
# for chunk in pd.read_csv('totalTest.csv', chunksize=10):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    for file_name_item in df_file_name.itertuples():
        print(file_name_item.file_name.replace(".txt",".csv"))
        # if os.path.isfile('cacheThemeClassificationLLMResponse/'+file_name_item.file_name.replace(".csv",".json")):
        #     continue
        file_exists = os.path.isfile('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
        # file_exists = os.path.isfile(file_name_item.file_name.replace(".txt",".csv"))
        
        if file_exists:
            total_tokens = 0
            sentences_df = pd.read_csv('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
            # sentences_df = pd.read_csv(file_name_item.file_name.replace(".txt",".csv"))
            
            # obtain list of paragraphs
            print("Sentences to process: ",len(sentences_df))
            results_list = []
            errors_list = []
            # response_dict = {}
            count = 0
            for snt in sentences_df.itertuples():
                encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
                # num_tokens = len(encoding.encode(string))
                # optimise string
                # re.sub(r'\.{3,}', '...', text_without_line_breaks)
                # re.sub(r'([.?!])\1+', r'\1', text_without_exclamation)
                sent_string = re.sub(r'([.?!])\1+', r'\1',re.sub(r'\.{3,}', '...',unidecode(str(snt.sentences).strip())))
                # print(snt.sentences)
                # print(sent_string)
                num_tokens = len(encoding.encode(sent_string))
                # print(num_tokens)
                total_tokens += num_tokens
            # previous 0.0015 07/11 now input 0.0010, output 0.0020
            print("Total tokens: ",total_tokens," , Input value $", str(total_tokens*0.0010))
        total_tokens_benchmark += total_tokens
print("Total tokens benchmark for 10 biographies: ",total_tokens_benchmark," , Input value $", str(total_tokens_benchmark*0.0010))
# Total tokens benchmark for 10 biographies:  73682  , Input value $ 110.523
# Total tokens benchmark for 10 biographies:  73423  , Input value $ 110.1345
# reduced about 200 tokens using unidecode
# Update 07/11/23
# Total tokens benchmark for 10 biographies:  73423  , Input value $ 73.423

10085.csv
Sentences to process:  460
Total tokens:  13273  , Input value $ 13.273
21511.csv
Sentences to process:  263
Total tokens:  5900  , Input value $ 5.9
45181.csv
Sentences to process:  392
Total tokens:  8154  , Input value $ 8.154
49644.csv
Sentences to process:  563
Total tokens:  13603  , Input value $ 13.603
50350.csv
Sentences to process:  290
Total tokens:  7220  , Input value $ 7.22
57520.csv
Sentences to process:  182
Total tokens:  4319  , Input value $ 4.319
99636.csv
Sentences to process:  365
Total tokens:  8194  , Input value $ 8.194
180714.csv
Sentences to process:  364
Total tokens:  9336  , Input value $ 9.336
312443.csv
Sentences to process:  142
Total tokens:  2272  , Input value $ 2.2720000000000002
2444917.csv
Sentences to process:  47
Total tokens:  1152  , Input value $ 1.1520000000000001
Total tokens benchmark for 10 biographies:  73423  , Input value $ 73.423


In [5]:
# calculate tokens by biography sample
import tiktoken
from unidecode import unidecode

total_tokens_benchmark = 0
# for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=50):
# for chunk in pd.read_csv('totalBiographiesEntities.csv', chunksize=50):
for chunk in pd.read_csv('totalBiographiesBenchmark.csv', chunksize=50):
# for chunk in pd.read_csv('totalTest.csv', chunksize=10):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    for file_name_item in df_file_name.itertuples():
        print(file_name_item.file_name.replace(".txt",".csv"))
        # if os.path.isfile('cacheThemeClassificationLLMResponse/'+file_name_item.file_name.replace(".csv",".json")):
        #     continue
        file_exists = os.path.isfile('indexedParagraphs/'+file_name_item.file_name.replace(".txt",".csv"))
        # file_exists = os.path.isfile(file_name_item.file_name.replace(".txt",".csv"))
        
        if file_exists:
            total_tokens = 0
            paragraph_df = pd.read_csv('indexedParagraphs/'+file_name_item.file_name.replace(".txt",".csv"))
            # sentences_df = pd.read_csv(file_name_item.file_name.replace(".txt",".csv"))
            
            # obtain list of paragraphs
            print("Paragraphs to process: ",len(paragraph_df))
            results_list = []
            errors_list = []
            # response_dict = {}
            count = 0
            for prg in paragraph_df.itertuples():
                encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
                # num_tokens = len(encoding.encode(string))
                
                num_tokens = len(encoding.encode(unidecode(str(prg.paragraph).strip())))
                # print(num_tokens)
                total_tokens += num_tokens
            
            print("Total tokens: ",total_tokens," , Input value $", str(total_tokens*0.0010))
        total_tokens_benchmark += total_tokens
print("Total tokens benchmark for 10 biographies: ",total_tokens_benchmark," , Input value $", str(total_tokens_benchmark*0.0010))
# Total tokens benchmark for 10 biographies:  73682  , Input value $ 110.523
# Total tokens benchmark for 10 biographies:  73423  , Input value $ 110.1345
# reduced about 200 tokens using unidecode
# Using paragraphs
# Total tokens benchmark for 10 biographies:  73251  , Input value $ 109.8765
# Update 07/11/23
# Total tokens benchmark for 10 biographies:  73251  , Input value $ 73.251

10085.csv
Paragraphs to process:  44
Total tokens:  13246  , Input value $ 13.246
21511.csv
Paragraphs to process:  93
Total tokens:  5895  , Input value $ 5.8950000000000005
45181.csv
Paragraphs to process:  88
Total tokens:  8131  , Input value $ 8.131
49644.csv
Paragraphs to process:  89
Total tokens:  13588  , Input value $ 13.588000000000001
50350.csv
Paragraphs to process:  65
Total tokens:  7163  , Input value $ 7.163
57520.csv
Paragraphs to process:  87
Total tokens:  4317  , Input value $ 4.317
99636.csv
Paragraphs to process:  94
Total tokens:  8171  , Input value $ 8.171
180714.csv
Paragraphs to process:  54
Total tokens:  9317  , Input value $ 9.317
312443.csv
Paragraphs to process:  55
Total tokens:  2277  , Input value $ 2.277
2444917.csv
Paragraphs to process:  8
Total tokens:  1146  , Input value $ 1.1460000000000001
Total tokens benchmark for 10 biographies:  73251  , Input value $ 73.251
