In [1]:
import openai
import time
import pandas as pd
import tqdm
import os

# Function to get an integer response from OpenAI API
def get_integer_from_openai(prompt, max_retries=10, retry_delay=1, model="gpt-4"):
    """Send ``prompt`` to an OpenAI chat model and return the reply parsed as an int.

    The API key is read from the ``OPENAI_KEY`` environment variable on every call.
    Any exception raised by the request or by parsing the reply (for example a
    non-numeric answer) counts as a failed attempt and is retried.

    Args:
        prompt: Text sent as the user message.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between failed attempts.
        model: Name of the chat model to query.

    Returns:
        The model's reply converted with ``int()``.

    Raises:
        RuntimeError: If no attempt produced an integer. The last underlying
            error is attached as ``__cause__``.
    """
    openai.api_key = os.getenv("OPENAI_KEY")

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            # Call the OpenAI API
            response = openai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
            )
            # int() tolerates surrounding whitespace; anything else raises and is retried
            return int(response.choices[0].message.content)
        except Exception as e:
            last_error = e
            print(f'attempt {attempt}/{max_retries} failed with the following error: {e}')
            if attempt < max_retries:
                # no point sleeping after the final attempt
                time.sleep(retry_delay)

    # Previously the function fell through to `return response_text` with the name
    # unbound, which surfaced as an UnboundLocalError and hid the real cause.
    raise RuntimeError(
        f'no integer response after {max_retries} attempts; last error: {last_error}'
    ) from last_error


def get_jsonl(df):
    """Turn every row of ``df`` into one chat-completions request dict.

    Each row must provide a ``No`` value (stringified into ``custom_id``) and a
    ``prompt`` value (used as the user message). Every request targets
    ``/v1/chat/completions`` with model ``gpt-4o`` and forces a call to the
    ``task1`` function, whose single ``company_matching`` argument is restricted
    to ``"yes"`` or ``"no"``.

    Returns:
        A list with one request dict per row, in row order.
    """

    def build_request(record):
        # Fresh nested dicts are created per row so the requests share no state.
        request_id = str(record['No'])
        chat_messages = [
            {"role": "system", "content": " "},
            {"role": "user", "content": record['prompt']},
        ]
        matching_tool = {
            "type": "function",
            "function": {
                "name": "task1",
                "description": " ",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "company_matching": {
                            "type": "string",
                            "description": "classify company matching into yes: same company or subsidiary of one another, or no",
                            "enum": ["yes", "no"],
                        },
                    },
                    "required": ["company_matching"],
                },
            },
        }
        request_body = {
            "model": "gpt-4o",
            "messages": chat_messages,
            "max_tokens": 1000,
            "tools": [matching_tool],
            # force the model to answer through the task1 function
            "tool_choice": {"type": "function", "function": {"name": "task1"}},
        }
        return {
            "custom_id": request_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": request_body,
        }

    return [build_request(record) for _, record in df.iterrows()]


In [18]:


# Load the first 100 firm pairs that have both names present.
# The path is passed directly to read_excel so pandas opens and closes the workbook
# itself; the previous open(..., 'rb') handle was never closed.
loan_tables = (
    pd.read_excel(
        '/Users/zhenggong/Documents/GitHub/IBES_selected/data/Firm all.xlsx',
        sheet_name='Sheet1',
    )
    .dropna(subset=['firm1', 'firm2'])
    # drop the column whose header is a Chinese-language question asking whether
    # firm1 and firm2 are the same company or parent/subsidiary
    .drop(columns=['使用内置搜索综合判断firm1和firm2是同一个公司或者有母子公司关系吗'])
    .head(100)
)
print(loan_tables.head())

res = []
# total= gives tqdm a known length, so it can render a progress bar and an ETA
for _, row in tqdm.tqdm(loan_tables.iterrows(), total=len(loan_tables)):
    print(row['firm1'], row['firm2'])

    prompt = f"""
    You are a stock analyst, the task is to decide whether two firm names represent the same company or is subsidiary of the other.
    The first firm name is {row['firm1']}, the second firm name is {row['firm2']}.

    Use not only the name similarity but also the context and history of the company. For example, Merrill Lynch and Bank of America are the same company or subsidiary of the other, while Merrill Lynch and Goldman Sachs are not.

    Try your best to make a decision with representing or not representing the same company. 
    return 0 if they are not the same company or subsidiary of the other, return 1 if they represent same company or subsidiary of the other. return ONLY the indicated integer and NOTHING ELSE.
    """

    # 1 = same company / subsidiary, 0 = not, as requested in the prompt above
    result = get_integer_from_openai(prompt)
    print(result)

    res.append(result)

   No  ID_MS                    firm1                         firm2   ID_AS
0   1  15294          swiss bank corp           swiss national bank  202552
1   2  49619            chemical bank          chemical tankers inc  322023
2   3  50202         bank of new york  bank of new york mellon corp    2019
3   4  12677         bank of montreal              bank of montreal   15580
4   5  92873  detroit bank & trust co             detroit edison co   65089


0it [00:00, ?it/s]

swiss bank corp swiss national bank


1it [00:01,  1.07s/it]

0
chemical bank chemical tankers inc


2it [00:01,  1.01it/s]

0
bank of new york bank of new york mellon corp


3it [00:02,  1.03it/s]

1
bank of montreal bank of montreal


4it [00:03,  1.00s/it]

1
detroit bank & trust co detroit edison co


5it [00:04,  1.17it/s]

0
dai-ichi kangyo bank ltd dai-ichi kangyo bank ltd


6it [00:05,  1.33it/s]

1
sanpaolo banco di napoli spa [ex-banco di napoli] risanamento napoli spa


7it [00:05,  1.42it/s]

0
bank of nova scotia bank of nova scotia


8it [00:06,  1.29it/s]

1
national westminster bank plc natl westminster bank


9it [00:08,  1.14s/it]

1
first national bank of birmingham first national bank alaska


10it [00:10,  1.33s/it]

0
arab banking corp bsc [abc] arab banking corp


11it [00:11,  1.12s/it]

1
bank of scotland plc bank of scotland


12it [00:11,  1.01it/s]

1
bnp paribas [ex-banque paribas] bnp paribas


13it [00:12,  1.03s/it]

1
manufacturers hanover trust manufacturers services ltd


14it [00:13,  1.06it/s]

0
westdeutsche landesbank gz westdeutsche landesbank


15it [00:14,  1.18it/s]

1
mercantile national bank mercantile bank plc


16it [00:15,  1.14it/s]

0
united bank of denver united bankers oyj


17it [00:15,  1.15it/s]

0
gulf international bank bsc [gib] gulf international services


18it [00:17,  1.08it/s]

0
bank of ireland group bank of ireland group plc


19it [00:18,  1.16s/it]

1
exchange national bank the national bank


20it [00:19,  1.01s/it]

0
union bank na union bank plc


21it [00:20,  1.12it/s]

0
union bank na union bank ltd


22it [00:21,  1.09it/s]

0
nederlandsche middenstandsbank nv nederlandse spoorwegen


23it [00:21,  1.13it/s]

0
first national bank minneapolis first national bank modaraba


24it [00:22,  1.23it/s]

0
standard chartered bank plc [scb] standard chartered bank


25it [00:23,  1.31it/s]

1
scandinavian bank group plc scandinavian tobacco group


26it [00:24,  1.22it/s]

0
royal bank of scotland plc [rbs] royal bank of scotland ltd


27it [00:24,  1.22it/s]

1
sanpaolo imi spa [ex-istituto bancario sanpaolo di torino spa] san paolo-imi spa


28it [00:25,  1.19it/s]

1
dresdner bank ag dresdner bank ag


29it [00:26,  1.32it/s]

1
national australia bank ltd [nab] national australia bk


30it [00:26,  1.45it/s]

1
cibc [canadian imperial bank of commerce] canadian imperial bank


31it [00:27,  1.38it/s]

1
bank of america bank of africa - mali


32it [00:28,  1.44it/s]

0
bank of america bank of africa- benin


33it [00:28,  1.46it/s]

0
first interstate bank of california first internatl bank israel


34it [00:30,  1.20it/s]

0
toronto dominion bank toronto dominion bank


35it [00:31,  1.09it/s]

1
bankers trust co bankers investment trust plc


36it [00:32,  1.05it/s]

0
citibank citybank


37it [00:32,  1.14it/s]

0
chase manhattan bank manhattan bancorp


38it [00:33,  1.31it/s]

0
chase manhattan bank chase perdana bhd


39it [00:34,  1.44it/s]

0
saudi international bank sohar international bank


40it [00:34,  1.42it/s]

0
barclays bank plc barclays bank plc


41it [00:35,  1.47it/s]

1
continental bank continental seeds


42it [00:35,  1.50it/s]

0
bank of boston bank of beirut


43it [00:37,  1.21it/s]

0
fuji bank ltd fuji bank ltd


44it [00:38,  1.12s/it]

1
european american bank european american resources


45it [00:39,  1.03it/s]

0
security pacific national bank security pa financial corp


46it [00:40,  1.03it/s]

0
credit suisse ag credit suisse ag


47it [00:41,  1.15it/s]

1
royal bank of canada royal bank of canada


48it [00:42,  1.15it/s]

1
bnp paribas [ex-banque nationale de paris] bnp paribas all income fund


49it [00:42,  1.26it/s]

1
hongkong & shanghai banking corp ltd hongkong & shanghai hotels


50it [00:43,  1.40it/s]

0
california first bank california bancorp


51it [00:44,  1.27it/s]

0
first pennsylvania bank first bank


52it [00:44,  1.35it/s]

0
citizens & southern national bank citizens south banking corp


53it [00:45,  1.39it/s]

1
industrial bank of japan ltd industrial bank of japan ltd


54it [00:47,  1.09s/it]

1
midland bank plc midland bank limited


55it [00:48,  1.23s/it]

1
westpac banking corp westpac banking corp


56it [00:49,  1.01s/it]

1
union bank of switzerland union bank of israel ltd


57it [00:50,  1.11it/s]

0
societe generale sa societe generale group


58it [00:51,  1.02it/s]

1
sumitomo bank sumitomo mitsui trust bank l


59it [00:52,  1.05s/it]

1
credit industriel et commercial de paris cic (credit industriel comm)


60it [00:53,  1.07s/it]

1
northern trust northern venture trust


61it [00:55,  1.32s/it]

0
sanwa bank ltd sanwa bank ltd


62it [00:57,  1.40s/it]

1
national westminster bank usa natl westminster bank


63it [00:57,  1.17s/it]

1
lincoln national bank & trust co lincoln national cv sec


64it [00:58,  1.18s/it]

0
credit agricole sa credit agricole sa


65it [00:59,  1.01s/it]

1
rainier national bank herald national bank


66it [01:00,  1.08s/it]

0
mercantile trust co mercantile bank corp


67it [01:01,  1.04s/it]

0
valley national bank valley national bancorp


68it [01:02,  1.04s/it]

1
bank of the southwest bank of the carolinas


69it [01:03,  1.08it/s]

0
deutsche bank ag deutsche bank ag


70it [01:05,  1.24s/it]

1
manufacturers hanover bank delaware manufacturers services ltd


71it [01:06,  1.23s/it]

0
textron inc textron inc


72it [01:07,  1.05s/it]

1
bank of tokyo ltd bank of tianjin co ltd


73it [01:08,  1.01s/it]

0
first union national bank of north carolina first national bank modaraba


74it [01:09,  1.01it/s]

0
mitsubishi bank ltd mitsubishi trust & banking


75it [01:10,  1.12s/it]

1
centerre bank centerstate bank corp


76it [01:11,  1.16s/it]

0
first national bank of atlanta first national bank alaska


77it [01:12,  1.11s/it]

0
mitsui bank ltd mitsui osk lines ltd


78it [01:13,  1.06s/it]

0
banca commerciale italiana spa banca popolare italiana


79it [01:14,  1.02it/s]

0
commerzbank ag commerzbank ag


80it [01:15,  1.11it/s]

1
generale bank sa generalfinance spa


81it [01:17,  1.28s/it]

0
wachovia bank wachovia corp-old


82it [01:18,  1.07s/it]

1
salomon brothers salomon brothers fund inc


83it [01:18,  1.07it/s]

1
chuo trust & banking co ltd toyo trust & banking co


84it [01:19,  1.23it/s]

0
yasuda trust & banking co ltd toyo trust & banking co


85it [01:20,  1.21it/s]

0
hokkaido takushoko bank hokkaido shinko co ltd


86it [01:20,  1.23it/s]

0
taiyo kobe bank ltd taiko bank ltd


87it [01:22,  1.07it/s]

0
mitsui trust & banking corp mitsubishi trust & banking


88it [01:23,  1.01s/it]

0
mitsubishi trust & banking corp mitsubishi trust & banking


89it [01:23,  1.10it/s]

1
nippon credit bank ltd kita-nippon bank ltd


90it [01:24,  1.22it/s]

0
sumitomo trust & banking co ltd sumitomo mitsui trust bank l


91it [01:25,  1.27it/s]

1
kyowa bank ltd towa bank ltd


92it [01:25,  1.27it/s]

0
kyowa bank ltd howa bank ltd


93it [01:28,  1.19s/it]

0
daiwa bank ltd daiwa bank


94it [01:28,  1.03s/it]

1
first city national bank first national bank alaska


95it [01:29,  1.00it/s]

0
first interstate ltd first interstate bancsystem


96it [01:30,  1.17it/s]

1
first interstate bancorp first internet bancorp


97it [01:30,  1.23it/s]

0
bank of tokyo trust co bank of tianjin co ltd


98it [01:31,  1.15it/s]

0
hypobank international sa cm international sa


99it [01:32,  1.10it/s]

0
united virginia bank united bank ltd


100it [01:34,  1.06it/s]

0





In [19]:
# Attach the answers collected in `res` as a new `result` column.
# Assumes `res` holds one entry per row of `loan_tables`, in the same order — TODO confirm
# both came from the same run of the classification loop.
loan_tables['result'] = res
# Write the table without the index column.
# NOTE(review): a later cell writes its merged batch results to this same file name,
# so this output is overwritten when that cell runs.
loan_tables.to_csv('loan_tble_matching_result.csv', index=False)

In [26]:
# prepare batch file jsonl
# Load every firm pair that has both names present. The path is passed directly to
# read_excel so pandas opens and closes the workbook itself; the previous
# open(..., 'rb') handle was never closed.
loan_tables = (
    pd.read_excel(
        '/Users/zhenggong/Documents/GitHub/IBES_selected/data/Firm all.xlsx',
        sheet_name='Sheet1',
    )
    .dropna(subset=['firm1', 'firm2'])
    # drop the column whose header is a Chinese-language question asking whether
    # firm1 and firm2 are the same company or parent/subsidiary
    .drop(columns=['使用内置搜索综合判断firm1和firm2是同一个公司或者有母子公司关系吗'])
)
print(loan_tables.head())

def add_prompt(row):
    """Render the yes/no company-matching prompt for one firm pair.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing the
            ``firm1`` and ``firm2`` names.

    Returns:
        The prompt text asking whether the two names denote the same company or a
        parent/subsidiary pair, to be answered with 'yes' or 'no'.
    """
    # The closing instruction used to say "return ONLY the indicated integer", a
    # leftover from the 0/1 version of this prompt that contradicted the 'yes'/'no'
    # answers requested in the same sentence.
    return f"""You are a stock analyst, the task is to decide whether two firm names represent the same company or is subsidiary of the other.
    The first firm name is {row['firm1']}, the second firm name is {row['firm2']}.

    Use not only the name similarity but also the context and history of the company. For example, Merrill Lynch and Bank of America are the same company or subsidiary of the other, while Merrill Lynch and Goldman Sachs are not.

    return 'no' if they are not the same company or subsidiary of the other, return 'yes' if they represent same company or subsidiary of the other. return ONLY 'yes' or 'no' and NOTHING ELSE.
    """
# Render one prompt per row and keep it alongside the firm names
loan_tables['prompt'] = loan_tables.apply(add_prompt, axis=1)
print(loan_tables)

import json

# (start row, stop row, output file) for each batch input file; a stop of None
# means "through the last row"
batch_files = [
    (0, 30000, 'loan_jsonl_1.jsonl'),
    (30000, 60000, 'loan_jsonl_2.jsonl'),
    (60000, None, 'loan_jsonl_3.jsonl'),
]
for start, stop, jsonl_name in batch_files:
    res = get_jsonl(loan_tables.iloc[start:stop])
    # Write data to JSONL file: one JSON-encoded request per line
    with open(jsonl_name, 'w') as file:
        file.writelines(json.dumps(user) + '\n' for user in res)


   No  ID_MS                    firm1                         firm2   ID_AS
0   1  15294          swiss bank corp           swiss national bank  202552
1   2  49619            chemical bank          chemical tankers inc  322023
2   3  50202         bank of new york  bank of new york mellon corp    2019
3   4  12677         bank of montreal              bank of montreal   15580
4   5  92873  detroit bank & trust co             detroit edison co   65089
          No   ID_MS                                       firm1  \
0          1   15294                             swiss bank corp   
1          2   49619                               chemical bank   
2          3   50202                            bank of new york   
3          4   12677                            bank of montreal   
4          5   92873                     detroit bank & trust co   
...      ...     ...                                         ...   
93662  93663  177598  safeguard world international holdings ltd   


In [7]:
# load in batched output
import json


def load_batch_output(output_path):
    """Parse one batch output JSONL file into a list of result records.

    Each non-blank line is a JSON object. Its ``custom_id`` becomes ``No``, the
    ``company_matching`` argument of the first tool call of the first choice is
    the answer, and ``model`` is copied from the response body.

    Args:
        output_path: Path of the batch output ``.jsonl`` file.

    Returns:
        A list of dicts with keys ``No``, ``company_matching`` and ``model``,
        in file order.
    """
    records = []
    with open(output_path, 'r') as f:
        # import the file back row by row
        for line in f:
            if not line.strip():
                # a blank line (e.g. a trailing newline) would make json.loads fail
                continue
            json_obj = json.loads(line)
            body = json_obj['response']['body']
            # the tool-call arguments arrive as a JSON-encoded string, hence the second loads
            arguments = json.loads(
                body['choices'][0]['message']['tool_calls'][0]['function']['arguments']
            )
            records.append({
                'No': json_obj['custom_id'],
                'company_matching': arguments['company_matching'],
                'model': body['model'],
            })
    return records


batch_output_paths = [
    '/Users/zhenggong/Documents/GitHub/IBES_selected/runnables/batch_jFQNkXpSipDZbkqXn6iCvPYC_output.jsonl',
    '/Users/zhenggong/Documents/GitHub/IBES_selected/runnables/batch_8lBh8IBHZFCaY6xbyVp1HeyO_output.jsonl',
    '/Users/zhenggong/Documents/GitHub/IBES_selected/runnables/batch_cJZPLtO8BMCvmC7iVBzWPzf3_output.jsonl',
]

batch_records = []
for batch_output_path in batch_output_paths:
    batch_records.extend(load_batch_output(batch_output_path))

res_data = pd.DataFrame(batch_records)
# custom_id is a string in the batch output; cast it so it matches the integer `No` key below
res_data['No'] = res_data['No'].astype(int)

# The path is passed directly to read_excel so pandas opens and closes the workbook
# itself; the previous open(..., 'rb') handle was never closed.
loan_tables = (
    pd.read_excel(
        '/Users/zhenggong/Documents/GitHub/IBES_selected/data/Firm all.xlsx',
        sheet_name='Sheet1',
    )
    .dropna(subset=['firm1', 'firm2'])
    .drop(columns=['使用内置搜索综合判断firm1和firm2是同一个公司或者有母子公司关系吗'])
)
loan_tables['No'] = loan_tables['No'].astype(int)

# Left merge keeps every batch result and attaches the firm names and ids.
# NOTE(review): a result row is repeated if its `No` occurs more than once in
# loan_tables — confirm the merged row count is what you expect.
res_data = pd.merge(res_data, loan_tables, on='No', how='left')
print(res_data)
res_data.to_csv('loan_tble_matching_result.csv', index=False)

          No company_matching              model   ID_MS  \
0          1               no  gpt-4o-2024-05-13   15294   
1          2               no  gpt-4o-2024-05-13   49619   
2          3              yes  gpt-4o-2024-05-13   50202   
3          4              yes  gpt-4o-2024-05-13   12677   
4          5               no  gpt-4o-2024-05-13   92873   
...      ...              ...                ...     ...   
93662  59996              yes  gpt-4o-2024-05-13   22604   
93663  59997               no  gpt-4o-2024-05-13   27118   
93664  59998              yes  gpt-4o-2024-05-13   13947   
93665  59999              yes  gpt-4o-2024-05-13  102132   
93666  60000               no  gpt-4o-2024-05-13  102088   

                                firm1                         firm2   ID_AS  
0                     swiss bank corp           swiss national bank  202552  
1                       chemical bank          chemical tankers inc  322023  
2                    bank of new york  bank o