In [None]:
# !pip install python-dotenv -q

In [None]:
# !git clone https://github.com/anzonyquispe/myneta_llm.git

Cloning into 'myneta_llm'...
remote: Enumerating objects: 22993, done.[K
remote: Counting objects: 100% (22993/22993), done.[K
remote: Compressing objects: 100% (13497/13497), done.[K
remote: Total 22993 (delta 9493), reused 22988 (delta 9491), pack-reused 0 (from 0)[K
Receiving objects: 100% (22993/22993), 9.59 MiB | 16.15 MiB/s, done.
Resolving deltas: 100% (9493/9493), done.
Updating files: 100% (20420/20420), done.


In [1]:
import openai
import os
from getpass import getpass
from dotenv import find_dotenv, load_dotenv
# from IPython.display import Markdown

load_dotenv(find_dotenv())
openai.api_key = os.environ["OPENAI_API_KEY"]

# Prompts

In [2]:
ANALYSIS_PROMPT = "You are a research assistant interested in political capture in India. You are researching whether " + \
"politicians with agricultural assets are more prone to taking actions that favor agricultural businesses. Your task is to assess whether " + \
"an election winner, their spouse, or any of their dependents, own agricultural assets. To code these variables you will be provided a csv that " + \
"sumarizes immovable assets (including agricultural properties) for three categories: self (the polititian), spouse, and dependents. " + \
"Each row lists a type of land, and the column lists the individuals. When an individual owns property of a given type, the properties are listed in the corresponding cell" + \
"Otherwise, the cell lists \'Nil\'.\n" + \
"Provide a JSON blob that briefly summarizes your reasoning for each category, as well as " + \
"an indicator variable that equals 1 if the person owns any agricultural property, 0 " + \
"if they do not, and N/A if you were unable to verify either way." + \
"ONLY use the information contained in the csv. If AT LEAST ONE of the dependents owns agricultural land, output a 1 for that category." + \
"Finally, the \"agri_profession\" indicator for the polititian and the spouse should take value 1 when the individual has a profession explicitly related to agriculture" + \
"and 0 if the individual's profession does not explicitly relate to agriculture"

ideal_response = '{\n "name": "Kudecho Khamo",\n  "election_year": "Nagaland2023",\n  "agri_assets": {\n    "self": {\n      "agri_profession": 0,\n      "reasoning": "The politician owns agricultural land in Kezharu Jotsoma, confirming ownership of agricultural property.",\n      "owns_agricultural_assets": 1\n    },\n    "spouse": {\n      "agri_profession": 0,\n      "reasoning": "There is no agricultural property listed under the spouse\'s name in the records, as reflected by the value \'Nil\'.",\n      "owns_agricultural_assets": 0\n    },\n    "dependents": {\n      "reasoning": "The \'Nil\' values indicate that none of the dependents own any agricultural assets.",\n      "owns_agricultural_assets": 0\n    }\n  }\n}'

sample_name = "Kudecho Khamo"

sample_electionyear = "Nagaland2023"

sample_csv=""",0,1,2,3,4,5,6,7,8
0,Sr No,Description,self,spouse,huf,dependent1,dependent2,dependent3,
1,i,Agricultural Land,"1- PERUMBAKKAM S.NO.41/1A ACRE 2.62 CENT 2017/04/17 RS.1415020 2- MUTHANUR S.NO.32/8A1,8A2,,32/7A7,32/6B,32/9A1A,,32/9A2,32/9A1B,32/9A1C ACRE 1.75 CENT 2020/02/05 RS.631500 3-MUTHANUR S.NO. 32/6A2A, ACRE 1.45 CENT 2020/02/10 RS.217800  Total Area Built Up Area Whether Inherited N Purchase Date 0000-00-00 Purchase Cost 0.00 Development Cost 0.00  25,00,000 25 Lacs+","OLLAKALAPADI S.NO.84/2  Total Area 1 ACRE 4.98CENT Built Up Area Whether Inherited N Purchase Date 2019-09-04 Purchase Cost 1730550.00 Development Cost 0.00  19,20,000 19 Lacs+","PERUMBAKKAM S.NO.41/1A,41/1A3A,41/1A3B  Total Area ACRE4.33 CENT Built Up Area Whether Inherited Y Purchase Date 0000-00-00 Purchase Cost 0.00 Development Cost 2215000.00  35,54,400 35 Lacs+",Nil,Nil,Nil,"Rs 79,74,400 79 Lacs+"
2,ii,Non Agricultural Land,Nil,"MAHASHAKTI NAGAR PLOT NO.12 & 13 THIRUVANNAMALAI  Total Area 2400SQFT Built Up Area Whether Inherited N Purchase Date 0000-00-00 Purchase Cost 0.00 Development Cost 0.00  27,12,000 27 Lacs+",Nil,Nil,Nil,Nil,"Rs 27,12,000 27 Lacs+"
3,iii,Commercial Buildings,Nil,Nil,Nil,Nil,Nil,Nil,Nil
4,iv,Residential Buildings,Nil,Nil,Nil,Nil,Nil,Nil,Nil
5,v,Others,Nil,Nil,Nil,Nil,Nil,Nil,Nil
6,Total Current Market Value of (i) to (v) (as per Affidavit),Total Current Market Value of (i) to (v) (as per Affidavit),"25,00,000 25 Lacs+","46,32,000 46 Lacs+","35,54,500 35 Lacs+",Nil,Nil,Nil,"Rs 1,06,86,500 1 Crore+"
7,Totals Calculated,Totals Calculated,"Rs 25,00,000 25 Lacs+","Rs 46,32,000 46 Lacs+","Rs 35,54,400 35 Lacs+",Nil,Nil,Nil,"Rs 1,06,86,400 1 Crore+"
"""

USER_TEMPLATE = "Name: {name}\nElectionYear: {election_year}\nProfession: {profession}\nSpouse profession: {sp_profession}\nImmovable Assets: {assets}"

FEW_SHOT_PROMPT = [
    {"role": "system", "content": ANALYSIS_PROMPT},
    {"role": "user", "content": USER_TEMPLATE.format(
        name=sample_name, election_year=sample_electionyear, assets=sample_csv,
        profession="Entrepreneur", sp_profession="Computer Assistant"
    )},
    {"role": "assistant", "content": ideal_response}
]

# Load Data

In [3]:
import pandas as pd
from json import loads

winners_details_folder = "../data/winner_details"
winners = pd.read_csv("../data/winners.csv").drop(columns=["Unnamed: 0"])

# Single case tests

In [None]:
win_sample = winners.sample(1, random_state=1)
name = win_sample["name"].values[0]
unique_id = win_sample["unique_id"].values[0]
election_year = unique_id.split("_")[0]

with open(os.path.join(winners_details_folder, unique_id, "immovable_assets.csv")) as f:
    immov_assets = f.read()

winner_info = pd.read_csv(
    os.path.join(winners_details_folder, unique_id, "info.csv")
)
profession = winner_info["profession"].values[0]
sp_profession = winner_info["spouse_profession"].values[0]

# immov_assets_df = pd.read_csv(
#         os.path.join(winners_details_folder, unique_id, "immovable_assets.csv")
# )

messages = FEW_SHOT_PROMPT + [{
    "role": "user",
    "content": "Your response was just what I was looking for. Now do the same with the following info\n" + \
    USER_TEMPLATE.format(
        name=name,election_year=election_year,assets=immov_assets,
        profession=profession, sp_profession=sp_profession
    )
}]
response = openai.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini"
)

response_md = response.choices[0].message.content
loads(response_md)

{'name': 'Haroon Khan',
 'election_year': 'Maharashtra2024',
 'agri_assets': {'self': {'profession': 'Business',
   'reasoning': 'The politician owns agricultural land in Vagoshi Sudhagad, confirming ownership of agricultural property.',
   'owns_agricultural_assets': 1},
  'spouse': {'profession': 'Died',
   'reasoning': "There are no agricultural properties listed under the spouse's name as the value is 'Nil', indicating no ownership.",
   'owns_agricultural_assets': 0},
  'dependents': {'reasoning': "The entries show 'Nil' for dependents, indicating that none of them own agricultural assets.",
   'owns_agricultural_assets': 0}}}

In [None]:
response.usage

CompletionUsage(completion_tokens=183, prompt_tokens=1885, total_tokens=2068, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

In [None]:
pd.read_csv(
        os.path.join(winners_details_folder, unique_id, "immovable_assets.csv")
)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,Sr No,Description,self,spouse,huf,dependent1,dependent2,dependent3,
1,1,i,Agricultural Land,"Vagoshi Sudhagad Raigad Survey No 179, 304 To...",Nil,Nil,Nil,Nil,Nil,"Rs 13,45,250 13 Lacs+"
2,2,ii,Non Agricultural Land,Land At Aundhe Khurd Maval Lonavala Pune Gat N...,Nil,Nil,Nil,Nil,Nil,"Rs 11,66,000 11 Lacs+"
3,3,iii,Commercial Buildings,Nil,Nil,Nil,Nil,Nil,Nil,Nil
4,4,iv,Residential Buildings,Sajid Tower CHS Ltd Amboli Flat No 415 4th Flo...,Nil,Nil,Nil,Nil,Nil,"Rs 2,30,96,000 2 Crore+"
5,5,v,Others,Nil,Nil,Nil,Nil,Nil,Nil,Nil
6,6,Total Current Market Value of (i) to (v) (as p...,Total Current Market Value of (i) to (v) (as p...,"2,56,07,250 2 Crore+",Nil,Nil,Nil,Nil,Nil,"Rs 2,56,07,250 2 Crore+"
7,7,Totals Calculated,Totals Calculated,"Rs 2,56,07,250 2 Crore+",Nil,Nil,Nil,Nil,Nil,"Rs 2,56,07,250 2 Crore+"


In [None]:
import numpy as np
# self: 3, spouse: 4, deps: 6, 7, 8
pd.read_csv(
        os.path.join(winners_details_folder, unique_id, "immovable_assets.csv")
).iloc[1, [6, 7, 8]].values == np.array(["Nil"] * 3)

array([ True,  True,  True])

## Analysis prompt

In [None]:
ANALYSIS_PROMPT = "You are a research assistant interested in political capture in India. Right now, you are researching whether \
politicians with agricultural assets are more prone to taking actions that favor agricultural businesses. Your first task is to assess whether \
a politician owns agricultural assets. To do this, you will be given a table that summarizes their own immovable assets, as well as those owned by \
their spouse and dependents. Summarize this information by mentioning who owns assets and their valuation"

messages = [
        {"role": "system", "content": ANALYSIS_PROMPT},
        {"role": "user", "content": USER_TEMPLATE.format(
                name=name,
                election_year=election_year,
                assets=immov_assets
        )}
]
response = openai.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini"
)

response_md = response.choices[0].message.content

Markdown(response_md)

KeyError: 'profession'

# Run experiment

## Make Async Function

In [5]:
import os
import asyncio
import pandas as pd
from json import loads, JSONDecodeError
import openai
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key=openai.api_key)

async def aprocess_winner(row):
    name = row["name"]
    unique_id = row["unique_id"]
    election_year = unique_id.split("_")[0]
    winner_info = pd.read_csv(
        os.path.join(winners_details_folder, unique_id, "info.csv")
    )
    with open(os.path.join(winners_details_folder, unique_id, "immovable_assets.csv")) as f:
        immov_assets = f.read()

    winner_info = pd.read_csv(
        os.path.join(winners_details_folder, unique_id, "info.csv")
    )
    profession = winner_info["profession"].values[0]
    sp_profession = winner_info["spouse_profession"].values[0]
    messages = FEW_SHOT_PROMPT + [{
        "role": "user",
        "content": "Your response was just what I was looking for. Now do the same with the following info\n" + \
        USER_TEMPLATE.format(
            name=name,election_year=election_year,assets=immov_assets,
            profession=profession, sp_profession=sp_profession
        )
    }]
    response = await client.chat.completions.create(
            messages=messages, model="gpt-4o-mini",
            response_format={"type": "json_object"}
    )

    response_md = response.choices[0].message.content
    response_tokens = response.usage
    in_tokens, out_tokens = response_tokens.prompt_tokens, response_tokens.completion_tokens
    return response_md, in_tokens, out_tokens

# async def aprocess_winner(row):
#     loop = asyncio.get_running_loop()
#     return await loop.run_in_executor(None, process_winner, row)

In [None]:
async def main():
    winners = pd.read_csv("../data/winners.csv").drop(columns=["Unnamed: 0"])
    sampled_winners = winners.sample(50, random_state=1)
    tasks = [aprocess_winner(row) for _, row in sampled_winners.iterrows()]
    results = await asyncio.gather(*tasks)
    # results = [res for res in results if res is not None]
    json_blobs = [result[0] for result in results]
    avg_prompt_tokens = sum([result[1] / 50 for result in results])
    avg_completion_tokens = sum([result[2] / 50 for result in results])
    return json_blobs, avg_prompt_tokens, avg_completion_tokens, sampled_winners

## Run and calculate time

In [None]:
import time
start_time = time.time()
json_blobs, avg_prompt_tokens, avg_completion_tokens, sampled_winners = await main()
time_to_complete = time.time() - start_time
print("--- %s seconds ---" % (time_to_complete))

--- 6.491965055465698 seconds ---


## Estimate cost of processing all the requests

In [None]:
in_token_price, out_token_price = 0.15/1e6, 0.6/1e6
avg_price_per_entry = in_token_price * avg_prompt_tokens + out_token_price * avg_completion_tokens
estimated_total_price = avg_price_per_entry * 4100
estimated_total_price

1.8626505000000002

In [None]:
start_time = time.time()
loaded_json_blobs = [loads(blob) for blob in json_blobs]
time.time() - start_time

0.0004470348358154297

In [None]:
loaded_json_blobs

[{'name': 'Haroon Khan',
  'election_year': 'Maharashtra2024',
  'agri_assets': {'self': {'agri_profession': 0,
    'reasoning': 'The politician owns agricultural land in Vagoshi Sudhagad Raigad, confirming ownership of agricultural property.',
    'owns_agricultural_assets': 1},
   'spouse': {'agri_profession': 'N/A',
    'reasoning': 'The spouse is deceased, and thus there is no way to determine ownership of agricultural assets.',
    'owns_agricultural_assets': 'N/A'},
   'dependents': {'reasoning': "The 'Nil' values indicate that none of the dependents own any agricultural assets.",
    'owns_agricultural_assets': 0}}},
 {'name': 'Saravanan.P.S.T',
  'election_year': 'TamilNadu2021',
  'agri_assets': {'self': {'agri_profession': 1,
    'reasoning': 'The politician owns significant agricultural land, confirming ownership of agricultural property.',
    'owns_agricultural_assets': 1},
   'spouse': {'agri_profession': 1,
    'reasoning': "The spouse also has no listed agricultural pro

## Testing accuracy

In [None]:
import numpy as np

def measure_individual_response_match(response):
    name = response["name"]
    self_assets_response = response["agri_assets"]["self"]["owns_agricultural_assets"]
    spouse_assets_response =  response["agri_assets"]["spouse"]["owns_agricultural_assets"]
    dependents_assets_response = response["agri_assets"]["dependents"]["owns_agricultural_assets"]
    unique_id = sampled_winners.query(f"name=='{name}'")["unique_id"].values[0]
    ground_truth_assets_table = pd.read_csv(
        os.path.join(winners_details_folder, unique_id, "immovable_assets.csv")
    )
    self_ground_truth = ground_truth_assets_table.iloc[1, 3] != "Nil"
    spouse_ground_truth = ground_truth_assets_table.iloc[1, 4] != "Nil"
    dependents_ground_truth = np.any(ground_truth_assets_table.iloc[1, [6, 7, 8]].values != np.array(["Nil"] * 3))
    self_match = self_assets_response == self_ground_truth
    spouse_match = spouse_assets_response == spouse_ground_truth
    dependents_match = dependents_assets_response == dependents_ground_truth
    return self_match, spouse_match, dependents_match

def measure_responses_accuracy(json_blobs):
    for response in json_blobs:
        yield np.array([measure_individual_response_match(response)])

matches = np.vstack([row for row in measure_responses_accuracy(loaded_json_blobs)])


In [None]:
matches.mean()

0.9533333333333334

In [None]:
matches.mean(axis=0)

array([1.  , 0.88, 0.98])

# Run all

In [None]:
import os
import asyncio
import pandas as pd
from json import loads, JSONDecodeError
import openai
from openai import AsyncOpenAI

# Initialize the async client.
client = AsyncOpenAI(api_key=openai.api_key)

# Assume these variables are defined somewhere:
# FEW_SHOT_PROMPT, USER_TEMPLATE, winners_details_folder

async def aprocess_winner(row):
    name = row["name"]
    unique_id = row["unique_id"]
    election_year = unique_id.split("_")[0]
    try:
        with open(os.path.join(winners_details_folder, unique_id, "immovable_assets.csv")) as f:
            immov_assets = f.read()

        winner_info = pd.read_csv(os.path.join(winners_details_folder, unique_id, "info.csv"))
    except FileNotFoundError as e:
        print(f"File not found for {unique_id}: {e}")
        return None

    profession = winner_info["profession"].values[0]
    sp_profession = winner_info["spouse_profession"].values[0]

    messages = FEW_SHOT_PROMPT + [{
        "role": "user",
        "content": (
            "Your response was just what I was looking for. Now do the same with the following info\n" +
            USER_TEMPLATE.format(
                name=name,
                election_year=election_year,
                assets=immov_assets,
                profession=profession,
                sp_profession=sp_profession
            )
        )
    }]

    response = await client.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini",
        response_format={"type": "json_object"}
    )

    response_md = response.choices[0].message.content
    return {unique_id: response_md}

def create_batches(df, batch_size):
    """Yield DataFrame slices (batches) of size 'batch_size'."""
    for i in range(0, len(df), batch_size):
        yield df.iloc[i:i+batch_size]

async def process_batch(batch):
    """Process a single batch concurrently and return the list of responses (filtered for errors)."""
    tasks = [aprocess_winner(row) for _, row in batch.iterrows()]
    results = await asyncio.gather(*tasks)
    # Remove any None results (errors).
    return [result for result in results if result is not None]

async def main():
    super_dict = {}
    # Load winners data (adjust the path as needed).
    winners = pd.read_csv("../data/winners.csv").drop(columns=["Unnamed: 0"])
    
    # Set your desired batch size.
    i = 1
    batch_size = 200
    for batch in create_batches(winners, batch_size):
        print("Processing batch", i)
        i += 1
        results = await process_batch(batch)
        for d in results:
            for unique_id, response_md in d.items():
                try:
                    super_dict[unique_id] = loads(response_md)
                except JSONDecodeError as e:
                    print(f"JSON decode error for {unique_id}: {e}")
    return super_dict

# In a notebook (e.g. Colab), you can run:
# json_results = await main()


In [None]:
from json import dump

json_results = main()

with open("../data/all_responses.json", "w") as f:
        dump(json_results, f)

Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Processing batch 5
Processing batch 6
Processing batch 7
Processing batch 8
Processing batch 9
Processing batch 10
Processing batch 11
Processing batch 12
Processing batch 13
File not found for Puducherry2021_14: [Errno 2] No such file or directory: '../data/winner_details\\Puducherry2021_14\\immovable_assets.csv'
Processing batch 14
Processing batch 15
Processing batch 16
Processing batch 17
Processing batch 18
Processing batch 19
Processing batch 20
Processing batch 21
