In [1]:
import numpy as np
import pandas as pd
import re
import json
import time
from google import genai
from google.genai.types import GenerateContentConfig

from journal_config import *

In [3]:
l_journal = list(JOURNAL_DOI.keys())

In [88]:
client = genai.Client(api_key=GEMINI_API_KEY)

Check max tokens for LLM input

In [89]:
model_info = client.models.get(model="gemini-2.5-flash")
print(f"{model_info.input_token_limit=}")
print(f"{model_info.output_token_limit=}")

model_info.input_token_limit=1048576
model_info.output_token_limit=65536


Check token size for query: 
For Gemini models, a token is equivalent to about 4 characters. 100 tokens is equal to about 60-80 English words.
https://ai.google.dev/gemini-api/docs/tokens?lang=python

In [90]:
total_tokens = client.models.count_tokens(
    model="gemini-2.5-flash-lite", 
    contents=
    """
    from the following 100 rows, extract in a sentence length: the paper_title, background, research question, data and methods, main results, implications, whether it is AI related; for each row in JSON format.
    """
    +
    str(df[0:50])
)
print("total_tokens: ", total_tokens)

total_tokens:  sdk_http_response=HttpResponse(
  headers=<dict len=11>
) total_tokens=2752 cached_content_token_count=None


### AI tagging

In [47]:
batch_size = 50

for j in d_journal_doi.keys():
    df = pd.read_csv(j+'.csv')
    df = df[~df['abstract'].isna()]
    df = df[['title', 'author', 'publication_year', 'journal_name', 'abstract']]
    df_response = pd.DataFrame()
    
    for i in range(0, len(df), batch_size):
        print("batch"+str(i))
        while True:
            try:
                response = client.models.generate_content(
                    model='gemini-2.5-flash-lite',
                    config=GenerateContentConfig(
                        system_instruction="Always respond *only* in valid JSON without extra commentary.", 
                        temperature=0.2
                    ),
                    contents=
                    f"""
                    from the following {batch_size} rows, extract: the background, research_question, data, methods, main_results, implications, whether it is_AI_related, AI_related_keyword if is_AI_related is True; keep original columns ['title', 'publication_year', 'journal_name'] as identifiers in the output; do this for each row and output only JSON format.
                    """
                    +
                    str(df[i:i+batch_size])
                )
                df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))
                df_response = pd.concat([df_response, df_json], ignore_index=True)
                print(len(df_json))
                time.sleep(300)
                break
            except Exception as e:
                print("batch"+str(i)+" re-attempt")
                time.sleep(100)
        for e in df_response.title:
            print(e)
    df_response.to_csv(j+'_response.csv', index=False)

batch0


  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


50
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

  df_json = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


37
Should Derivatives Be Privileged in Bankruptcy?
Regulatory Arbitrage and Cross-Border Bank Acquisitions
Risk Overhang and Loan Portfolio Decisions: Smarter Banks, Riskier Loans?
Is Historical Cost Accounting a Panacea? Market Reactions to Accounting Rule Changes

Internal Capital Markets in Business Groups: Evidence from China
Foreclosures, House Prices, and the Real Economy
Asset Quality Misrepresentation by Financial Intermediaries
The People in Your Neighborhood: Social Interactions and Fund Manager Behavior
Mandatory Portfolio Disclosure, Stock Liquidity, and Price Informativeness
Government Intervention and Information Aggregation
CEO Preferences and Acquisitions
The Wall Street Walk when Blockholders Compete
Reaching for Yield in the Bond Market
Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Wall Street Occupations
Market Making Contracts, Firm Value, and the Impact of Regulation
Asymmetric Learning from Financial Information
Informational Frictions and Commodity 

### Tag validation

In [63]:
df_output = pd.DataFrame()
df_output = pd.concat([pd.read_csv("Journal of Accounting Research_response.csv"), pd.read_csv("Management Science_response.csv")], ignore_index=True)

In [71]:
response = client.models.generate_content(
    model='gemini-2.5-pro',
    config=GenerateContentConfig(
        system_instruction="Always respond *only* in valid JSON without extra commentary.", 
        temperature=0.8
    ),
    contents=
    f"""
    determine each of the strings in the following list is truly Artificial Intelligence related.  The  definition of Artificial Intelligence do not include broad computer science or fintech topics.  The output should be only in JSON format with two columns ['keyword_string', 'is_truly_AI'].
    """
    +
    str(list((df_output[df_output["is_AI_related"] == True])["AI_related_keyword"]))
)
print(response.text)

```json
[
  {
    "keyword_string": "LASSO",
    "is_truly_AI": false
  },
  {
    "keyword_string": "Bitcoin",
    "is_truly_AI": false
  },
  {
    "keyword_string": "High-frequency trading",
    "is_truly_AI": false
  },
  {
    "keyword_string": "High-frequency trading",
    "is_truly_AI": false
  },
  {
    "keyword_string": "neural network",
    "is_truly_AI": true
  },
  {
    "keyword_string": "tokenization",
    "is_truly_AI": false
  },
  {
    "keyword_string": "utility tokens",
    "is_truly_AI": false
  },
  {
    "keyword_string": "natural language processing",
    "is_truly_AI": true
  },
  {
    "keyword_string": "FinTech",
    "is_truly_AI": false
  },
  {
    "keyword_string": "CBDC",
    "is_truly_AI": false
  },
  {
    "keyword_string": "alternative data",
    "is_truly_AI": false
  },
  {
    "keyword_string": "machine learning",
    "is_truly_AI": true
  },
  {
    "keyword_string": "automation",
    "is_truly_AI": false
  },
  {
    "keyword_string": "text-based

In [72]:
df_validation = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))

  df_validation = pd.read_json(re.sub(r"^```json\s*|\s*```$", "", response.text, flags=re.DOTALL))


In [84]:
df_validation.drop_duplicates(subset="keyword_string", keep="first")

Unnamed: 0,keyword_string,is_truly_AI
0,LASSO,False
1,Bitcoin,False
2,High-frequency trading,False
4,neural network,True
5,tokenization,False
...,...,...
284,"algorithms, optimization, resource allocation",False
285,"analytics, data analytics, optimization, machi...",True
286,"dynamic pricing, algorithms, optimization, rea...",False
287,multiarmed bandit,True


In [86]:
df_output_validated = pd.merge(df_output, (df_validation.drop_duplicates(subset="keyword_string", keep="first")), left_on = "AI_related_keyword", right_on = "keyword_string", how = "left")

In [101]:
df_output_validated["is_truly_AI"] = df_output_validated["is_truly_AI"].fillna(False)

  df_output_validated["is_truly_AI"] = df_output_validated["is_truly_AI"].fillna(False)


In [111]:
df_output_validated.to_csv('journals_response_validated.csv', index=False)