In [3]:
import wrds
import os
import pandas as pd
import numpy as np
# from tqdm import tqdm  # Import tqdm for the progress bar

# Establish connection
conn = wrds.Connection()

Loading library list...
Done


In [4]:
# Define AI-related keywords
ai_keywords = ['Machine Learning', 'NLP', 'Natural Language Processing', 'Computer Vision', 'CV', 'Artificial Intelligence', 'AI']

# Function to calculate AI-relatedness measure
def calculate_ai_relatedness(text, keywords):
    text_lower = text.lower()
    total_words = len(text_lower.split())
    ai_count = sum(text_lower.count(keyword.lower()) for keyword in keywords)
    return ai_count / total_words if total_words > 0 else 0

In [5]:
# Define the date range for the last five years
begDate = '01/01/2023'
endDate = '12/31/2023'

In [6]:
# Fetch 10-K filings information for all companies
form10k = conn.raw_sql(f"""
                select distinct a.cik, a.fdate, a.form, a.wrdsfname, b.tickerh
                from wrdssec.wrds_forms as a 
                inner join 
                wrdssec.wciklink_names as b
                on a.cik=b.cik
                where a.form = '10-K'
                and a.fdate between '{begDate}' and '{endDate}'
                """, date_cols = ['fdate'])

form10k = form10k.sort_values(by=['cik', 'fdate'])


In [7]:
# Path to WRDS server for filings
base_path = '/wrds/sec/wrds_clean_filings'

# Extract text content from filings
textLst = []
file_errors = []  # List to store errors for debugging
ai_relatedness_scores = []  # List to store AI-relatedness scores

In [8]:
for item in form10k['wrdsfname']:
    wrdsLoc = os.path.join(base_path, item)
#    print(f"Checking file: {wrdsLoc}")  # Debug statement
    if os.path.exists(wrdsLoc):
        try:
            with open(wrdsLoc) as f:
                text = f.read()
                textLst.append(text)
                ai_relatedness_scores.append(calculate_ai_relatedness(text, ai_keywords))
        except Exception as e:
            error_message = f"Error reading file {wrdsLoc}: {e}"
            print(error_message)
            file_errors.append(error_message)
            ai_relatedness_scores.append(np.nan)  # Append NaN for missing files
    else:
        error_message = f"File not found on server: {wrdsLoc}"
        print(error_message)
        file_errors.append(error_message)
        ai_relatedness_scores.append(np.nan)  # Append NaN for missing files
        
len(textLst)

# Output the errors for further analysis
for error in file_errors:
    print(error)


File not found on server: /wrds/sec/wrds_clean_filings/000189/1895262/0001628280-23-007260.txt
File not found on server: /wrds/sec/wrds_clean_filings/000189/1895262/0001628280-23-007260.txt


In [9]:
# Ensure the length of AI-relatedness scores matches the DataFrame length
if len(ai_relatedness_scores) == len(form10k):
    form10k['AI_Relatedness'] = ai_relatedness_scores
else:
    print("Mismatch between the number of AI-relatedness scores and DataFrame rows.")


In [10]:
# Bin the companies into equal quintiles based on their AI-relatedness measure
form10k['AI_Relatedness_Quintile'] = pd.qcut(form10k['AI_Relatedness'], 5, labels=False) + 1

In [11]:
# Display the updated DataFrame
print(form10k.head())

          cik      fdate  form                             wrdsfname tickerh  \
0  0000001750 2023-07-18  10-K  000000/1750/0001104659-23-082069.txt     AIR   
1  0000001800 2023-02-17  10-K  000000/1800/0001628280-23-004026.txt     ABT   
2  0000001961 2023-03-31  10-K  000000/1961/0001264931-23-000006.txt    WDDD   
3  0000002098 2023-03-10  10-K  000000/2098/0001564590-23-003422.txt     ACU   
4  0000002178 2023-03-16  10-K  000000/2178/0000002178-23-000038.txt      AE   

   AI_Relatedness  AI_Relatedness_Quintile  
0        0.021989                      5.0  
1        0.019663                      5.0  
2        0.018132                      4.0  
3        0.015954                      2.0  
4        0.019059                      4.0  


In [12]:
form10k.head()

Unnamed: 0,cik,fdate,form,wrdsfname,tickerh,AI_Relatedness,AI_Relatedness_Quintile
0,1750,2023-07-18,10-K,000000/1750/0001104659-23-082069.txt,AIR,0.021989,5.0
1,1800,2023-02-17,10-K,000000/1800/0001628280-23-004026.txt,ABT,0.019663,5.0
2,1961,2023-03-31,10-K,000000/1961/0001264931-23-000006.txt,WDDD,0.018132,4.0
3,2098,2023-03-10,10-K,000000/2098/0001564590-23-003422.txt,ACU,0.015954,2.0
4,2178,2023-03-16,10-K,000000/2178/0000002178-23-000038.txt,AE,0.019059,4.0


In [13]:
# Aggregate the AI-relatedness measure by quintile
quintile_summary = form10k.groupby('AI_Relatedness_Quintile')['AI_Relatedness'].mean().reset_index()
quintile_summary.columns = ['Quintile', 'Average_AI_Relatedness']

# Display the quintile summary
print(quintile_summary)


   Quintile  Average_AI_Relatedness
0       1.0                0.009343
1       2.0                0.015538
2       3.0                0.017231
3       4.0                0.018733
4       5.0                0.021970


## AI-Relatedness Measure

The AI-relatedness measure \( wAI \) is calculated using the following formula:

$$
wAI = \frac{\text{Total AI-related keywords}}{\text{Total words in the cleaned 10-K filing}}
$$

Where:

- **Total AI-related keywords**: The count of occurrences of AI-related keywords (e.g., Machine Learning, NLP, CV, AI) in the 10-K filing.
- **Total words**: The total number of words in the cleaned 10-K filing.
