# Get reports-2017-2019.xlsx 

### save to “df_companies"

In [22]:
import pandas as pd

# 读取文件中的 "companies" sheet
df_companies = pd.read_excel("reports-2017-2019.xlsx", sheet_name="annual 2017-2019")


# 查看前几行，确认结构
df_companies.head()


Unnamed: 0,conml,predicted_report_year,predicted_report_type,loc,GICS_level_1,GICS_level_2,pdf_local_path,pdfurl,n_tokenised_sentences,n_predicted_initiatives,clean_conml
0,1&1 AG,2017,annual report,,,,/srv/data/mrei/davinci/davinci_data_v6_crawl/p...,https://imagepool.1und1.ag/v2/download/bericht...,6185.0,0.0,11ag
1,1&1 AG,2018,annual report,,,,/srv/data/mrei/davinci/davinci_data_v6_crawl/p...,https://imagepool.1und1.ag/v2/download/bericht...,7222.0,0.0,11ag
2,1&1 AG,2019,annual report,,,,/srv/data/mrei/davinci/davinci_data_v6_crawl/p...,https://imagepool.1und1.ag/v2/download/bericht...,6387.0,0.0,11ag
3,3I Group PLC,2017,annual report,GBR,Financials,Financial Services,/srv/data/mrei/davinci/davinci_data_v6_crawl/p...,https://www.3i.com/media/1849/3i-group-ar-2017...,7533.0,1.0,3i
4,3I Group PLC,2018,annual report,GBR,Financials,Financial Services,/srv/data/mrei/davinci/davinci_data_v6_crawl/p...,https://www.3i.com/media/jx2hhaxx/3i-group-ar-...,6399.0,1.0,3i


### read "conml" column

In [23]:
# 提取所有非空且唯一的公司名，放入一个列表变量中
company_full_names = df_companies['conml'].dropna().unique().tolist()
# 打印公司名列表的长度和前10个公司名
print(f"Number of unique company names: {len(company_full_names)}")
print("First 10 company names:", company_full_names[:10])


Number of unique company names: 198
First 10 company names: ['1&1 AG', '3I Group PLC', 'Abengoa SA', 'AIA Group Limited', 'Air Canada', 'Aixtron SE', 'Aker ASA', 'Aker Solutions ASA', 'Amundi SA', 'Antofagasta PLC']


### create empty DataFrame

In [24]:
import pandas as pd

# 初始化元数据表
company_metadata = pd.DataFrame({'company_name': company_full_names})

# 添加你要补全的字段（先设为空）
fields = ['sector', 'industry', 'country', 'ticker', 'website', 'ISIN','Number of employees']
for field in fields:
    company_metadata[field] = None


# yfinance

### use yfinance to find ticker

In [25]:
import requests

def guess_ticker(company_name):
    url = f"https://query2.finance.yahoo.com/v1/finance/search?q={company_name}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    try:
        r = requests.get(url, headers=headers)  # 加 headers
        data = r.json()
        if data['quotes']:
            return data['quotes'][0]['symbol']
    except Exception as e:
        print(f"Error for {company_name}: {e}")
    return None


In [27]:
import time

# 遍历 company_metadata 表，为每家公司匹配 ticker
for i, row in company_metadata.iterrows():
    company_name = row['company_name']
    ticker = guess_ticker(company_name)
    company_metadata.at[i, 'ticker'] = ticker
    print(f"{i+1}/{len(company_metadata)}: {company_name} -> {ticker}")
    
    # 防止触发 Yahoo 限速限制（建议加个小延迟）
    time.sleep(0.5)


1/198: 1&1 AG -> SB=F
2/198: 3I Group PLC -> TGOPY
3/198: Abengoa SA -> AGOAF
4/198: AIA Group Limited -> 7A20.F
5/198: Air Canada -> AC.TO
6/198: Aixtron SE -> AIXAD.XC
7/198: Aker ASA -> FKM.F
8/198: Aker Solutions ASA -> 1AKA.F
9/198: Amundi SA -> AMUN.VI
10/198: Antofagasta PLC -> ANFGF
11/198: Antofagasta plc -> ANFGF
12/198: Arcelormittal SA -> MT.AS
13/198: ArcelorMittal SA -> MT.AS
14/198: Aroundtown SA -> 0RUH.IL
15/198: AstraZeneca PLC -> AZN
16/198: Atea ASA -> MKL.F
17/198: Atos SE -> ATO.PA
18/198: AT&T Inc -> T
19/198: AT&T Corp -> T
20/198: Aurizon Holdings Ltd -> QRL.F
21/198: Aviva plc -> AV.L
22/198: Banco Santander, S.A. -> SAN
23/198: Banco Santander ( Brasil ) S.A. -> BSBR
24/198: BANDAI NAMCO Holdings Inc -> 7832.T
25/198: Barrick Gold Corp -> ABR.BE
26/198: BCE Inc. -> BCE
27/198: Beach Energy Ltd -> BCHEY
28/198: Bertelsmann SE & Co. KGaA -> BTG4.F
29/198: Bertelsmann SE & Co KGaA -> BTG4.F
30/198: bioMérieux -> None
31/198: bioMérieux S.A. -> None
32/198: Black

### ticker saved in "company_metadata.csv"

In [None]:
company_metadata.csvcompany_metadata.to_csv("", index=False)


check missing

In [29]:
# 检查有多少行 ticker 是空的
missing_ticker_count = company_metadata['ticker'].isna().sum()
print(f"Companies without ticker: {missing_ticker_count} / {len(company_metadata)}")


Companies without ticker: 32 / 198


### use yfinance to fill sector, country, website, etc.

In [30]:
import yfinance as yf

def get_yfinance_info(ticker):
    try:
        info = yf.Ticker(ticker).info
        return {
            'sector': info.get('sector'),
            'industry': info.get('industry'),
            'country': info.get('country'),
            'website': info.get('website'),
            'ISIN': info.get('isin'),
            'Number of employees': info.get('fullTimeEmployees')
        }
    except Exception as e:
        print(f"Error getting info for {ticker}: {e}")
        return {}


In [31]:
import time

for i, row in company_metadata.iterrows():
    ticker = row['ticker']
    if pd.notna(ticker):
        meta = get_yfinance_info(ticker)
        for key in ['sector','industry', 'country', 'website','ISIN', 'Number of employees']:
            company_metadata.at[i, key] = meta.get(key)
        print(f"{i+1}/{len(company_metadata)}: {row['company_name']} get")
        time.sleep(0.5)  # 防止被限速


1/198: 1&1 AG get
2/198: 3I Group PLC get
3/198: Abengoa SA get
4/198: AIA Group Limited get
5/198: Air Canada get
6/198: Aixtron SE get
7/198: Aker ASA get
8/198: Aker Solutions ASA get
9/198: Amundi SA get
10/198: Antofagasta PLC get
11/198: Antofagasta plc get
12/198: Arcelormittal SA get
13/198: ArcelorMittal SA get
14/198: Aroundtown SA get
15/198: AstraZeneca PLC get
16/198: Atea ASA get
17/198: Atos SE get
18/198: AT&T Inc get
19/198: AT&T Corp get
20/198: Aurizon Holdings Ltd get
21/198: Aviva plc get
22/198: Banco Santander, S.A. get
23/198: Banco Santander ( Brasil ) S.A. get
24/198: BANDAI NAMCO Holdings Inc get
25/198: Barrick Gold Corp get
26/198: BCE Inc. get
27/198: Beach Energy Ltd get
28/198: Bertelsmann SE & Co. KGaA get
29/198: Bertelsmann SE & Co KGaA get
34/198: Boston Scientific Corporation get
35/198: Brambles Ltd get
36/198: Bucher Industries AG, Niederweningen get
37/198: Catella get
38/198: Cathay Pacific Airways Ltd get
39/198: Cathay Pacific Airways Limited 

add a column source as: yfinance

In [32]:
company_metadata['source'] = company_metadata['sector'].apply(
    lambda x: 'yfinance' if pd.notna(x) and str(x).strip() != "" else None
)


add a column as: status

If any required field is null (NaN) or an empty string, it is marked as 'missing'  

Only when all 6 fields have valid values ​​is it marked as 'matched'

In [33]:
# 明确列出你要检查的字段（不包括 ISIN）
required_fields = ['sector', 'industry', 'country', 'ticker', 'website', 'Number of employees']

def lookup_status(row):
    for field in required_fields:
        value = row.get(field)
        if pd.isna(value) or str(value).strip() == "":
            return 'missing'
    return 'matched'

company_metadata['lookup_status'] = company_metadata.apply(lookup_status, axis=1)


In [34]:
missing_count = company_metadata[company_metadata['lookup_status'] == 'missing'].shape[0]
print(f"Number of companies with missing metadata: {missing_count}")


Number of companies with missing metadata: 63


In [35]:
match_count = company_metadata[company_metadata['lookup_status'] == 'matched'].shape[0]
print(f"Number of companies with matched metadata: {match_count}")

Number of companies with matched metadata: 135


In [36]:
company_metadata.to_csv("company_metadata.csv", index=False)

### save all the lookup_status == 'missing' row to "company_metadata_missing_only.csv"

In [37]:
missing_rows = company_metadata[company_metadata['lookup_status'] == 'missing']
missing_rows.to_csv("company_metadata_missing_only.csv", index=False)

# OpenCorporates

### use open corporates to fill the table：

only use "company_metadata_missing_only.csv"

check if missing, if true, add country, website, source

In [38]:
import pandas as pd
import time
import requests
import os
from dotenv import load_dotenv

# 加载 API Key
load_dotenv()
API_KEY = os.getenv("OPENCORP_API_KEY")

# 读取只包含 missing 的公司表
missing_df = pd.read_csv("company_metadata_missing_only.csv")

# 定义调用 OpenCorporates API 的函数
def search_company_opencorp(name):
    url = f"https://api.opencorporates.com/v0.4/companies/search?q={name}&api_token={API_KEY}"
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return {}
        data = response.json()
        if data["results"]["companies"]:
            c = data["results"]["companies"][0]["company"]
            return {
                "country": c.get("jurisdiction_code", "").upper(),
                "website": c.get("homepage_url", ""),
                "source": "opencorporates"
            }
    except Exception as e:
        print(f"Error for {name}: {e}")
    return {}

# 遍历每一行进行补充
for i, row in missing_df.iterrows():
    if pd.isna(row['country']) or pd.isna(row['website']) or str(row['country']).strip() == "" or str(row['website']).strip() == "":
        result = search_company_opencorp(row['company_name'])
        if result:
            if result.get("country"):
                missing_df.at[i, 'country'] = result["country"]
            if result.get("website"):
                missing_df.at[i, 'website'] = result["website"]
            missing_df.at[i, 'source'] = result["source"]
        print(f"{i+1}/{len(missing_df)}: {row['company_name']} get")
        time.sleep(1.2)




1/63: 1&1 AG get
4/63: Barrick Gold Corp get
8/63: bioMérieux get
9/63: bioMérieux S.A. get
10/63: Blackmores Limited get
11/63: Bobst Group SA get
13/63: China National Petroleum Corporation get
14/63: Cnooc Ltd get
16/63: CRRC Corporation Ltd get
17/63: CRRC Group get
18/63: De’Longhi S.p.A. get
19/63: Fenix Outdoor AB get
21/63: Hannover Rück SE get
22/63: HeidelbergCement AG get
23/63: Hong Kong Exchanges and Clearing Ltd get
24/63: HSBC Bank Canada get
25/63: Hydro-Québec get
26/63: IEA Bioenergy get
28/63: Kordsa Teknik Tekstil A S get
30/63: MİGROS TİCARET A.Ş. get
33/63: Monash Health get
35/63: Nestle SA, Cham Und Vevey get
36/63: Netcompany A/S get
38/63: Ping An Bank Co Ltd get
39/63: Preem Norge As get
40/63: Schroders get
41/63: Sentara Healthcare get
42/63: Sentara Hospitals get
47/63: Sun Pharmaceutical Industries Ltd. get
50/63: Telstra Corporation Limited get
51/63: Telstra Corporation Ltd get
52/63: UnipolSai Assicurazioni S.p.A. get
53/63: UnipolSai Assicurazioni get

Expand country abbreviations to full names

In [39]:
import pycountry

def convert_country_code(code):
    if not code or not isinstance(code, str):
        return code
    code = code.strip().split("_")[0]
    country = pycountry.countries.get(alpha_2=code.upper())
    return country.name if country else code


In [40]:
missing_df['country'] = missing_df['country'].apply(convert_country_code)

save to "company_metadata_missing_updated.csv"

In [41]:
# 保存更新后的文件
missing_df.to_csv("company_metadata_missing_updated.csv", index=False)

In [42]:
# 读取主表和更新的缺失子表
company_metadata = pd.read_csv("company_metadata.csv")
updated_missing = pd.read_csv("company_metadata_missing_updated.csv")

combine company_metadata_missing_updated.csv to company_metadata.csv

In [43]:
# 合并两个表（只保留两列用于比较）
merged = pd.merge(
    company_metadata[['company_name', 'country', 'website']],
    updated_missing [['company_name', 'country', 'website']],
    on='company_name',
    suffixes=('_main', '_updated')
)

# 检查两个字段的冲突（都非空且不同）
conflict_rows = merged[
    (
        merged['country_main'].notna() & merged['country_updated'].notna() &
        (merged['country_main'].str.strip() != merged['country_updated'].str.strip())
    ) |
    (
        merged['website_main'].notna() & merged['website_updated'].notna() &
        (merged['website_main'].str.strip() != merged['website_updated'].str.strip())
    )
]

# 查看冲突公司
print(f"Number of conflicts: {len(conflict_rows)}")
print(conflict_rows[['company_name', 'country_main', 'country_updated', 'website_main', 'website_updated']])

# 可选：保存冲突报告
conflict_rows.to_csv("country_website_conflicts.csv", index=False)

Number of conflicts: 0
Empty DataFrame
Columns: [company_name, country_main, country_updated, website_main, website_updated]
Index: []


In [44]:
# 根据 company_name 进行合并更新（以 updated_missing 为准）
company_metadata.set_index('company_name', inplace=True)
updated_missing.set_index('company_name', inplace=True)

# 用更新后的字段覆盖原表中的相应行（只会改掉有值的）
company_metadata.update(updated_missing)

# 恢复索引
company_metadata.reset_index(inplace=True)

# 保存合并后的总表
company_metadata.to_csv("company_metadata.csv", index=False)

still missing

In [45]:
# 查看每列缺失值数量
missing_counts = company_metadata.isna().sum()

# 打印结果
print("各列缺失值统计：")
print(missing_counts)


各列缺失值统计：
company_name             0
sector                  46
industry                46
country                 11
ticker                  32
website                 41
ISIN                   198
Number of employees     59
source                  16
lookup_status            0
dtype: int64


# open figi

In [55]:
import os
import pandas as pd
import requests
from dotenv import load_dotenv
from time import sleep
from tqdm import tqdm

# 加载 API Key
load_dotenv()
API_KEY = os.getenv("OPENFIGI_API_KEY")
HEADERS = {
    'Content-Type': 'application/json',
    'X-OPENFIGI-APIKEY': API_KEY
}

# 读取公司列表
company_n = pd.read_csv("company_metadata.csv")

# 查询函数
import requests

def query_figi_by_name(name):
    url = "https://api.openfigi.com/v3/search"
    payload = {"query": name}
    try:
        response = requests.post(url, headers=HEADERS, json=payload)
        if response.status_code == 200:
            data = response.json()
            results = data.get("data", [])
            if results:
                first_result = results[0]
                ticker = first_result.get("ticker", "")
                composite_figi = first_result.get("compositeFIGI", "")

                # 尝试获取真正的 ISIN
                isin = ""
                security_identifiers = first_result.get("securityIdentifiers", [])
                for ident in security_identifiers:
                    if ident.get("idType") == "ID_ISIN":
                        isin = ident.get("identifier", "")
                        break

                return ticker, composite_figi, isin
        return "", "", ""
    except Exception as e:
        print(f"Error querying {name}: {e}")
        return "", "", ""


# 批量查找并加进度条
tickers, figis, isins = [], [], []

print("⏳ 正在查找公司 Ticker、FIGI 和 ISIN ...")
for name in tqdm(company_n['company_name'], desc="Querying OpenFIGI"):
    ticker, figi, isin = query_figi_by_name(name)
    tickers.append(ticker)
    figis.append(figi)
    isins.append(isin)
    sleep(0.7)  # 控制请求频率，避免被限流

# 写入结果
company_n['ticker'] = tickers
company_n['figi'] = figis
company_n['isin'] = isins
company_n['lookup_status'] = ['done' if t or i else 'not found' for t, i in zip(tickers, isins)]


⏳ 正在查找公司 Ticker、FIGI 和 ISIN ...


Querying OpenFIGI: 100%|██████████| 198/198 [03:34<00:00,  1.08s/it]


In [60]:
# 保存结果
company_n.to_csv("company_list_filled.csv", index=True)

# 缺失值统计（统一为小写列名）
missing_ticker = (company_n['ticker'].astype(str).str.strip() == "").sum()
missing_figi   = (company_n['figi'].astype(str).str.strip() == "").sum()
missing_isin   = (company_n['isin'].astype(str).str.strip() == "").sum()

# 输出结果
print(f"\n✅ 查找完成，统计如下：")
print(f" - 缺失 Ticker 的公司数：{missing_ticker}")
print(f" - 缺失 FIGI 的公司数：{missing_figi}")
print(f" - 缺失 ISIN 的公司数：{missing_isin}")
print(f" - 总公司数：{len(company_n)}")



✅ 查找完成，统计如下：
 - 缺失 Ticker 的公司数：130
 - 缺失 FIGI 的公司数：130
 - 缺失 ISIN 的公司数：198
 - 总公司数：198


In [58]:
import os
import pandas as pd
import requests
from dotenv import load_dotenv
from time import sleep
from tqdm import tqdm

# 加载 API Key
load_dotenv()
API_KEY = os.getenv("OPENFIGI_API_KEY")
HEADERS = {
    'Content-Type': 'application/json',
    'X-OPENFIGI-APIKEY': API_KEY
}

# 只读取公司名列（防止加载多余列）
company_list = pd.read_csv("company_metadata.csv", usecols=["company_name"])

# 查询函数
def query_figi_by_name(name):
    url = "https://api.openfigi.com/v3/search"
    payload = {"query": name}
    try:
        response = requests.post(url, headers=HEADERS, json=payload)
        if response.status_code == 200:
            data = response.json()
            results = data.get("data", [])
            if results:
                first_result = results[0]
                ticker = first_result.get("ticker", "")
                composite_figi = first_result.get("compositeFIGI", "")

                # 获取 ISIN
                isin = ""
                security_identifiers = first_result.get("securityIdentifiers", [])
                for ident in security_identifiers:
                    if ident.get("idType") == "ID_ISIN":
                        isin = ident.get("identifier", "")
                        break

                return ticker, composite_figi, isin
        return "", "", ""
    except Exception as e:
        print(f"Error querying {name}: {e}")
        return "", "", ""

# 批量查找
tickers, figis, isins = [], [], []

print("⏳ 正在查找公司 Ticker、FIGI 和 ISIN ...")
for name in tqdm(company_list['company_name'], desc="Querying OpenFIGI"):
    ticker, figi, isin = query_figi_by_name(name)
    tickers.append(ticker)
    figis.append(figi)
    isins.append(isin)
    sleep(0.7)

# 构造新 DataFrame
result_df = pd.DataFrame({
    "company_name": company_list['company_name'],
    "ticker": tickers,
    "figi": figis,
    "isin": isins
})




⏳ 正在查找公司 Ticker、FIGI 和 ISIN ...


Querying OpenFIGI: 100%|██████████| 198/198 [03:30<00:00,  1.06s/it]


In [61]:
# 保存结果
result_df.to_csv("company_figi_result.csv", index=True)

# 缺失值统计
missing_ticker = (result_df['ticker'].astype(str).str.strip() == "").sum()
missing_figi = (result_df['figi'].astype(str).str.strip() == "").sum()
missing_isin = (result_df['isin'].astype(str).str.strip() == "").sum()

print(f"\n✅ 查找完成，统计如下：")
print(f" - 缺失 Ticker 的公司数：{missing_ticker}")
print(f" - 缺失 FIGI 的公司数：{missing_figi}")
print(f" - 缺失 ISIN 的公司数：{missing_isin}")
print(f" - 总公司数：{len(result_df)}")


✅ 查找完成，统计如下：
 - 缺失 Ticker 的公司数：128
 - 缺失 FIGI 的公司数：128
 - 缺失 ISIN 的公司数：198
 - 总公司数：198


In [62]:
import pandas as pd

# 读取两个文件
figi_df = pd.read_csv("company_figi_result.csv")
yahoo_df = pd.read_csv("company_metadata.csv")

# 只保留公司名和 ticker 列
figi_df = figi_df[["company_name", "ticker"]].rename(columns={"ticker": "figi_ticker"})
yahoo_df = yahoo_df[["company_name", "ticker"]].rename(columns={"ticker": "yahoo_ticker"})

# 合并两个来源的数据
merged = pd.merge(figi_df, yahoo_df, on="company_name", how="outer")

# 比较是否一致
merged["ticker_match"] = merged["figi_ticker"].fillna("").str.strip().str.upper() == \
                         merged["yahoo_ticker"].fillna("").str.strip().str.upper()

# 显示冲突项（不一致且两个都不为空）
conflicts = merged[
    (~merged["ticker_match"]) &
    (merged["figi_ticker"].notna()) & (merged["figi_ticker"].astype(str).str.strip() != "") &
    (merged["yahoo_ticker"].notna()) & (merged["yahoo_ticker"].astype(str).str.strip() != "")
]

# 输出冲突结果
print(f"\n🔍 发现 ticker 冲突的公司数：{len(conflicts)}")
print(conflicts[["company_name", "figi_ticker", "yahoo_ticker"]].head())

# 也可以保存冲突表
conflicts.to_csv("ticker_conflicts.csv", index=False)



🔍 发现 ticker 冲突的公司数：61
        company_name                figi_ticker yahoo_ticker
0             1&1 AG                        DRI         SB=F
1       3I Group PLC  IIILN 6.875 03/09/23 EMTN        TGOPY
2  AIA Group Limited                      13469       7A20.F
3          AT&T Corp        T 7.3 11/15/11 REGS            T
4           AT&T Inc             T 5.1 09/15/14            T


# Financial Table

# use CIK to search in SEC EDGAR

In [66]:
import pandas as pd
import json
from difflib import get_close_matches

# 1. 加载 SEC 提供的 Ticker→CIK 数据
with open("/Users/lori/IRP/company_tickers.json", 'r') as f:
    sec_data = json.load(f)

# 2. 整理成 DataFrame（键是 CIK）
sec_records = []
for cik_str, info in sec_data.items():
    sec_records.append({
        'cik': str(cik_str).zfill(10),
        'ticker': info.get('ticker', '').upper(),
        'sec_name': info.get('title', '').strip().lower()
    })

sec_df = pd.DataFrame(sec_records)

# 3. 读取你的公司元数据 CSV（替换成你用的那个）
your_df = pd.read_csv("company_metadata.csv")
your_df['ticker'] = your_df['ticker'].astype(str).str.upper()
your_df['company_name'] = your_df['company_name'].astype(str).str.lower()

# 4. 通过 ticker 匹配
merged_df = pd.merge(
    your_df, sec_df,
    how='left',
    on='ticker'
)

# 5. 如果没有通过 ticker 匹配，再尝试公司名模糊匹配
for idx, row in merged_df[merged_df['cik'].isna()].iterrows():
    name = row['company_name']
    match = get_close_matches(name, sec_df['sec_name'], n=1, cutoff=0.85)
    if match:
        matched_row = sec_df[sec_df['sec_name'] == match[0]].iloc[0]
        merged_df.at[idx, 'cik'] = matched_row['cik']

# 6. 提取需要的列
final_df = merged_df[['company_name', 'ticker', 'cik']]




In [67]:
# 7. 保存为新文件
output_path = "company_with_cik.csv"
final_df.to_csv(output_path, index=False)
print(f" finished: CIK saved to {output_path}")

# 8. 统计匹配情况
missing = final_df['cik'].isna().sum()
print(f"🔍 no. of missing CIK :{missing} / {len(final_df)}")

 finished: CIK saved to company_with_cik.csv
🔍 no. of missing CIK :103 / 198


below are 2021-2025 data

ticker，year，total revenue,source

In [None]:
import pandas as pd
import yfinance as yf
import time

# 读取公司表
df_meta = pd.read_csv("company_metadata.csv")

# 创建空列表保存结果
revenue_data = []

# 遍历每个 ticker
for i, row in df_meta.iterrows():
    ticker = row.get("ticker")
    company_id = row.get("company_id")

    if pd.isna(ticker):
        continue  # 没有 ticker 的跳过

    try:
        yf_ticker = yf.Ticker(ticker)
        financials = yf_ticker.financials.T  # 年份为行，指标为列
        financials.reset_index(inplace=True)
        financials.rename(columns={"index": "Fiscal Year"}, inplace=True)

        for _, fin_row in financials.iterrows():
            fiscal_year = str(fin_row["Fiscal Year"])[:4]
            revenue = fin_row.get("Total Revenue")
            if pd.notna(revenue):
                revenue_data.append({
                    "company_id": company_id,
                    "ticker": ticker,
                    "fiscal_year": fiscal_year,
                    "total_revenue": revenue,
                    "source": "yfinance"
                })

        print(f"{i+1}/{len(df_meta)}: {ticker} ✓")
        time.sleep(0.5)  # 控速防止被封 IP
    except Exception as e:
        print(f"{i+1}/{len(df_meta)}: {ticker} ❌ Error: {e}")

# 转为 DataFrame
df_revenue = pd.DataFrame(revenue_data)

# 保存为 CSV
df_revenue.to_csv("company_revenue_yfinance.csv", index=False)


1/198: SB=F ✓
2/198: TGOPY ✓
3/198: AGOAF ✓
4/198: 7A20.F ✓
5/198: AC.TO ✓
6/198: AIXAD.XC ✓
7/198: FKM.F ✓
8/198: AKSO.OL ✓
9/198: AMUN.VI ✓
10/198: FG1.F ✓
11/198: FG1.F ✓
12/198: ARRD.HM ✓
13/198: ARRD.HM ✓
14/198: 0RUH.IL ✓
15/198: AZN ✓
16/198: MKL.F ✓
17/198: AEXAD ✓
18/198: T ✓
19/198: T ✓
20/198: QRL.F ✓
21/198: AV.L ✓
22/198: SAN ✓
23/198: BSBR ✓
24/198: 7832.T ✓
25/198: ABR.BE ✓
26/198: BCE.TO ✓
27/198: BCHEY ✓
28/198: BTG4.F ✓
29/198: BTG4.F ✓
34/198: BSX ✓
35/198: R1H.F ✓
36/198: BUHA.F ✓
37/198: SKVB.F ✓
38/198: CTYA.F ✓
39/198: CPCAF ✓
40/198: CWC.F ✓
41/198: CGF.AX ✓
42/198: GK9.F ✓
43/198: CVX ✓
44/198: CVX ✓
45/198: 2610.TW ✓
46/198: CGMBF ✓
48/198: C ✓
50/198: CBH.F ✓
51/198: CBHD.SG ✓
52/198: ED ✓
53/198: FCG.NZ ✓
54/198: 9FM.F ✓
55/198: CRH ✓
56/198: CRH ✓
57/198: 6CMB.F ✓
58/198: 6CMB.F ✓
59/198: CMW.AX ✓
63/198: DLN.F ✓
64/198: DB ✓
65/198: DTEGY ✓
66/198: DEO ✓
67/198: DEO ✓
68/198: DS5.F ✓
69/198: GIL.VI ✓
70/198: D ✓
71/198: D ✓
72/198: KABN.F ✓
73/198: DNE.F ✓

有ticker,财年，total revenue, net income,net income margin,r&d,total liability, long term debt,cash, capex,source

In [None]:
import pandas as pd
import yfinance as yf
import time

# 读取公司表
df_meta = pd.read_csv("company_metadata.csv")

# 创建空列表保存结果
revenue_data = []

for i, row in df_meta.iterrows():
    ticker = row.get("ticker")
    company_id = row.get("company_id")

    if pd.isna(ticker):
        continue  # 没有 ticker 的跳过

    try:
        yf_ticker = yf.Ticker(ticker)

        # 提取利润表、资产负债表、现金流量表
        financials = yf_ticker.financials.T
        balance_sheet = yf_ticker.balance_sheet.T
        cashflow = yf_ticker.cashflow.T

        # 重命名时间列
        financials.reset_index(inplace=True)
        balance_sheet.reset_index(inplace=True)
        cashflow.reset_index(inplace=True)
        financials.rename(columns={"index": "Fiscal Year"}, inplace=True)
        balance_sheet.rename(columns={"index": "Fiscal Year"}, inplace=True)
        cashflow.rename(columns={"index": "Fiscal Year"}, inplace=True)

        for _, fin_row in financials.iterrows():
            fiscal_year = str(fin_row["Fiscal Year"])[:4]

            revenue = fin_row.get("Total Revenue")
            net_income = fin_row.get("Net Income")
            r_and_d = fin_row.get("Research Development")

            # 派生字段：利润率
            net_income_margin = net_income / revenue if pd.notna(revenue) and revenue != 0 else None

            # 对应年份资产负债表数据
            bs_row = balance_sheet[balance_sheet["Fiscal Year"].astype(str).str.startswith(fiscal_year)]
            total_liabilities = long_term_debt = cash = None
            if not bs_row.empty:
                bs = bs_row.iloc[0]
                total_liabilities = bs.get("Total Liab")
                long_term_debt = bs.get("Long Term Debt")
                cash = bs.get("Cash And Cash Equivalents")

            # 对应年份现金流数据
            cf_row = cashflow[cashflow["Fiscal Year"].astype(str).str.startswith(fiscal_year)]
            capex = None
            if not cf_row.empty:
                capex = cf_row.iloc[0].get("Capital Expenditures")

            # 添加数据
            if pd.notna(revenue):
                revenue_data.append({
                    "company_id": company_id,
                    "ticker": ticker,
                    "fiscal_year": fiscal_year,
                    "total_revenue": revenue,
                    "net_income": net_income,
                    "net_income_margin": net_income_margin,
                    "r_and_d": r_and_d,
                    "total_liabilities": total_liabilities,
                    "long_term_debt": long_term_debt,
                    "cash_and_cash_equivalents": cash,
                    "capex": capex,
                    "source": "yfinance"
                })

        print(f"{i+1}/{len(df_meta)}: {ticker} ✓")
        time.sleep(0.5)

    except Exception as e:
        print(f"{i+1}/{len(df_meta)}: {ticker} ❌ Error: {e}")

# 转为 DataFrame
df_revenue = pd.DataFrame(revenue_data)

# 保存为 CSV
df_revenue.to_csv("company_revenue_extended.csv", index=False)


1/198: SB=F ✓
2/198: TGOPY ✓
3/198: AGOAF ✓
4/198: 7A20.F ✓
5/198: AC.TO ✓
6/198: AIXAD.XC ✓
7/198: FKM.F ✓
8/198: AKSO.OL ✓
9/198: AMUN.VI ✓
10/198: FG1.F ✓
11/198: FG1.F ✓
12/198: ARRD.HM ✓
13/198: ARRD.HM ✓
14/198: 0RUH.IL ✓
15/198: AZN ✓
16/198: MKL.F ✓
17/198: AEXAD ✓
18/198: T ✓
19/198: T ✓
20/198: QRL.F ✓
21/198: AV.L ✓
22/198: SAN ✓
23/198: BSBR ✓
24/198: 7832.T ✓
25/198: ABR.BE ✓
26/198: BCE.TO ✓
27/198: BCHEY ✓
28/198: BTG4.F ✓
29/198: BTG4.F ✓
34/198: BSX ✓
35/198: R1H.F ✓
36/198: BUHA.F ✓
37/198: SKVB.F ✓
38/198: CTYA.F ✓
39/198: CPCAF ✓
40/198: CWC.F ✓
41/198: CGF.AX ✓
42/198: GK9.F ✓
43/198: CVX ✓
44/198: CVX ✓
45/198: 2610.TW ✓
46/198: CGMBF ✓
48/198: C ✓
50/198: CBH.F ✓
51/198: CBHD.SG ✓
52/198: ED ✓
53/198: FCG.NZ ✓
54/198: 9FM.F ✓
55/198: CRH ✓
56/198: CRH ✓
57/198: 6CMB.F ✓
58/198: 6CMB.F ✓
59/198: CMW.AX ✓
63/198: DLN.F ✓
64/198: DB ✓
65/198: DTEGY ✓
66/198: DEO ✓
67/198: DEO ✓
68/198: DS5.F ✓
69/198: GIL.VI ✓
70/198: D ✓
71/198: D ✓
72/198: KABN.F ✓
73/198: DNE.F ✓

There is no data below, the table is empty, and 2017-2019 does not seem to be available from yfiannce

In [None]:
import pandas as pd
import yfinance as yf
import time

# 读取公司表
df_meta = pd.read_csv("company_metadata.csv")

# 创建空列表保存结果
revenue_data = []

# 遍历每个 ticker
for i, row in df_meta.iterrows():
    ticker = row.get("ticker")
    company_id = row.get("company_id")

    if pd.isna(ticker):
        continue  # 没有 ticker 的跳过

    try:
        yf_ticker = yf.Ticker(ticker)
        financials = yf_ticker.financials.T  # 年份为行，指标为列
        financials.reset_index(inplace=True)
        financials.rename(columns={"index": "Fiscal Year"}, inplace=True)

        for _, fin_row in financials.iterrows():
            fiscal_year = str(fin_row["Fiscal Year"])[:4]
            if fiscal_year not in ['2017', '2018', '2019']:
                continue
            revenue = fin_row.get("Total Revenue")
            if pd.notna(revenue):
                revenue_data.append({
                    "company_id": company_id,
                    "ticker": ticker,
                    "fiscal_year": fiscal_year,
                    "total_revenue": revenue,
                    "source": "yfinance"
                })

        print(f"{i+1}/{len(df_meta)}: {ticker} ✓")
        time.sleep(0.5)  # 控速防止被封 IP
    except Exception as e:
        print(f"{i+1}/{len(df_meta)}: {ticker}  Error: {e}")

# 转为 DataFrame
df_revenue = pd.DataFrame(revenue_data)

# 保存为 CSV
df_revenue.to_csv("company_revenue.csv", index=False)


1/198: SB=F ✓
2/198: TGOPY ✓
3/198: AGOAF ✓
4/198: 7A20.F ✓
5/198: AC.TO ✓
6/198: AIXAD.XC ✓
7/198: FKM.F ✓
8/198: AKSO.OL ✓
9/198: AMUN.VI ✓
10/198: FG1.F ✓
11/198: FG1.F ✓
12/198: ARRD.HM ✓
13/198: ARRD.HM ✓
14/198: 0RUH.IL ✓
15/198: AZN ✓
16/198: MKL.F ✓
17/198: AEXAD ✓
18/198: T ✓
19/198: T ✓
20/198: QRL.F ✓
21/198: AV.L ✓
22/198: SAN ✓
23/198: BSBR ✓
24/198: 7832.T ✓
25/198: ABR.BE ✓
26/198: BCE.TO ✓
27/198: BCHEY ✓
28/198: BTG4.F ✓
29/198: BTG4.F ✓
34/198: BSX ✓
35/198: R1H.F ✓
36/198: BUHA.F ✓
37/198: SKVB.F ✓
38/198: CTYA.F ✓
39/198: CPCAF ✓
40/198: CWC.F ✓
41/198: CGF.AX ✓
42/198: GK9.F ✓
43/198: CVX ✓
44/198: CVX ✓
45/198: 2610.TW ✓
46/198: CGMBF ✓
48/198: C ✓
50/198: CBH.F ✓
51/198: CBHD.SG ✓
52/198: ED ✓
53/198: FCG.NZ ✓
54/198: 9FM.F ✓
55/198: CRH ✓
56/198: CRH ✓
57/198: 6CMB.F ✓
58/198: 6CMB.F ✓
59/198: CMW.AX ✓
63/198: DLN.F ✓
64/198: DB ✓
65/198: DTEGY ✓
66/198: DEO ✓
67/198: DEO ✓
68/198: DS5.F ✓
69/198: GIL.VI ✓
70/198: D ✓
71/198: D ✓
72/198: KABN.F ✓
73/198: DNE.F ✓

In [None]:
import yfinance as yf

ticker = yf.Ticker("AAPL")  # 替换为你感兴趣的 ticker
financials = ticker.financials.T
print(financials.index)  # 打印所有“Fiscal Year”


DatetimeIndex(['2024-09-30', '2023-09-30', '2022-09-30', '2021-09-30',
               '2020-09-30'],
              dtype='datetime64[ns]', freq=None)


In [None]:
ticker = yf.Ticker("AAPL")
income_stmt = ticker.income_stmt.T
print(income_stmt)


           Tax Effect Of Unusual Items Tax Rate For Calcs Normalized EBITDA  \
2024-09-30                         0.0              0.241    134661000000.0   
2023-09-30                         0.0              0.147    125820000000.0   
2022-09-30                         0.0              0.162    130541000000.0   
2021-09-30                         0.0              0.133    123136000000.0   
2020-09-30                         NaN                NaN               NaN   

           Net Income From Continuing Operation Net Minority Interest  \
2024-09-30                                      93736000000.0           
2023-09-30                                      96995000000.0           
2022-09-30                                      99803000000.0           
2021-09-30                                      94680000000.0           
2020-09-30                                                NaN           

           Reconciled Depreciation Reconciled Cost Of Revenue          EBITDA  \
2024-

# L

In [None]:
# 读取 parquet 文件
df1 = pd.read_parquet("../OneDrive_1_2025-6-9/compustat_metadata.parquet")


# 显示前几行
print(df1.head())


    gvkey            ISIN      SEDOL        cik        cusip  \
0  315022  [KR7104540000]  [B94CCB9]         []           []   
1  317960  [KYG887641085]  [BNLPLJ0]         []           []   
2    8085              []         []    [53495]  [675734107]   
3  309887  [INE122M01019]  [B4JYZ27]         []           []   
4   29399              []         []  [1655099]  [112830104]   

                                               conml            GICS_level_1  \
0                                  [CORENTEC CO LTD]             Health Care   
1                        [TIAN GE INTERACTIVE HLDGS]  Communication Services   
2                   [October Oil Co, OCTOBER OIL CO]                    None   
3                               [ESSAR SHIPPING LTD]                  Energy   
4  [Brookfield Real Assets Income Fund Inc, BROOK...                    None   

                       GICS_level_2                      GICS_level_3  loc  \
0  Health Care Equipment & Services  Health Care Equipme

In [None]:
import pandas as pd
import numpy as np
import ast

def extract_first_isin(x):
    # 先判断空值（仅适用于标量，不是数组）
    if isinstance(x, str):
        x = x.strip()
        if x == "":
            return None
        try:
            x_eval = ast.literal_eval(x)
            if isinstance(x_eval, (list, np.ndarray)) and len(x_eval) > 0:
                return x_eval[0]
        except:
            return None

    elif isinstance(x, (list, np.ndarray)):
        if len(x) == 0:
            return None
        return x[0]

    elif pd.isna(x):
        return None

    return None


In [None]:
df1["ISIN_cleaned"] = df1["ISIN"].apply(extract_first_isin)

# 统计
total = len(df1)
non_missing = df1["ISIN_cleaned"].notna().sum()
missing = df1["ISIN_cleaned"].isna().sum()

print(f"Total rows: {total}")
print(f"With ISIN: {non_missing}")
print(f"Missing ISIN: {missing}")
print(f"Missing ratio: {missing/total:.2%}")


Total rows: 104680
With ISIN: 59733
Missing ISIN: 44947
Missing ratio: 42.94%


In [None]:

# 读取 parquet 文件
df = pd.read_parquet("../OneDrive_1_2025-6-9/json_metadata.parquet")


# 显示前几行
print(df.head())


                                      pdf_local_path  \
0  /srv/data/mrei/davinci/davinci_data_v11_2022_u...   
1  /srv/data/mrei/davinci/davinci_data_v11_2022_u...   
2  /srv/data/mrei/davinci/davinci_data_v11_2022_u...   
3  /srv/data/mrei/davinci/davinci_data_v11_2022_u...   
4  /srv/data/mrei/davinci/davinci_data_v11_2022_u...   

                             pdf_local_path_relative  \
0  000009/Tadano_Ltd_integrated_report2021_en_7_g...   
1  000009/Tadano_Ltd_integrated_report2022_en_for...   
2  000009/Taikisha_Ltd_pdf-index-2021-01_7r58ksqc...   
3  000009/Taikisha_Ltd_pdf-index-2022-02_nllsl2qx...   
4  000009/Takasago_International_Corp_Takasago20S...   

                                              pdfurl pdf_source_path  \
0  https://www.tadano.com/upload/docs/integrated_...            None   
1  https://www.tadano.com/upload/docs/integrated_...            None   
2  https://www.taikisha-group.com/sustainability/...            None   
3  https://www.taikisha-group.com/sust

In [None]:
import pandas as pd

# 读取 parquet 文件
df = pd.read_parquet("../OneDrive_1_2025-6-9/lseg_more_metadata.parquet")


# 显示前几行
print(df.head())


   Instrument  TR.MIC TR.ExchangeCode  TR.CompanyMarketCap  \
0  LP68841490     NaN             LIP                  NaN   
1  LP68841491     NaN             LIP                  NaN   
2  LP68841495     NaN             LIP                  NaN   
3  LP68841496     NaN             LIP                  NaN   
4  LP68841497     NaN             LIP                  NaN   

                                    TR.CommonName TR.GICSSector  \
0  Desjardins Target 2026 Investment Grade Bond C          None   
1  Desjardins Target 2026 Investment Grade Bond C          None   
2  Desjardins Target 2027 Investment Grade Bond C          None   
3  Desjardins Target 2027 Investment Grade Bond C          None   
4  Desjardins Target 2027 Investment Grade Bond C          None   

  TR.GICSIndustryGroup TR.GICSIndustry TR.BusinessSummary TR.AlsoKnownAsName  \
0                 None            None               None               None   
1                 None            None               None       

In [None]:
import pandas as pd

# 读取 parquet 文件
df = pd.read_parquet("../OneDrive_1_2025-6-9/N3u_gpt_without_text.parquet")


# 显示前几行
print(df.head())


                    md5_fingerprint  \
0  3014d9f1056be7c44a1a493c806a57f4   
1  9c74edfc0053e2d55966b14b2b8f719a   
2  9c74edfc0053e2d55966b14b2b8f719a   
3  45bc567c0a633a6464c4b0fd820a6127   
4  8a0c411368909088c724f493fcca1101   

                                      pdf_local_path  \
0  /srv/data/mrei/davinci/davinci_data_v6_crawl/p...   
1  /srv/data/mrei/davinci/davinci_data_v8_public_...   
2  /srv/data/mrei/davinci/davinci_data_v11_2022_u...   
3  /srv/data/mrei/davinci/davinci_data_v6_crawl/p...   
4  /srv/data/mrei/davinci/davinci_data_v6_crawl/p...   

                             pdf_local_path_relative  \
0  000110/Unknown_financial-economic-review-1998_...   
1             000033/Unknown_5055330315_u5l5tbnz.pdf   
2             000033/Unknown_5055330315_ijjv3k4d.pdf   
3         000014/Unknown_2004-0138-2967_3cfjjajt.pdf   
4                000149/Unknown_Block-3_c2hw2oxq.pdf   

                                              pdfurl pdf_source_path  \
0  https://www2.gov

In [None]:
import pandas as pd

# 读取 parquet 文件
df = pd.read_parquet("../OneDrive_1_2025-6-9/symbols_table.parquet")


# 显示前几行
print(df.head())


                                       DocumentTitle         RIC  \
0  LUX IM European Equities DXL EUR, Open-End Fun...  LP65032832   
1  LUX IM Pictet Asian Equities DXL EUR, Open-End...  LP65032835   
2  LUX IM Morgan Stanley US Equities DXL EUR, Ope...  LP65032839   
3  LUX IM Pictet Asian Equities DXBL EUR, Open-En...  LP65032840   
4  JPM Europe Strategic Value I Dist EUR, Open-En...  LP65032853   

      IssueISIN    SEDOL FundClassLipperID TickerSymbol  Instrument CUSIP  \
0  LU0255269960     None          65032832         None  LP65032832  None   
1  LU0255273566     None          65032835         None  LP65032835  None   
2  LU0255272089     None          65032839         None  LP65032839  None   
3  LU0255236019     None          65032840         None  LP65032840  None   
4  LU0248049685  B11Z367          65032853         None  LP65032853  None   

  IssuerOAPermID TR.HQCountryCode  TR.InstrumentIsActive  \
0           None               LU                   True   
1       