In [12]:
import os
import pandas as pd

def get_taq_link(db, permno_list=None, year=None, start_year=2000, min_date="2024-01-01"):
    """
    Get TAQ linking table to link daily TAQ to CRSP.

    Every filter argument is turned into a bound predicate that is actually
    placed in the query's WHERE clause. (The predicates used to be built and
    then discarded in favour of a hard-coded date, so the arguments had no
    effect.)

    Parameters
    ----------
    db : object
        WRDS connection with a .raw_sql() method.
    permno_list : iterable[int] | None
        PERMNOs to filter on. If None or empty, returns all PERMNOs.
    year : int | 'all' | None
        If int, restricts to that calendar year (YYYY).
        If 'all' or None, no single-year filter is applied.
    start_year : int | None
        If set, restricts to dates >= January 1 of start_year.
        Not applied when `year` is an int.
    min_date : str | None
        ISO date (YYYY-MM-DD) applied as an unconditional lower bound on
        a.date. The default is the bound that used to be hard-coded in the
        SQL, so get_taq_link(db) returns the same rows as before. Pass None
        to drop the bound and rely on year / start_year only.

    Returns
    -------
    pandas.DataFrame
        Result of db.raw_sql for the query. Selected columns: permno, date,
        cusip, sym_root, sym_suffix, ordered by date.
    """
    # Ensure required directories exist (optional)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/crsp', exist_ok=True)

    # Build predicates and parameters safely: values travel in `params`,
    # only placeholder names are interpolated into the SQL text.
    where_clauses = []
    params = {}

    if min_date is not None:
        params["min_date"] = str(min_date)
        where_clauses.append("a.date >= CAST(%(min_date)s AS DATE)")

    # Materialise first: arrays / Series have no unambiguous truth value.
    permnos = [int(p) for p in permno_list] if permno_list is not None else []
    if permnos:
        # One named placeholder per PERMNO: %(p0)s, %(p1)s, ...
        placeholders = []
        for i, permno in enumerate(permnos):
            key = f"p{i}"
            params[key] = permno
            placeholders.append(f"%({key})s")
        where_clauses.append(f"a.permno IN ({', '.join(placeholders)})")

    if isinstance(year, int):
        params["y_start"] = f"{year}-01-01"
        params["y_end"] = f"{year}-12-31"
        where_clauses.append(
            "a.date BETWEEN CAST(%(y_start)s AS DATE) AND CAST(%(y_end)s AS DATE)"
        )
    elif start_year is not None:
        # Only apply start_year if you didn't already pin to a specific year
        params["start_year"] = f"{start_year}-01-01"
        where_clauses.append("a.date >= CAST(%(start_year)s AS DATE)")

    # Default WHERE to true if no filters provided
    where_sql = "\n            AND ".join(where_clauses) if where_clauses else "1=1"

    sql = f"""
        SELECT
            a.permno,
            a.date,
            a.cusip,
            a.sym_root,
            a.sym_suffix
        FROM wrdsapps.taqmclink a
        WHERE
            {where_sql}
        ORDER BY a.date;
    """

    return db.raw_sql(sql, params=params)


In [2]:
import os
import dotenv
import pandas as pd

import wrds

def connect_wrds(username, password):
    """
    Open a WRDS connection and report progress on stdout.

    Parameters
    ----------
    username : str
        WRDS account name; it is echoed in the progress message.
    password : str
        WRDS password, passed straight to wrds.Connection.

    Returns
    -------
    wrds.Connection
        Connection created with use_keyring=False.
    """
    print(f"Connecting to WRDS with username: {username}")
    connection = wrds.Connection(
        wrds_username=username,
        wrds_password=password,
        use_keyring=False,
    )
    print("Connected to WRDS!")
    return connection


# Load environment variables via python-dotenv before the credentials are read below.
dotenv.load_dotenv()


# hyperparameters
# NOTE(review): FACTOR_PATH is not referenced by any cell visible in this notebook.
FACTOR_PATH = 'data/factors'

# connect to db
# Credentials come from the WRDS_USERNAME / WRDS_PASSWORD environment variables;
# os.getenv returns None for a variable that is not set.
db = connect_wrds(username=os.getenv("WRDS_USERNAME"), password=os.getenv("WRDS_PASSWORD"))

Connecting to WRDS with username: andrekraemer
Loading library list...
Done
Connected to WRDS!


In [14]:
taq_link = get_taq_link(db)
taq_link.head()

Unnamed: 0,permno,date,cusip,sym_root,sym_suffix
0,22563,2024-01-02,45827K10,INTE,
1,92220,2024-01-02,57479510,MASI,
2,24090,2024-01-02,25365120,DBD,
3,84319,2024-01-02,53679710,LAD,
4,89900,2024-01-02,87162W10,SNX,


In [52]:
# Change date to datetime
taq_link['date'] = pd.to_datetime(taq_link['date'], format='%Y-%m-%d', errors='coerce')

In [59]:
# Load TAQ data
df = pd.read_csv("data/taqtrades2024.csv")
print(len(df))
df.head()

2426538


Unnamed: 0,date,SYM_ROOT,SYM_SUFFIX,nb,ns,sb,ss,vb,vs
0,02JAN2024,A,,719,644,31347,27518,4361987.0,3825107.0
1,02JAN2024,AA,,1379,1387,206068,202722,6905840.0,6791301.0
2,02JAN2024,AAA,,3,5,442,153,11075.47,3825.193
3,02JAN2024,AAAU,,256,159,189555,216372,3872967.0,4421058.0
4,02JAN2024,AACG,,16,12,1539,1325,1711.869,1437.439


In [None]:
# No empty rows
num_rows_all_na = df.isna().all(axis=1).sum()
print(num_rows_all_na)

0


In [71]:
# Convert TAQ date
# Raw values look like 02JAN2024 (day, abbreviated month, year); anything that
# does not fit the format is coerced to NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], format='%d%b%Y', errors='coerce')

# Change column names
# Lower-case the symbol columns so they carry the same names as the
# corresponding taq_link columns used as merge keys.
df.rename(columns={'SYM_ROOT': 'sym_root', 'SYM_SUFFIX': 'sym_suffix'}, inplace=True)
df.head()


Unnamed: 0,date,sym_root,sym_suffix,nb,ns,sb,ss,vb,vs
0,2024-01-02,A,,719,644,31347,27518,4361987.0,3825107.0
1,2024-01-02,AA,,1379,1387,206068,202722,6905840.0,6791301.0
2,2024-01-02,AAA,,3,5,442,153,11075.47,3825.193
3,2024-01-02,AAAU,,256,159,189555,216372,3872967.0,4421058.0
4,2024-01-02,AACG,,16,12,1539,1325,1711.869,1437.439


In [None]:
# Link TAQ with CRSP link table
# Inner join on (date, sym_root, sym_suffix): TAQ rows without a link-table entry
# for that date and symbol are dropped, and each surviving row gains a permno.
# NOTE(review): validate='many_to_many' performs no uniqueness check. If every
# (date, symbol) is expected to map to a single PERMNO, 'many_to_one' would enforce
# that; confirm the link table is unique on these keys before tightening.
# NOTE(review): sym_suffix is empty for most rows on both sides; the resulting row
# count suggests missing suffixes are matched to each other. Confirm this is intended.
linked_table = df.merge(
    taq_link[['date', 'sym_root', 'sym_suffix', 'permno']],
    on=['date', 'sym_root', 'sym_suffix'],
    how='inner',
    validate='many_to_many'
)

In [None]:
# Get number of rows left
print(len(linked_table))
linked_table.head(10)

1538287


Unnamed: 0,date,sym_root,sym_suffix,nb,ns,sb,ss,vb,vs,permno
0,2024-01-02,A,,719,644,31347,27518,4361987.0,3825107.0,87432
1,2024-01-02,AA,,1379,1387,206068,202722,6905840.0,6791301.0,16347
2,2024-01-02,AAA,,3,5,442,153,11075.47,3825.193,23483
3,2024-01-02,AACI,,2,0,100,0,1079.95,0.0,22271
4,2024-01-02,AADI,,113,131,26505,47382,54978.34,97274.73,17869
5,2024-01-02,AADR,,4,4,29,121,1598.49,6644.095,10113
6,2024-01-02,AAGR,,114,57,39412,26991,43048.45,28279.92,22253
7,2024-01-02,AAL,,5086,3881,2956421,2128543,40037420.0,28831910.0,21020
8,2024-01-02,AAMC,,7,10,42,159,173.4055,643.2273,14127
9,2024-01-02,AAME,,16,9,4042,2114,9835.415,5086.436,15580


In [None]:
# Sanity Check. ETFs are excluded
print('AAAU' in df['sym_root'].values)
print('AAAU' in taq_link['sym_root'].values)
print('AAAU' in linked_table['sym_root'].values)

True
False
False


In [None]:
##############################################

# Now: Get IBES data

##############################################

In [5]:
# Load IBES data
import os
import pandas as pd

def get_ibes(db, permno_list=None, year=None, start_year=2000, min_date="2023-12-31"):
    """
    Get IBES price target data (detail file ibes.ptgdet).

    Date filters are applied to the announcement date (a.anndats) through
    bound parameters. (The predicates used to be built and then discarded in
    favour of a hard-coded date, and they referenced columns the table does
    not have.)

    Parameters
    ----------
    db : object
        WRDS connection with a .raw_sql() method.
    permno_list : iterable[int] | None
        Not applied: ibes.ptgdet carries no permno column. A non-empty value
        triggers a UserWarning; filter on PERMNO after linking to CRSP.
    year : int | 'all' | None
        If int, restricts anndats to that calendar year (YYYY).
        If 'all' or None, no single-year filter is applied.
    start_year : int | None
        If set, restricts to anndats >= January 1 of start_year.
        Not applied when `year` is an int.
    min_date : str | None
        ISO date (YYYY-MM-DD) applied as an unconditional lower bound on
        anndats. The default is the bound that used to be hard-coded in the
        SQL, so get_ibes(db) returns the same rows as before. Pass None to
        drop the bound and rely on year / start_year only.

    Returns
    -------
    pandas.DataFrame
        Result of db.raw_sql: all columns of ibes.ptgdet, ordered by anndats.
    """
    import warnings

    # Ensure required directories exist (optional)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/crsp', exist_ok=True)

    # Make the unsupported filter visible instead of silently dropping it.
    ignored_permnos = list(permno_list) if permno_list is not None else []
    if ignored_permnos:
        warnings.warn(
            "permno_list is ignored by get_ibes: ibes.ptgdet has no permno "
            "column; link to CRSP first and filter afterwards.",
            UserWarning,
            stacklevel=2,
        )

    # Build predicates and parameters safely: values travel in `params`,
    # only placeholder names are interpolated into the SQL text.
    where_clauses = []
    params = {}

    if min_date is not None:
        params["min_date"] = str(min_date)
        where_clauses.append("a.anndats >= CAST(%(min_date)s AS DATE)")

    if isinstance(year, int):
        params["y_start"] = f"{year}-01-01"
        params["y_end"] = f"{year}-12-31"
        where_clauses.append(
            "a.anndats BETWEEN CAST(%(y_start)s AS DATE) AND CAST(%(y_end)s AS DATE)"
        )
    elif start_year is not None:
        # Only apply start_year if you didn't already pin to a specific year
        params["start_year"] = f"{start_year}-01-01"
        where_clauses.append("a.anndats >= CAST(%(start_year)s AS DATE)")

    # Default WHERE to true if no filters provided
    where_sql = "\n            AND ".join(where_clauses) if where_clauses else "1=1"

    sql = f"""
        SELECT
            *
        FROM ibes.ptgdet a
        WHERE
            {where_sql}
        ORDER BY a.anndats;
    """

    return db.raw_sql(sql, params=params)



In [6]:
# Load IBES data
ibes = get_ibes(db)

In [7]:
print(len(ibes))
ibes.head()

486446


Unnamed: 0,ticker,cusip,oftic,cname,actdats,estimid,alysnam,horizon,value,estcur,curr,amaskcd,usfirm,measure,acttims,anndats,anntims
0,0312,64119V30,NTST,NETSTREIT US,2023-12-31,JEFFEREG,TSAI L,12,20.0,USD,USD,122548.0,1,PTG,20:59:26,2023-12-31,20:05:00
1,037I,03750L10,AIRC,APARTMENT INCOME,2023-12-31,JEFFEREG,TSAI L,12,32.0,USD,USD,122548.0,1,PTG,22:08:49,2023-12-31,22:05:00
2,@0199,FIBD0FRL,RBLBAN,RBL BANK,2023-12-31,MERRINTL,SWAMINATHAN A,12,165.0,INR,INR,132074.0,0,PTG,16:08:19,2023-12-31,16:04:00
3,@01FW,FCBZ0797,603345,ANJOY FOODS,2024-01-01,CMCHANT,YU J,6,124.0,CNY,CNY,191504.0,0,PTG,21:13:28,2023-12-31,22:22:00
4,@03A5,FCBMBZ0D,002984,QD SENTURY,2023-12-31,CICC,FU K,12,25.143,CNY,CNY,182056.0,0,PTG,20:34:43,2023-12-31,20:18:00


In [8]:
# Change data format
# horizon becomes numeric and both date columns become datetimes; values that
# cannot be parsed are coerced to NaN / NaT instead of raising.
ibes['horizon'] = pd.to_numeric(ibes['horizon'], errors='coerce')
ibes['anndats'] = pd.to_datetime(ibes['anndats'], format='%Y-%m-%d', errors='coerce')
ibes['actdats'] = pd.to_datetime(ibes['actdats'], format='%Y-%m-%d', errors='coerce')

# Change column name
# Rename cusip to ncusip so the column matches the key name of the IBES-CRSP link table.
ibes.rename(columns={'cusip': 'ncusip'}, inplace=True)

In [10]:
# WRDS recommends keeping only rows whose activation date is on or after the announcement date; the others are errors (0.08%)
ibes = ibes.loc[ibes["actdats"] >= ibes["anndats"]]

In [11]:
# Only keep US firms and only 12m price target forecasts
ibes = ibes.loc[(ibes["usfirm"] == 1) & (ibes["horizon"] == 12)]
ibes.head()

Unnamed: 0,ticker,ncusip,oftic,cname,actdats,estimid,alysnam,horizon,value,estcur,curr,amaskcd,usfirm,measure,acttims,anndats,anntims
0,0312,64119V30,NTST,NETSTREIT US,2023-12-31,JEFFEREG,TSAI L,12,20.0,USD,USD,122548.0,1,PTG,20:59:26,2023-12-31,20:05:00
1,037I,03750L10,AIRC,APARTMENT INCOME,2023-12-31,JEFFEREG,TSAI L,12,32.0,USD,USD,122548.0,1,PTG,22:08:49,2023-12-31,22:05:00
43,ADC,00849210,ADC,AGREE REALTY,2023-12-31,JEFFEREG,TSAI L,12,70.0,USD,USD,122548.0,1,PTG,20:59:27,2023-12-31,20:05:00
44,AMBP,74340W10,PLD,PROLOGIS MD,2023-12-31,JEFFEREG,PETERSEN J,12,157.0,USD,USD,131583.0,1,PTG,23:08:08,2023-12-31,23:05:00
45,AMHH,02665T30,AMH,AMERICAN HOMES 4,2023-12-31,JEFFEREG,TSAI L,12,41.0,USD,USD,122548.0,1,PTG,22:08:50,2023-12-31,22:05:00


In [None]:
##############################################

# Now IBES - CRSP

##############################################

In [12]:
import os
import pandas as pd

def get_ibes_link(db, permno_list=None, year=None, start_year=2000):
    """
    Get IBES linking table to link IBES data to CRSP.

    permno_list and year are applied through bound parameters. (The
    predicates used to be built and then discarded, and they referenced an
    a.date column; the link table stores a validity window in sdate / edate
    instead.)

    Parameters
    ----------
    db : object
        WRDS connection with a .raw_sql() method.
    permno_list : iterable[int] | None
        PERMNOs to filter on. If None or empty, returns all rows (including
        link rows without a PERMNO).
    year : int | 'all' | None
        If int, keeps only links whose [sdate, edate] window overlaps that
        calendar year; a missing edate is treated as still open.
        If 'all' or None, no year filter is applied.
    start_year : int | None
        Accepted for signature compatibility with the other loaders but not
        applied: this function has always returned the full link history, and
        the default of 2000 would otherwise silently drop older link windows.

    Returns
    -------
    pandas.DataFrame
        Result of db.raw_sql: all columns of wrdsapps.ibcrsphist
        (ticker, permno, ncusip, sdate, edate, score).
    """
    # Ensure required directories exist (optional)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/crsp', exist_ok=True)

    # Build predicates and parameters safely: values travel in `params`,
    # only placeholder names are interpolated into the SQL text.
    where_clauses = []
    params = {}

    # Materialise first: arrays / Series have no unambiguous truth value.
    permnos = [int(p) for p in permno_list] if permno_list is not None else []
    if permnos:
        # One named placeholder per PERMNO: %(p0)s, %(p1)s, ...
        placeholders = []
        for i, permno in enumerate(permnos):
            key = f"p{i}"
            params[key] = permno
            placeholders.append(f"%({key})s")
        where_clauses.append(f"a.permno IN ({', '.join(placeholders)})")

    if isinstance(year, int):
        # Window-overlap test: the link starts before the year ends and has
        # not ended before the year starts.
        params["y_start"] = f"{year}-01-01"
        params["y_end"] = f"{year}-12-31"
        where_clauses.append("a.sdate <= CAST(%(y_end)s AS DATE)")
        where_clauses.append(
            "(a.edate IS NULL OR a.edate >= CAST(%(y_start)s AS DATE))"
        )

    # Default WHERE to true if no filters provided
    where_sql = "\n            AND ".join(where_clauses) if where_clauses else "1=1"

    sql = f"""
        SELECT
            *
        FROM wrdsapps.ibcrsphist a
        WHERE
            {where_sql}
    """

    return db.raw_sql(sql, params=params)


In [13]:
# Load ibes link data
ibes_link = get_ibes_link(db)
ibes_link.head(20)

Unnamed: 0,ticker,permno,ncusip,sdate,edate,score
0,0000,14471.0,87482X10,2014-02-20,2016-08-31,1.0
1,0001,14392.0,26878510,2014-02-20,2019-05-22,1.0
2,0001,,,2019-06-20,2023-12-31,6.0
3,0004,14418.0,02504D10,2014-02-20,2018-08-24,1.0
4,000R,14378.0,14163310,2014-02-20,2020-02-10,1.0
5,000V,14423.0,15117E10,2014-03-20,2016-03-22,1.0
6,000V,14423.0,28249U10,2016-04-14,2024-01-07,1.0
7,000V,14423.0,28249U20,2024-01-18,2024-04-10,1.0
8,000V,,,2024-04-18,,6.0
9,000Y,14436.0,90400D10,2014-03-20,2024-12-31,1.0


In [14]:
# Change data format
# Both ends of the link window are parsed as ISO dates (unparseable -> NaT).
ibes_link['sdate'] = pd.to_datetime(ibes_link['sdate'], format='%Y-%m-%d', errors='coerce')
ibes_link['edate'] = (
    pd.to_datetime(ibes_link['edate'], format='%Y-%m-%d', errors='coerce')
    .fillna(pd.Timestamp.today().normalize())  # Fill NA with date of today
)
ibes_link.head(25)

Unnamed: 0,ticker,permno,ncusip,sdate,edate,score
0,0000,14471.0,87482X10,2014-02-20,2016-08-31,1.0
1,0001,14392.0,26878510,2014-02-20,2019-05-22,1.0
2,0001,,,2019-06-20,2023-12-31,6.0
3,0004,14418.0,02504D10,2014-02-20,2018-08-24,1.0
4,000R,14378.0,14163310,2014-02-20,2020-02-10,1.0
5,000V,14423.0,15117E10,2014-03-20,2016-03-22,1.0
6,000V,14423.0,28249U10,2016-04-14,2024-01-07,1.0
7,000V,14423.0,28249U20,2024-01-18,2024-04-10,1.0
8,000V,,,2024-04-18,2025-09-27,6.0
9,000Y,14436.0,90400D10,2014-03-20,2024-12-31,1.0


In [15]:
# Merge IBES with link table. Take into account sdate and edate
# Tag every estimate with its original row label so the duplicates created by
# the merge can be collapsed back to one row per estimate. assign() builds a
# new frame instead of writing into the filtered slice from the earlier cell.
ibes = ibes.assign(__idx=ibes.index)

# Links without a PERMNO can never survive the filter below, so drop them
# before the merge. score is carried along only to rank competing links.
usable_links = ibes_link.loc[
    ibes_link['permno'].notna(),
    ['ncusip', 'permno', 'sdate', 'edate', 'score'],
]
ibes_merged = ibes.merge(usable_links, on='ncusip', how='left')

# keep only exact link-window matches
in_link_window = (
    ibes_merged['permno'].notna()
    & (ibes_merged['anndats'] >= ibes_merged['sdate'])
    & (ibes_merged['anndats'] <= ibes_merged['edate'])
)
exact = ibes_merged.loc[in_link_window].copy()

# When several links fit the same estimate, pick one deterministically:
# lowest score first, then the most recent sdate, using a stable sort.
# NOTE(review): assumes a lower score means a better link (the rows without a
# PERMNO carry the highest score shown); confirm against the ibcrsphist docs.
exact = (
    exact
    .sort_values(['__idx', 'score', 'sdate'], ascending=[True, True, False], kind='mergesort')
    .drop_duplicates('__idx', keep='first')
    .drop(columns='score')
)


In [19]:
print(len(exact))
exact.head(10)

108543


Unnamed: 0,ticker,ncusip,oftic,cname,actdats,estimid,alysnam,horizon,value,estcur,...,amaskcd,usfirm,measure,acttims,anndats,anntims,__idx,permno,sdate,edate
0,0312,64119V30,NTST,NETSTREIT US,2023-12-31,JEFFEREG,TSAI L,12,20.0,USD,...,122548.0,1,PTG,20:59:26,2023-12-31,20:05:00,0,19601,2020-09-17,2024-12-31
1,037I,03750L10,AIRC,APARTMENT INCOME,2023-12-31,JEFFEREG,TSAI L,12,32.0,USD,...,122548.0,1,PTG,22:08:49,2023-12-31,22:05:00,1,20191,2020-12-17,2024-06-27
2,ADC,00849210,ADC,AGREE REALTY,2023-12-31,JEFFEREG,TSAI L,12,70.0,USD,...,122548.0,1,PTG,20:59:27,2023-12-31,20:05:00,43,80412,1994-11-17,2024-12-31
3,AMBP,74340W10,PLD,PROLOGIS MD,2023-12-31,JEFFEREG,PETERSEN J,12,157.0,USD,...,131583.0,1,PTG,23:08:08,2023-12-31,23:05:00,44,85592,2011-06-16,2024-12-31
4,AMHH,02665T30,AMH,AMERICAN HOMES 4,2023-12-31,JEFFEREG,TSAI L,12,41.0,USD,...,122548.0,1,PTG,22:08:50,2023-12-31,22:05:00,45,14061,2013-09-19,2024-12-31
5,AMT2,03027X10,AMT,AMERICAN TOWER,2023-12-31,JEFFEREG,PETERSEN J,12,235.0,USD,...,131583.0,1,PTG,18:08:50,2023-12-31,18:05:00,46,86111,2012-01-19,2024-12-31
6,ARE1,01527110,ARE,ALEXANDRIA REE,2023-12-31,JEFFEREG,ABRAMOWITZ P,12,146.0,USD,...,194509.0,1,PTG,19:08:07,2023-12-31,19:05:00,47,84767,1997-07-17,2024-12-31
7,AVN,05348410,AVB,AVALONBAY US,2023-12-31,JEFFEREG,TSAI L,12,182.0,USD,...,122548.0,1,PTG,22:08:51,2023-12-31,22:05:00,48,80381,1998-10-15,2024-12-31
8,BRXN,11120U10,BRX,BRIXMOR PPTY GP,2023-12-31,JEFFEREG,TSAI L,12,23.0,USD,...,122548.0,1,PTG,21:09:39,2023-12-31,21:05:00,49,14181,2013-11-14,2024-12-31
9,BXP,10112110,BXP,BOSTON PPTY,2023-12-31,JEFFEREG,ABRAMOWITZ P,12,80.0,USD,...,194509.0,1,PTG,17:08:11,2023-12-31,17:05:00,50,85058,1997-07-17,2024-12-31


In [20]:
test = exact.loc[exact["oftic"] == "AAPL"]
test["permno"].unique()

<IntegerArray>
[14593]
Length: 1, dtype: Int64

In [None]:
##############################################

# Now: Get IBES data: Summary data

##############################################

In [1]:
# Load IBES data
import os
import pandas as pd

def get_ibes_summary(db, permno_list=None, year=None, start_year=2000, min_date="2024-12-31"):
    """
    Get consensus IBES price target data (summary file ibes.ptgsum).

    Date filters are applied to the statistical period (a.statpers) through
    bound parameters. (The predicates used to be built and then discarded in
    favour of a hard-coded date, and they referenced columns the table does
    not have.)

    Parameters
    ----------
    db : object
        WRDS connection with a .raw_sql() method.
    permno_list : iterable[int] | None
        Not applied: ibes.ptgsum carries no permno column. A non-empty value
        triggers a UserWarning; filter on PERMNO after linking to CRSP.
    year : int | 'all' | None
        If int, restricts statpers to that calendar year (YYYY).
        If 'all' or None, no single-year filter is applied.
    start_year : int | None
        If set, restricts to statpers >= January 1 of start_year.
        Not applied when `year` is an int.
    min_date : str | None
        ISO date (YYYY-MM-DD) applied as an unconditional lower bound on
        statpers. The default is the bound that used to be hard-coded in the
        SQL, so get_ibes_summary(db) returns the same rows as before. Pass
        None to drop the bound and rely on year / start_year only.

    Returns
    -------
    pandas.DataFrame
        Result of db.raw_sql: all columns of ibes.ptgsum, ordered by statpers.
    """
    import warnings

    # Ensure required directories exist (optional)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/crsp', exist_ok=True)

    # Make the unsupported filter visible instead of silently dropping it.
    ignored_permnos = list(permno_list) if permno_list is not None else []
    if ignored_permnos:
        warnings.warn(
            "permno_list is ignored by get_ibes_summary: ibes.ptgsum has no "
            "permno column; link to CRSP first and filter afterwards.",
            UserWarning,
            stacklevel=2,
        )

    # Build predicates and parameters safely: values travel in `params`,
    # only placeholder names are interpolated into the SQL text.
    where_clauses = []
    params = {}

    if min_date is not None:
        params["min_date"] = str(min_date)
        where_clauses.append("a.statpers >= CAST(%(min_date)s AS DATE)")

    if isinstance(year, int):
        params["y_start"] = f"{year}-01-01"
        params["y_end"] = f"{year}-12-31"
        where_clauses.append(
            "a.statpers BETWEEN CAST(%(y_start)s AS DATE) AND CAST(%(y_end)s AS DATE)"
        )
    elif start_year is not None:
        # Only apply start_year if you didn't already pin to a specific year
        params["start_year"] = f"{start_year}-01-01"
        where_clauses.append("a.statpers >= CAST(%(start_year)s AS DATE)")

    # Default WHERE to true if no filters provided
    where_sql = "\n            AND ".join(where_clauses) if where_clauses else "1=1"

    sql = f"""
        SELECT
            *
        FROM ibes.ptgsum a
        WHERE
            {where_sql}
        ORDER BY a.statpers;
    """

    return db.raw_sql(sql, params=params)



In [3]:
# Load consensus data
consensus = get_ibes_summary(db)

In [None]:
# Get glimpse. The data is at monthly level
consensus.head()

Unnamed: 0,ticker,cusip,oftic,cname,statpers,numest,numup4w,numdown4w,numup1m,numdown1m,meanptg,medptg,stdev,ptghigh,ptglow,curr,usfirm,measure
0,000Y,90400D10,RARE,ULTRAGENYX PHARMACEUTICAL INC,2025-01-16,20.0,2.0,0.0,2.0,0.0,91.5,84.0,26.007,140.0,48.0,USD,1,PTG
1,001A,14216R10,CARM,CARISMA THERAPEUTICS INC,2025-01-16,3.0,0.0,1.0,0.0,1.0,4.567,1.0,6.439,12.0,0.7,USD,1,PTG
2,001J,49926D10,KN,KNOWLES CORP,2025-01-16,4.0,0.0,0.0,0.0,0.0,21.75,21.5,2.5,25.0,19.0,USD,1,PTG
3,001K,45780R10,IBP,INSTALLED BUILDING PRODUCTS INC,2025-01-16,11.0,0.0,1.0,0.0,1.0,241.727,240.0,19.468,270.0,197.0,USD,1,PTG
4,001M,45780L10,INGN,INOGEN INC,2025-01-16,2.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,10.0,10.0,USD,1,PTG


In [21]:
# Next step: merge CRSP.ncusip with consensus.cusip, using the same link-table approach as for the detail data above. Note that the van Binsbergen paper requires all variables to come from dates before STATPERS (to mitigate look-ahead bias).

In [None]:
##############################################

# Now: Get Financial Ratios by WRDS

##############################################

In [58]:
# Load WRDS financial ratios
import os
import pandas as pd

def get_ratios(db, permno_list=None, year=None, start_year=2000, min_date="2024-08-31"):
    """
    Get financial ratios by WRDS (wrdsapps_finratio_ccm.firm_ratio_ccm).

    All filters are applied through bound parameters; date filters use
    a.public_date. (The predicates used to be built and then discarded in
    favour of a hard-coded date, so the arguments had no effect.)

    Parameters
    ----------
    db : object
        WRDS connection with a .raw_sql() method.
    permno_list : iterable[int] | None
        PERMNOs to filter on. If None or empty, returns all PERMNOs.
    year : int | 'all' | None
        If int, restricts public_date to that calendar year (YYYY).
        If 'all' or None, no single-year filter is applied.
    start_year : int | None
        If set, restricts to public_date >= January 1 of start_year.
        Not applied when `year` is an int.
    min_date : str | None
        ISO date (YYYY-MM-DD) applied as an unconditional lower bound on
        public_date. The default is the bound that used to be hard-coded in
        the SQL, so get_ratios(db) returns the same rows as before. Pass
        None to drop the bound and rely on year / start_year only.

    Returns
    -------
    pandas.DataFrame
        Result of db.raw_sql: all columns of firm_ratio_ccm, ordered by
        public_date.
    """
    # Ensure required directories exist (optional)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/crsp', exist_ok=True)

    # Build predicates and parameters safely: values travel in `params`,
    # only placeholder names are interpolated into the SQL text.
    where_clauses = []
    params = {}

    if min_date is not None:
        params["min_date"] = str(min_date)
        where_clauses.append("a.public_date >= CAST(%(min_date)s AS DATE)")

    # Materialise first: arrays / Series have no unambiguous truth value.
    permnos = [int(p) for p in permno_list] if permno_list is not None else []
    if permnos:
        # One named placeholder per PERMNO: %(p0)s, %(p1)s, ...
        placeholders = []
        for i, permno in enumerate(permnos):
            key = f"p{i}"
            params[key] = permno
            placeholders.append(f"%({key})s")
        where_clauses.append(f"a.permno IN ({', '.join(placeholders)})")

    if isinstance(year, int):
        params["y_start"] = f"{year}-01-01"
        params["y_end"] = f"{year}-12-31"
        where_clauses.append(
            "a.public_date BETWEEN CAST(%(y_start)s AS DATE) AND CAST(%(y_end)s AS DATE)"
        )
    elif start_year is not None:
        # Only apply start_year if you didn't already pin to a specific year
        params["start_year"] = f"{start_year}-01-01"
        where_clauses.append("a.public_date >= CAST(%(start_year)s AS DATE)")

    # Default WHERE to true if no filters provided
    where_sql = "\n            AND ".join(where_clauses) if where_clauses else "1=1"

    sql = f"""
        SELECT
            *
        FROM wrdsapps_finratio_ccm.firm_ratio_ccm a
        WHERE
            {where_sql}
        ORDER BY a.public_date;
    """

    return db.raw_sql(sql, params=params)



In [59]:
ratios = get_ratios(db)

In [56]:
ratios.head(10)

Unnamed: 0,gvkey,permno,adate,qdate,public_date,capei,be,bm,evm,pe_op_basic,...,ffi30_desc,ffi30,ffi38_desc,ffi38,ffi48_desc,ffi48,ffi49_desc,ffi49,ticker,cusip
0,1004,54594.0,2024-05-31,2024-08-31,2024-12-31,36.917074,1234.2,0.522517,13.666541,19.831715,...,WHLSL,26.0,WHLSL,33.0,WHLSL,41.0,WHLSL,42.0,AIR,00036110
1,1045,21020.0,2023-12-31,2024-09-30,2024-12-31,-6.558704,,,9.678833,12.104167,...,TRANS,25.0,TRANS,26.0,TRANS,40.0,TRANS,41.0,AAL,02376R10
2,1050,11499.0,2023-12-31,2024-09-30,2024-12-31,92.871805,250.879,0.254336,18.568135,68.704545,...,FABPR,13.0,MACHN,21.0,MACH,21.0,MACH,21.0,CECO,12514110
3,1075,27991.0,2023-12-31,2024-09-30,2024-12-31,17.572396,9318.005,0.925614,10.521676,16.11597,...,UTIL,20.0,UTILS,29.0,UTIL,31.0,UTIL,31.0,PNW,72348410
4,1076,10517.0,2023-12-31,2024-09-30,2024-12-31,12.070653,712.523,0.353711,1.192611,13.993377,...,FIN,29.0,MONEY,35.0,BANKS,44.0,BANKS,45.0,PRG,74319R10
5,1078,20482.0,2023-12-31,2024-09-30,2024-12-31,34.274869,39796.0,0.201249,20.367471,32.502874,...,HLTH,8.0,INSTR,24.0,MEDEQ,12.0,MEDEQ,12.0,ABT,00282410
6,1104,60038.0,2023-12-31,2024-09-30,2024-12-31,13.992719,107.216,0.688274,9.600298,13.774908,...,CNSTR,11.0,MTLPR,20.0,BLDMT,17.0,BLDMT,17.0,ACU,00481610
7,1117,10779.0,2023-12-31,2024-09-30,2024-12-31,-40.666802,25.987,0.335029,7.935223,24.319149,...,BUSEQ,23.0,ELCTR,22.0,CHIPS,36.0,CHIPS,37.0,BKTI,05587G20
8,1121,61487.0,2023-12-31,2024-09-30,2024-12-31,26.516892,84.78,1.217998,7.396624,,...,WHLSL,26.0,WHLSL,33.0,WHLSL,41.0,WHLSL,42.0,AE,00635130
9,1161,61241.0,2023-12-31,2024-09-30,2024-12-31,114.967073,58147.0,0.21835,56.857059,95.865079,...,BUSEQ,23.0,ELCTR,22.0,CHIPS,36.0,CHIPS,37.0,AMD,00790310


In [60]:
print(len(ratios))

18647


In [69]:
# Example for AAPL
# gvkey is matched as a zero-padded string ("001690"), not as an integer.
apple = (ratios.loc[ratios["gvkey"] == "001690"])
apple.head(10)

Unnamed: 0,gvkey,permno,adate,qdate,public_date,capei,be,bm,evm,pe_op_basic,...,ffi30_desc,ffi30,ffi38_desc,ffi38,ffi48_desc,ffi48,ffi49_desc,ffi49,ticker,cusip
598,1690,14593.0,2023-09-30,2024-06-30,2024-08-31,41.107132,66708.0,0.020807,22.59993,34.69697,...,BUSEQ,23.0,ELCTR,22.0,CHIPS,36.0,CHIPS,37.0,AAPL,3783310
5461,1690,14593.0,2023-09-30,2024-06-30,2024-09-30,41.584866,66708.0,0.020807,22.59993,35.30303,...,BUSEQ,23.0,ELCTR,22.0,CHIPS,36.0,CHIPS,37.0,AAPL,3783310
7984,1690,14593.0,2023-09-30,2024-06-30,2024-10-31,40.316905,66708.0,0.020807,22.59993,34.228788,...,BUSEQ,23.0,ELCTR,22.0,CHIPS,36.0,CHIPS,37.0,AAPL,3783310
12681,1690,14593.0,2024-09-30,2024-09-30,2024-11-30,40.524578,57247.0,0.016253,27.707967,35.004425,...,BUSEQ,23.0,ELCTR,22.0,CHIPS,36.0,CHIPS,37.0,AAPL,3783310
16636,1690,14593.0,2024-09-30,2024-09-30,2024-12-31,42.759722,57247.0,0.016253,27.707967,36.935103,...,BUSEQ,23.0,ELCTR,22.0,CHIPS,36.0,CHIPS,37.0,AAPL,3783310


In [38]:
db.list_tables("wrdsapps_finratio")

['firm_ratio', 'id']

In [39]:
db.list_tables("wrdsapps_finratio_ccm")

['firm_ratio_ccm', 'id_ccm']