# [WRDS Dummy Data](https://www.tidy-finance.org/python/wrds-dummy-data.html)

In [4]:
import pandas as pd
import numpy as np
import sqlite3
import string

In [5]:
tidy_finance = sqlite3.connect(database="data/tidy_finance_python.sqlite")

In [3]:
np.random.seed(1234)

start_date = pd.Timestamp("2003-01-01")
end_date = pd.Timestamp("2022-12-31")

dummy_years = np.arange(start_date.year, end_date.year+1, 1)
dummy_months = pd.date_range(start_date, end_date, freq="ME")
dummy_days = pd.date_range(start_date, end_date, freq="D")

# [Create Stock Dummy Data](https://www.tidy-finance.org/python/wrds-dummy-data.html#create-stock-dummy-data)

In [4]:
number_of_stocks = 100

industries = pd.DataFrame({
    "industry": ["Agriculture", "Construction", "Finance",
                 "Manufacturing", "Mining", "Public", "Retail",
                 "Services", "Transportation", "Utilities", "Wholesale"],
    "n": [81, 287, 4682, 8584, 1287, 1974, 1571, 4277, 1249, 457, 904],
    "prob": [0.00319, 0.0113, 0.185, 0.339, 0.0508, 0.0779, 
           0.0620, 0.169, 0.0493, 0.0180, 0.03451]
})

exchanges = pd.DataFrame({
    "exchange": ["AMEX", "NASDAQ", "NYSE"],
    "n": [2893, 17236, 5553],
    "prob": [0.113, 0.671, 0.216]
})

stock_identifiers_list = []
for x in range(1, number_of_stocks+1):
    exchange = np.random.choice(exchanges["exchange"], p=exchanges["prob"])
    industry = np.random.choice(industries["industry"], p=industries["prob"])

    exchcd_mapping = {
        "NYSE": np.random.choice([1, 31]),
        "AMEX": np.random.choice([2, 32]),
        "NASDAQ": np.random.choice([3, 33])
    }

    siccd_mapping = {
        "Agriculture": np.random.randint(1, 1000),
        "Mining": np.random.randint(1000, 1500),
        "Construction": np.random.randint(1500, 1800),
        "Manufacturing": np.random.randint(1800, 4000),
        "Transportation": np.random.randint(4000, 4900),
        "Utilities": np.random.randint(4900, 5000),
        "Wholesale": np.random.randint(5000, 5200),
        "Retail": np.random.randint(5200, 6000),
        "Finance": np.random.randint(6000, 6800),
        "Services": np.random.randint(7000, 9000),
        "Public": np.random.randint(9000, 10000)
    }

    stock_identifiers_list.append({
        "permno": x,
        "gvkey": str(x+10000),
        "exchange": exchange,
        "industry": industry,
        "exchcd": exchcd_mapping[exchange],
        "siccd": siccd_mapping[industry]
    })

stock_identifiers = pd.DataFrame(stock_identifiers_list)

In [5]:
stock_panel_yearly = pd.DataFrame({
    "gvkey": np.tile(stock_identifiers["gvkey"], len(dummy_years)),
    "year": np.repeat(dummy_years, len(stock_identifiers))
})

stock_panel_monthly = pd.DataFrame({
    "permno": np.tile(stock_identifiers["permno"], len(dummy_months)),
    "gvkey": np.tile(stock_identifiers["gvkey"], len(dummy_months)),
    "month": np.repeat(dummy_months, len(stock_identifiers)),
    "siccd": np.tile(stock_identifiers["siccd"], len(dummy_months)),
    "industry": np.tile(stock_identifiers["industry"], len(dummy_months)),
    "exchcd": np.tile(stock_identifiers["exchcd"], len(dummy_months)),
    "exchange": np.tile(stock_identifiers["exchange"], len(dummy_months))
})

stock_panel_daily = pd.DataFrame({
    "permno": np.tile(stock_identifiers["permno"], len(dummy_days)),
    "date": np.repeat(dummy_days, len(stock_identifiers))
})

## [Dummy beta table](https://www.tidy-finance.org/python/wrds-dummy-data.html#dummy-beta-table)

In [6]:
beta_dummy = (stock_panel_monthly
    .assign(
        beta_monthly=np.random.normal(
            loc=1, scale=1, size=len(stock_panel_monthly)
    ),
        beta_daily=lambda x: (
            x["beta_monthly"]+np.random.normal(scale=0.01, size=len(x))
        )
    )
)

(beta_dummy
    .to_sql(name="beta",
            con=tidy_finance,
            if_exists="replace",
            index = False)
)

24000

## [Dummy compustat table](https://www.tidy-finance.org/python/wrds-dummy-data.html#dummy-compustat-table)

In [7]:
relevant_columns = [
    "seq", "ceq", "at", "lt", "txditc", "txdb", "itcb",
    "pstkrv", "pstkl", "pstk", "capx", "oancf", "sale",
    "cogs", "xint", "xsga", "be", "op", "at_lag", "inv"
]

commands = {
    col: np.random.rand(len(stock_panel_yearly)) for col in relevant_columns
}

compustat_dummy = (
    stock_panel_yearly
    .assign(
        datadate=lambda x: pd.to_datetime(x["year"].astype(str)+"-12-31")
    )
    .assign(**commands)
)

(compustat_dummy
    .to_sql(name="compustat",
            con=tidy_finance,
            if_exists="replace",
            index=False)
)

2000

## [Dummy crsp_monthly table](https://www.tidy-finance.org/python/wrds-dummy-data.html#dummy-crsp_monthly-table)

In [8]:
crsp_monthly_dummy = (stock_panel_monthly
    .assign(
        date=lambda x: x["month"]+pd.offsets.MonthEnd(-1),
        ret=lambda x: np.fmax(np.random.normal(size=len(x)), -1),
        ret_excess=lambda x: (
            np.fmax(x["ret"]-np.random.uniform(0, 0.0025, len(x)), -1)
        ),
        shrout=1000*np.random.uniform(1, 50, len(stock_panel_monthly)),
        altprc=np.random.uniform(0, 1000, len(stock_panel_monthly))
    )
    .assign(mktcap=lambda x: x["shrout"]*x["altprc"])
    .sort_values(by=["permno", "month"])
    .assign(
        mktcap_lag=lambda x: (x.groupby("permno")["mktcap"].shift(1))
    )
    .reset_index(drop=True)
)

(crsp_monthly_dummy
    .to_sql(name="crsp_monthly",
            con=tidy_finance,
            if_exists="replace",
            index=False)
)

24000

## [Dummy crsp_daily table](https://www.tidy-finance.org/python/wrds-dummy-data.html#dummy-crsp_daily-table)

In [9]:
crsp_daily_dummy = (stock_panel_daily
    .assign(
        month=lambda x: x["date"]-pd.offsets.MonthBegin(1),
        ret_excess=lambda x: np.fmax(np.random.normal(size=len(x)), -1)
    )
    .reset_index(drop=True)
)

(crsp_daily_dummy
    .to_sql(name="crsp_daily",
            con=tidy_finance,
            if_exists="replace",
            index=False)
)

730500

# [Create Bond Dummy Data](https://www.tidy-finance.org/python/wrds-dummy-data.html#create-bond-dummy-data)

## [Dummy fisd data](https://www.tidy-finance.org/python/wrds-dummy-data.html#dummy-fisd-data)

In [11]:
number_of_bonds = 100

def generate_cusip():
    """Generate cusip."""

    characters = list(string.ascii_uppercase+string.digits)
    cusip = ("".join(np.random.choice(characters, size=12))).upper()

    return cusip

fisd_dummy = (pd.DataFrame({
    "complete_cusip": [generate_cusip() for _ in range(number_of_bonds)]
    })
    .assign(
        maturity=lambda x: np.random.choice(dummy_days, len(x), replace=True),
        offering_amt=lambda x: np.random.choice(
            np.arange(1, 101)*100000, len(x), replace=True
        )
    )
    .assign(
        offering_date=lambda x: (
            x["maturity"]-pd.to_timedelta(
                np.random.choice(np.arange(1, 26)*365, len(x), replace=True),
                unit="D"
            )
        )
    )
    .assign(
        dated_date=lambda x: (
            x["offering_date"]-pd.to_timedelta(
                np.random.choice(np.arange(-10, 11), len(x), replace=True),
                unit="D"
            )
        ),
        coupon=lambda x: np.random.choice(
            np.arange(0, 2.1, 0.1), len(x), replace=True
        )
    )
    .assign(
        last_interest_rate=lambda x: (
            x[["maturity", "offering_date", "dated_date"]].max(axis=1)
        ),
        issue_id=lambda x: x.index+1,
        issuer_id=lambda x: np.random.choice(
            np.arange(1, 251), len(x), replace=True
        ),
        sic_code=lambda x: (np.random.choice(
            np.arange(1, 10)*1000, len(x), replace=True)
        ).astype(str)
    )
)

(fisd_dummy
    .to_sql(name="fisd",
            con=tidy_finance,
            if_exists="replace",
            index=False)
)

100

## [Dummy trace_enhanced data](https://www.tidy-finance.org/python/wrds-dummy-data.html#dummy-trace_enhanced-data)

In [12]:
number_of_bonds = 100
start_date = pd.Timestamp("2014-01-01")
end_date = pd.Timestamp("2016-11-30")

bonds_panel = pd.DataFrame({
    "cusip_id": np.tile(
        fisd_dummy["complete_cusip"],
        (end_date-start_date).days+1
    ),
    "trd_exctn_dt":np.repeat(
        pd.date_range(start_date, end_date), len(fisd_dummy)
    )
})

trace_enhanced_dummy = (pd.concat([bonds_panel]*5)
    .assign(
        trd_exctn_tm = lambda x: pd.to_datetime(
            x["trd_exctn_dt"].astype(str)+" " +
            np.random.randint(0, 24, size=len(x)).astype(str)+":" +
            np.random.randint(0, 60, size=len(x)).astype(str)+":" +
            np.random.randint(0, 60, size=len(x)).astype(str)
        ),
        rptd_pr=np.random.uniform(10, 200, len(bonds_panel)*5),
        entrd_vol_qt=1000*np.random.choice(
            range(1, 21), len(bonds_panel)*5, replace=True
        ),
        yld_pt=np.random.uniform(-10, 10, len(bonds_panel)*5),
        rpt_side_cd=np.random.choice(
            ["B", "S"], len(bonds_panel)*5, replace=True
        ),
        cntra_mp_id=np.random.choice(
            ["C", "D"], len(bonds_panel)*5, replace=True
        )
    )
    .reset_index(drop=True)
)

(trace_enhanced_dummy
    .to_sql(name="trace_enhanced",
            con=tidy_finance,
            if_exists="replace",
            index=False)
)

532500

# [Exercises](https://www.tidy-finance.org/python/wrds-crsp-and-compustat.html#exercises)

## 1.

In [109]:
# Get crsp_monthly table
crsp_monthly = pd.read_sql_query(
    sql='SELECT * FROM crsp_monthly',
    con=tidy_finance,
    parse_dates='month')
# manually lag mktcap column
mktcap_lag = crsp_monthly['mktcap'].shift(1)
# why are there nan values in original column?
print(crsp_monthly['mktcap_lag'].compare(mktcap_lag).isna().sum())

self     99
other     0
dtype: int64


In [117]:
fecha_busqueda = pd.to_datetime("2003-01-31")
(crsp_monthly
    .query('month == @fecha_busqueda')
    .sort_values(by='permno')
)

Unnamed: 0,permno,gvkey,month,siccd,industry,exchcd,exchange,date,ret,ret_excess,shrout,altprc,mktcap,mktcap_lag
0,1,10001,2003-01-31,9794,Public,3,NASDAQ,2002-12-31 00:00:00,-1.000000,-1.000000,15572.481069,752.415572,1.171698e+07,
240,2,10002,2003-01-31,1746,Construction,3,NASDAQ,2002-12-31 00:00:00,-0.318671,-0.319585,28229.059133,351.189046,9.913736e+06,
480,3,10003,2003-01-31,6619,Finance,33,NASDAQ,2002-12-31 00:00:00,0.026657,0.024397,37935.369553,385.264061,1.461513e+07,
720,4,10004,2003-01-31,6041,Finance,1,NYSE,2002-12-31 00:00:00,0.034140,0.032324,28567.382632,553.269631,1.580547e+07,
960,5,10005,2003-01-31,4686,Transportation,1,NYSE,2002-12-31 00:00:00,-0.341532,-0.343174,15607.050442,677.585036,1.057510e+07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22800,96,10096,2003-01-31,9884,Public,31,NYSE,2002-12-31 00:00:00,0.621609,0.619888,4376.443315,252.739474,1.106100e+06,
23040,97,10097,2003-01-31,7821,Services,3,NASDAQ,2002-12-31 00:00:00,-0.804507,-0.806231,8616.509513,278.873514,2.402916e+06,
23280,98,10098,2003-01-31,2021,Manufacturing,3,NASDAQ,2002-12-31 00:00:00,-0.231012,-0.232311,45160.982936,961.714508,4.343197e+07,
23520,99,10099,2003-01-31,9347,Public,3,NASDAQ,2002-12-31 00:00:00,-0.264879,-0.267014,47818.441230,942.507561,4.506924e+07,


In [118]:
print(
    "The answer is because the way to move the market\n"
    "capitalization has been done for each of the 100\n"
    "'permno' and the month, and because the first month\n"
    "is December 2002, the first month is null.")

The answer is because the way to move the market
capitalization has been done for each of the 100
'permno' and the month, and because the first month
 is December 2002, the first month is null.
