In [6]:
import pandas as pd
import numpy as np

def generate_loan_price_data(
    start_date: str,
    end_date: str,
    price_range: tuple = (90, 105),
    n_tickers: int = 100,
    overlap_pct: float = 0.8,
    seed: int = 42,
):
    """
    Generate realistic loan price and balance data with high daily overlap.
    """
    np.random.seed(seed)
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D')

    # Universe of possible tickers: LoanID0000 to LoanID9999
    all_possible_ids = set(range(10000))

    # Randomly initialize base tickers
    initial_ids = np.random.choice(list(all_possible_ids), n_tickers, replace=False)
    previous_ids = list(initial_ids)

    data = []

    for date in all_dates:
        n_overlap = int(len(previous_ids) * overlap_pct)
        n_new = len(previous_ids) - n_overlap

        # Choose overlap IDs from previous day
        overlapping_ids = np.random.choice(previous_ids, n_overlap, replace=False).tolist()

        # Exclude already used IDs to avoid duplication
        used_ids = set(previous_ids)
        available_ids = list(all_possible_ids - used_ids)
        new_ids = np.random.choice(available_ids, n_new, replace=False).tolist()

        today_ids = overlapping_ids + new_ids
        today_prices = np.random.uniform(price_range[0], price_range[1], len(today_ids))

        for loan_id, price in zip(today_ids, today_prices):
            ticker = f"LoanID{str(loan_id).zfill(4)}"
            balance = loan_id * 1000
            data.append({
                'Date': date,
                'LoanID': ticker,
                'Price': round(price, 2),
                'Balance': balance
            })

        previous_ids = today_ids

    return pd.DataFrame(data)


In [12]:
data = generate_loan_price_data(
    start_date='2023-01-01',
    end_date='2023-12-31',
    price_range=(50, 105),
    n_tickers=100,
    overlap_pct=1.0,
    seed=42
)

In [15]:
data.query( "LoanID == 'LoanID8328'" )

Unnamed: 0,Date,LoanID,Price,Balance
0,2023-01-01,LoanID8328,59.56,8328000
162,2023-01-02,LoanID8328,82.83,8328000
247,2023-01-03,LoanID8328,101.62,8328000
324,2023-01-04,LoanID8328,54.02,8328000
480,2023-01-05,LoanID8328,94.26,8328000
...,...,...,...,...
36037,2023-12-27,LoanID8328,102.76,8328000
36186,2023-12-28,LoanID8328,68.09,8328000
36225,2023-12-29,LoanID8328,52.72,8328000
36332,2023-12-30,LoanID8328,53.59,8328000


In [24]:
import numpy as np

In [76]:
df = data.copy()
df['Price_t'] = df['Price']
df['Price_t+1'] = df['Price'].shift(-1)
df['Date_t'] = df['Date']
df['Date_t+1'] = df['Date'].shift(-1)
df['Balance_t'] = df['Balance']
df['Date_t+1'] = df['Date'].shift(-1)

df['Price_diff'] = df['Price_t+1'] - df['Price_t']
df['Price_return'] = df['Price_diff'] / df['Price_t']


In [77]:
cutoffs = [ 70, 80, 90, 100 ]
cutoff_labels = [f"<{cutoffs[ 0 ]}" ] + [ f"{cutoffs[i]}-{cutoffs[i+1]}" for i in range(len(cutoffs)-1) ] + [ f">{cutoffs[-1]}" ]

In [78]:
cutoff_bins = [-np.inf] + cutoffs + [np.inf]

In [79]:
def get_bins_and_labels( cutoffs, include_inf=True ):
    if include_inf:
        labels = [f"<{cutoffs[0]}"] + [f"{cutoffs[i]}-{cutoffs[i+1]}" for i in range(len(cutoffs)-1)] + [f">{cutoffs[-1]}"]
        bins = [-np.inf] + cutoffs + [np.inf]
    else:
        bins = cutoffs
        labels = [f"{cutoffs[i]}-{cutoffs[i+1]}" for i in range(len(cutoffs)-1)]
    return bins, labels

In [70]:
cutoff_bins, cutoff_labels = get_bins_and_labels(cutoffs, include_inf=True)

In [80]:
df['Price_bucket_t'] = pd.cut(
    df['Price_t'],
    bins=cutoff_bins,
    labels=cutoff_labels,
    include_lowest=True
)

In [81]:
df['Price_bucket_t'].value_counts(dropna=False)

Price_bucket_t
<70       13368
80-90      6742
90-100     6586
70-80      6555
>100       3249
Name: count, dtype: int64

In [82]:
from quantbullet.dfutils import filter_df

In [83]:
bucket = '<70'
bucket_df = df[ df['Price_bucket_t'] == bucket ]

In [84]:
def wavg( group, value_col, weight_col):
    """
    Weighted average function.
    """
    return np.sum(group[value_col] * group[weight_col]) / np.sum(group[weight_col])

In [125]:
diff = ( bucket_df.groupby('Date')
    .apply(lambda x: wavg(x, 'Price_diff', 'Balance_t'), include_groups=False) )

diff.name = 'Wavg_price_diff'

returns = ( bucket_df.groupby('Date')
    .apply(lambda x: wavg(x, 'Price_return', 'Balance_t'), include_groups=False) )

returns.name = 'Wavg_price_return'

index = pd.Series(
    index = diff.index,
    data = np.random.randn(len(diff)),
    name = 'Index'
)

import statsmodels.api as sm
x = sm.add_constant(index)
model = sm.OLS(diff, x).fit()
print( model.params.loc['Index'] )

model = sm.OLS(returns, x).fit()
print(model.params.loc['Index'])

-0.18114970491832447
-0.0026602981427076926
