In [8]:
import sqlite3
import pandas as pd

# Location of the SQLite file inspected by this cell.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DB_PATH = '/Users/aj/stat-arb-engine/data/pairs_database.db'


def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier.

    Table names cannot be bound as SQL parameters, so they are quoted
    (with embedded double quotes doubled) before being interpolated into
    a query string. This keeps names containing spaces, dashes or
    reserved words from breaking the statement.
    """
    return '"' + str(name).replace('"', '""') + '"'


# Connect to your database
conn = sqlite3.connect(DB_PATH)

try:
    # 1. See what tables (pairs) are in the database.
    #    SQLite's own bookkeeping tables (sqlite_sequence, sqlite_stat1, ...)
    #    are excluded so they are not counted or loaded as pairs.
    print("=== TABLES IN DATABASE ===")
    tables_query = (
        "SELECT name FROM sqlite_master "
        "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    )
    tables = pd.read_sql(tables_query, conn)
    print(f"Total pairs in database: {len(tables)}")
    print("First 10 pair names:")
    print(tables.head(10))

    if len(tables) > 0:
        # 2. Load a specific pair to check the data
        pair_name = tables.iloc[0]['name']  # Get first pair name
        print(f"\n=== LOADING PAIR: {pair_name} ===")

        # Load the pair data without specifying index first, so the raw
        # column list (including any saved index column) can be shown.
        df = pd.read_sql(f'SELECT * FROM {_quote_identifier(pair_name)}', conn)
        print("Columns in database:", df.columns.tolist())

        # Handle the date column properly: prefer 'Date', fall back to
        # 'index' (the name pandas uses when it saves an unnamed index).
        date_col = next((c for c in ('Date', 'index') if c in df.columns), None)
        if date_col is not None:
            df[date_col] = pd.to_datetime(df[date_col])
            df = df.set_index(date_col)
        else:
            print("No date column found, using default index")

        print(f"Shape: {df.shape}")
        # Only report a date range when a date column was actually promoted
        # to the index; min/max of a default integer index is not a date range.
        if date_col is not None:
            print(f"Date range: {df.index.min()} to {df.index.max()}")
        print("\nColumns:")
        print(df.columns.tolist())

        print("\n=== DATA SAMPLE ===")
        print("First 5 rows:")
        print(df.head())

        print("\nLast 5 rows:")
        print(df.tail())

        print("\n=== DATA QUALITY CHECK ===")
        print("Non-null counts:")
        key_cols = ['coint_p_value', 'slope', 'r_squared', 'z_residual', 'curr_residual']
        total_rows = len(df)
        for col in key_cols:
            if col in df.columns:
                non_null = df[col].notna().sum()
                # Guard the percentage against an empty table.
                pct = non_null / total_rows * 100 if total_rows else 0.0
                print(f"{col}: {non_null}/{total_rows} ({pct:.1f}%)")

        print("\nBasic statistics:")
        # (column, label) pairs reported as "<label> range: min to max".
        range_stats = [
            ('z_residual', 'Z-residual'),
            ('r_squared', 'R-squared'),
            ('coint_p_value', 'Cointegration p-value'),
        ]
        for col, label in range_stats:
            if col in df.columns:
                print(f"{label} range: {df[col].min():.3f} to {df[col].max():.3f}")

        # 3. Check a few more pairs: read a handful of rows from up to the
        #    first three tables and report per-table success or failure.
        print("\n=== CHECKING MULTIPLE PAIRS ===")
        for pair_name in tables['name'].head(3):
            try:
                test_df = pd.read_sql(
                    f'SELECT * FROM {_quote_identifier(pair_name)} LIMIT 5', conn
                )
                print(f"{pair_name}: {len(test_df)} rows loaded successfully")
            except Exception as e:
                # Best-effort smoke test: report the failure and keep going.
                print(f"{pair_name}: ERROR - {e}")
    else:
        print("No tables found in database!")
finally:
    # Always release the connection, even if a query above raised.
    conn.close()
    print("\nDatabase connection closed.")

=== TABLES IN DATABASE ===
Total pairs in database: 10
First 10 pair names:
            name
0   pair_MMM_AOS
1  pair_MMM_ALLE
2   pair_MMM_AME
3   pair_MMM_ADP
4  pair_MMM_AXON
5    pair_MMM_BA
6    pair_MMM_BR
7  pair_MMM_BLDR
8  pair_MMM_CHRW
9  pair_MMM_CARR

=== LOADING PAIR: pair_MMM_AOS ===
Columns in database: ['index', 'MMM_price', 'AOS_price', 'MMM_sector', 'AOS_sector', 'MMM_ln_price', 'AOS_ln_price', 'nextErn1', 'nextErn2', 'count', 'coint_p_value', 'slope', 'y_intercept', 'r_squared', 'y_implied', 'curr_residual', 'z_residual', 'ratio', 'logRatio', 'avg_ratio', 'std_dev', 'z_ratio', 'Mkt_cap1', 'Mkt_cap2', '30D_Turnover1', '30D_Turnover2']
Shape: (2515, 25)
Date range: 2015-06-16 00:00:00 to 2025-06-13 00:00:00

Columns:
['MMM_price', 'AOS_price', 'MMM_sector', 'AOS_sector', 'MMM_ln_price', 'AOS_ln_price', 'nextErn1', 'nextErn2', 'count', 'coint_p_value', 'slope', 'y_intercept', 'r_squared', 'y_implied', 'curr_residual', 'z_residual', 'ratio', 'logRatio', 'avg_ratio', 'std

In [7]:
import sqlite3
import pandas as pd

# Location of the SQLite file read by this cell.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DB_PATH = '/Users/aj/stat-arb-engine/data/pairs_database.db'

# Connect to database
conn = sqlite3.connect(DB_PATH)

try:
    # Get the table names, skipping SQLite's internal bookkeeping tables
    # (sqlite_sequence, sqlite_stat1, ...) so they are never picked as a pair.
    tables_query = (
        "SELECT name FROM sqlite_master "
        "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    )
    tables = pd.read_sql(tables_query, conn)

    if len(tables) > 0:
        # Get first pair name
        first_pair = tables.iloc[0]['name']
        print(f"Loading pair: {first_pair}")

        # Load the data. Table names cannot be bound as SQL parameters, so
        # the name is double-quoted (embedded quotes doubled) before being
        # interpolated into the statement.
        quoted_pair = '"' + str(first_pair).replace('"', '""') + '"'
        df = pd.read_sql(f'SELECT * FROM {quoted_pair}', conn)

        # Fix the date index: prefer 'Date', fall back to 'index' (the name
        # pandas uses when it saves an unnamed index). If neither column is
        # present the default integer index is kept.
        date_col = next((c for c in ('Date', 'index') if c in df.columns), None)
        if date_col is not None:
            df[date_col] = pd.to_datetime(df[date_col])
            df = df.set_index(date_col)

        print(f"Successfully loaded DataFrame with shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")

        # Show the DataFrame (this will display it)
        display(df)

    else:
        print("No pairs found in database!")
finally:
    # Always release the connection, even if a query above raised.
    conn.close()

Loading pair: pair_MMM_AOS
Successfully loaded DataFrame with shape: (2515, 25)
Columns: ['MMM_price', 'AOS_price', 'MMM_sector', 'AOS_sector', 'MMM_ln_price', 'AOS_ln_price', 'nextErn1', 'nextErn2', 'count', 'coint_p_value', 'slope', 'y_intercept', 'r_squared', 'y_implied', 'curr_residual', 'z_residual', 'ratio', 'logRatio', 'avg_ratio', 'std_dev', 'z_ratio', 'Mkt_cap1', 'Mkt_cap2', '30D_Turnover1', '30D_Turnover2']


Unnamed: 0_level_0,MMM_price,AOS_price,MMM_sector,AOS_sector,MMM_ln_price,AOS_ln_price,nextErn1,nextErn2,count,coint_p_value,...,z_residual,ratio,logRatio,avg_ratio,std_dev,z_ratio,Mkt_cap1,Mkt_cap2,30D_Turnover1,30D_Turnover2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-06-16,130.837,36.6150,Industrials,Industrials,4.873952,3.600458,2015-07-23 00:00:00,2015-07-23 00:00:00,1,,...,,3.573317,1.353703,,,,99268.0236,6532.1522,1454127.633,6.143083e+05
2015-06-17,131.222,36.6800,Industrials,Industrials,4.876891,3.602232,2015-07-23 00:00:00,2015-07-23 00:00:00,2,,...,,3.577481,1.353853,,,,99559.8205,6543.7482,1457161.067,6.126437e+05
2015-06-18,133.454,36.8350,Industrials,Industrials,4.893757,3.606448,2015-07-23 00:00:00,2015-07-23 00:00:00,3,,...,,3.623022,1.356946,,,,101253.5110,6571.4004,1472254.367,6.266487e+05
2015-06-19,132.894,36.8250,Industrials,Industrials,4.889552,3.606177,2015-07-23 00:00:00,2015-07-23 00:00:00,4,,...,,3.608798,1.355882,,,,100828.5025,6569.6164,1515874.733,6.532073e+05
2015-06-22,133.504,36.9700,Industrials,Industrials,4.894131,3.610107,2015-07-23 00:00:00,2015-07-23 00:00:00,5,,...,,3.611144,1.355675,,,,101291.5714,6595.4845,1484058.167,6.514219e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-06-09,201.696,75.9573,Industrials,Industrials,5.306762,4.330171,,,2511,0.914359,...,-0.620743,2.655387,1.225532,1.219570,0.009736,0.612338,77621.8978,9212.9292,3004561.833,1.866400e+06
2025-06-10,202.856,77.0235,Industrials,Industrials,5.312496,4.344111,,,2512,0.580582,...,-0.230084,2.633690,1.222919,1.219561,0.009730,0.345174,78068.5884,9342.2484,3025540.233,1.852634e+06
2025-06-11,205.821,76.6603,Industrials,Industrials,5.327007,4.339384,,,2513,0.528601,...,-0.637006,2.684845,1.227595,1.219496,0.009693,0.835528,79209.5329,9298.1946,3058442.667,1.802068e+06
2025-06-12,202.451,75.2778,Industrials,Industrials,5.310498,4.321185,,,2514,0.618111,...,-0.989501,2.689385,1.228945,1.219484,0.009682,0.977183,77912.5158,9130.5059,3025333.700,1.793175e+06
