In [1]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
import statsmodels.api as sm                      # <– NEW


In [2]:
def compute_slope_table(long_df, min_points=5):
    """
    Fit Rank = β0 + β1 * Records (OLS) separately for every (company, method) pair.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Must contain the columns 'company', 'method', 'records_in_cluster'
        and 'rank'.
    min_points : int, default 5
        Groups with fewer complete observations (rows where neither
        'records_in_cluster' nor 'rank' is null) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per fitted group with the columns 'company', 'method',
        'slope' (β1), 'pvalue' (p-value of β1) and 'n' (rows used in the fit).
        The columns are present even when no group could be fitted.

    Notes
    -----
    Groups whose 'records_in_cluster' takes a single value are skipped as
    well: a slope cannot be estimated without variation in the regressor.
    """
    columns = ['company', 'method', 'slope', 'pvalue', 'n']
    rows = []
    for (company, method), grp in long_df.groupby(['company', 'method']):
        grp = grp.dropna(subset=['records_in_cluster', 'rank'])
        if len(grp) < min_points:
            continue                       # not enough data
        if grp['records_in_cluster'].nunique() < 2:
            # Constant regressor: β1 is not identifiable.  add_constant's
            # default (has_constant='skip') would also leave out the intercept
            # column here, so there would be no second coefficient to read.
            continue
        X = sm.add_constant(grp['records_in_cluster'])
        model = sm.OLS(grp['rank'], X).fit()
        rows.append({
            'company': company,
            'method': method,
            # Position 1 is the regressor (the constant is prepended at 0).
            # Use .iloc: integer keys on a label-indexed Series are deprecated.
            'slope': model.params.iloc[1],
            'pvalue': model.pvalues.iloc[1],
            'n': len(grp),
        })
    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


In [3]:
def plot_slope_bars(slope_df):
    """
    Draw a grouped bar chart of OLS slopes per company, coloured by method.

    Parameters
    ----------
    slope_df : pandas.DataFrame
        Must contain the columns 'company', 'method' and 'slope'.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.  When ``slope_df`` is empty a
        message is printed and nothing is drawn.
    """
    if slope_df.empty:
        print("🚫  No slope data.")
        return

    # sort so most negative (strong effect) on one side
    slope_df = slope_df.sort_values('slope')

    plt.figure(figsize=(14,6))
    ax = sns.barplot(data=slope_df,
                     x='company', y='slope', hue='method',
                     palette='coolwarm')

    ax.axhline(0, color='gray', ls='--')
    ax.set_ylabel('OLS slope  β₁  (records → rank)')
    ax.set_xlabel('')
    ax.set_title('Effect of Cluster Size on Rank')
    plt.xticks(rotation=45, ha='right')

    # annotate each bar with its slope value
    for bar in ax.patches:
        height = bar.get_height()
        # Skip patches that do not represent a drawn bar (NaN height, e.g. a
        # company/method combination without a row, or zero width); they would
        # otherwise be labelled '+nan' or get a stray label.
        if np.isnan(height) or bar.get_width() == 0:
            continue
        x_center = bar.get_x() + bar.get_width()/2
        # Put the label beyond the bar tip: above positive bars, below
        # negative ones, so it never overlaps the bar itself.
        ax.annotate(f'{height:+.2f}',
                    (x_center, height),
                    ha='center',
                    va='bottom' if height >= 0 else 'top',
                    fontsize=8)
    plt.tight_layout()
    plt.show()


In [4]:
# Requires `long_df` from an earlier cell: a long-format DataFrame with the
# columns company, method, records_in_cluster and rank (the columns that
# compute_slope_table reads).  Fail with an actionable message when the
# notebook is run without that cell (e.g. after Restart & Run All).
try:
    long_df
except NameError:
    raise NameError(
        "long_df is not defined - run the cell that builds the long-format "
        "table (columns: company, method, records_in_cluster, rank) first."
    ) from None

slope_df = compute_slope_table(long_df)
display(slope_df.head())          # optional table
plot_slope_bars(slope_df)


NameError: name 'long_df' is not defined