# Propagation of Disease-Demographic Co-occurrences to Model Logits


### **Set up**

**Just run this part**


In [1]:
import os
import pandas as pd
import numpy as np
import json
import sys
import plotly.express as px
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import kendalltau, rankdata
import rbo

In [2]:
project_root_relative_path = "../.."  # Adjust this path as necessary

# Use os.getcwd() to get the current working directory of the notebook
current_dir = os.getcwd()

# Construct the path to the root of the Cross-Care project
cross_care_root = os.path.normpath(
    os.path.join(current_dir, project_root_relative_path)
)

# Add the Cross-Care root to sys.path to allow imports
if cross_care_root not in sys.path:
    sys.path.append(cross_care_root)

print("Project root added to sys.path:", cross_care_root)

from dicts.dict_medical import medical_keywords_dict

Project root added to sys.path: /home/legionjgally/Desktop/mit/Cross-Care


In [3]:
race_categories = [
    "pacific islander",
    "hispanic",
    "asian",
    "indigenous",
    "white",
    "black",
]
gender_categories = [
    "male",
    "female",
    "nonbinary",
]

In [4]:
def load_and_combine_logits(
    models,
    root_path,
    dataset,
    demographic,
    demographic_categories,
    debug=False,
):
    combined_df = pd.DataFrame()

    for model_name in models:
        # Generate the path for the current model's logits data
        logits_data_path = f"{root_path}/output_{dataset}/logits/{model_name.replace('/', '_')}/logits_{demographic}.json"

        # Check if the file exists to avoid errors
        if os.path.exists(logits_data_path):
            with open(logits_data_path, "r") as f:
                data = json.load(f)

            # Convert the data into a DataFrame
            logit_df = pd.DataFrame(data)

            # Add a column for the model name
            logit_df["model_name"] = model_name

            # Append the current DataFrame to the combined DataFrame
            combined_df = pd.concat([combined_df, logit_df], ignore_index=True)
        else:
            print(f"Logits data file not found for model: {model_name}")

    disease_names = list(combined_df.keys())
    disease_names.remove("model_name")

    reshaped_df = reshape_logit_df(
        combined_df, models, disease_names, demographic_categories
    )
    reshaped_df["model_size"] = reshaped_df["model_name"].map(model_size_mapping)
    reshaped_df["model_size"] = reshaped_df["model_size"].astype(int)

    return reshaped_df


def reshape_logit_df(combined_df, models, disease_names, demographic_categories):
    # Initialize an empty list to hold the reshaped data
    reshaped_data = []

    # Assuming combined_logits_df is structured with diseases as keys and each key has a list of lists with [category, logit]
    for model in models:
        for disease in disease_names:
            for category in demographic_categories:
                for entry in combined_df[disease]:
                    if entry[0] == category:
                        reshaped_data.append(
                            {
                                "disease": disease,
                                "demographic": category,
                                "logit_value": entry[1],
                                "model_name": model,
                            }  # hacky
                        )

    # Convert the reshaped data into a DataFrame
    reshaped_df = pd.DataFrame(reshaped_data)

    return reshaped_df

In [5]:
def replace_disease_codes(df, medical_keywords_dict):
    for index, row in df.iterrows():
        disease = row["disease"]
        # Check if the last two characters are '.0'
        if isinstance(disease, str) and disease.endswith(".0"):
            # Lookup the code in the dictionary and get the first name
            name_list = medical_keywords_dict.get(disease)
            if name_list:
                df.at[index, "disease"] = name_list[0]
    return df


def load_cooccurrence_data(cross_care_root, dataset, demographic, debug=False):
    # Load co-occurrence data
    counts_data_path = f"{cross_care_root}/output_{dataset}/aggregated_counts/aggregated_{demographic}_counts.csv"
    counts_df = pd.read_csv(counts_data_path)

    if debug:
        counts_df = counts_df.head(10)

    demographic_mapping = {
        "white/caucasian": "white",
        "black/african american": "black",
        "hispanic/latino": "hispanic",
        "asian": "asian",
        "native american/indigenous": "indigenous",
        "pacific islander": "pacific islander",
    }

    counts_df = counts_df.rename(columns=demographic_mapping)
    counts_df = counts_df.rename(
        columns={"Disease": "disease", "mention count": "mention_count"}
    )

    counts_df = replace_disease_codes(counts_df, medical_keywords_dict)
    # drop unnamed columns
    counts_df = counts_df.loc[:, ~counts_df.columns.str.contains("^Unnamed")]

    # Melting the dataframe to reshape it
    counts_df_long = pd.melt(
        counts_df,
        id_vars=["disease"],
        var_name="demographic",
        value_name="mention_count",
    )

    return counts_df_long

In [6]:
def format_data(combined_df):
    # NUMERICS
    combined_df["mention_count"] = pd.to_numeric(
        combined_df["mention_count"], errors="coerce"
    )

    combined_df["logit_value"] = pd.to_numeric(
        combined_df["logit_value"], errors="coerce"
    )
    combined_df["model_size"] = pd.to_numeric(
        combined_df["model_size"], errors="coerce"
    )

    # CATEGORICALS
    combined_df["demographic"] = combined_df["demographic"].astype("category")
    combined_df["disease"] = combined_df["disease"].astype("category")

    # create basic stats_df
    combined_df.dropna(inplace=True)
    stats_df = combined_df.copy()

    # sort by disease, model_size
    stats_df = stats_df.sort_values(by=["disease", "model_size"])

    return stats_df

In [7]:
def add_normalization_by_total_disease_counts(counts_df, total_counts_csv):
    # Load total disease counts
    total_counts_df = pd.read_csv(total_counts_csv)

    # Merge the total counts into the co-occurrence DataFrame
    counts_df = pd.merge(counts_df, total_counts_df, on="disease", how="left")

    # Perform normalization and add as a new column
    counts_df["normalized_by_total_counts"] = (
        counts_df["mention_count"] / counts_df["total_count"]
    ) * 100

    # You may choose to drop the 'total_count' column if it's no longer needed
    counts_df = counts_df.drop(columns=["total_count"])

    return counts_df


def add_normalization_by_disease_demo_mentions(counts_df):
    # Calculate the total mention count across all demographics for each disease
    total_by_disease = (
        counts_df.groupby("disease")["mention_count"]
        .sum()
        .reset_index(name="total_demo_count")
    )

    # Merge this total back into the original DataFrame
    counts_df = pd.merge(counts_df, total_by_disease, on="disease", how="left")

    # Perform normalization and add as a new column
    counts_df["normalized_by_demo_mentions"] = (
        counts_df["mention_count"] / counts_df["total_demo_count"]
    ) * 100

    # You may choose to drop the 'total_demo_count' column if it's no longer needed
    counts_df = counts_df.drop(columns=["total_demo_count"])

    return counts_df

In [8]:
def calculate_ranks(values):
    return rankdata(-values, method="ordinal")  # Negative for descending order


def analyze_rank_results(combined_df):

    rank_df = combined_df.copy()

    results = []

    unique_diseases = rank_df["disease"].unique()
    unique_model_sizes = rank_df["model_size"].unique()

    for disease in unique_diseases:
        for model_size in unique_model_sizes:
            sub_df = rank_df[
                (rank_df["disease"] == disease) & (rank_df["model_size"] == model_size)
            ].sort_values(by="demographic")

            # Convert series to numpy arrays for compatibility with RBO package
            mention_counts = sub_df["mention_count"].values
            logits = sub_df["logit_value"].values

            mention_ranks = calculate_ranks(mention_counts)
            logit_ranks = calculate_ranks(logits)

            # Calculate Kendall's Tau
            kendall_tau, _ = kendalltau(mention_counts, logits)

            # Calculate RBO
            rbo_score = rbo.RankingSimilarity(
                mention_ranks.tolist(), logit_ranks.tolist()
            ).rbo()

            # Collect results
            results.append(
                {
                    "disease": disease,
                    "model_size": model_size,
                    "kendall_tau": kendall_tau,
                    "rbo_score": rbo_score,
                }
            )

    # Convert results to DataFrame for easy viewing
    results_df = pd.DataFrame(results)

    return results_df

### **Default settings**

Run all models <br>
Demographics= Race


In [9]:
dataset = "pile"
demographic = "race"
debug = False

models = [
    "EleutherAI/pythia-70m-deduped",
    "EleutherAI/pythia-160m-deduped",
    "EleutherAI/pythia-410m-deduped",
    "EleutherAI/pythia-1b-deduped",
    "EleutherAI/pythia-2.8b-deduped",
    "EleutherAI/pythia-6.9b-deduped",
    # "EleutherAI/pythia-12b-deduped",
    # "state-spaces/mamba-130m",
    # "state-spaces/mamba-370m",
    # "state-spaces/mamba-790m",
    # "state-spaces/mamba-1.4b",
    # "state-spaces/mamba-2.8b-slimpj",
    # "state-spaces/mamba-2.8b"
]

model_size_mapping = {
    "EleutherAI/pythia-70m-deduped": 70,
    "EleutherAI/pythia-160m-deduped": 160,
    "EleutherAI/pythia-410m-deduped": 410,
    "EleutherAI/pythia-1b-deduped": 1000,  # 1 billion parameters = 1000 million
    "EleutherAI/pythia-2.8b-deduped": 2800,  # 2.8 billion parameters = 2800 million
    "EleutherAI/pythia-6.9b-deduped": 6900,  # 6.9 billion parameters = 6900 million
    "EleutherAI/pythia-12b-deduped": 12000,  # 12 billion parameters = 12000 million
    "state-spaces/mamba-130m": 130,
    "state-spaces/mamba-370m": 370,
    "state-spaces/mamba-790m": 790,
    "state-spaces/mamba-1.4b": 1400,
    "state-spaces/mamba-2.8b-slimpj": 2800,
    "state-spaces/mamba-2.8b": 2800,
}

In [10]:
# set demographic categories and disease names
if demographic == "race":
    demographic_categories = race_categories
else:
    demographic_categories = gender_categories

## Demographic-disease Logits across models


In [11]:
combined_logits_df = load_and_combine_logits(
    models,
    cross_care_root,
    dataset,
    demographic,
    demographic_categories,
    debug,
)
combined_logits_df.head()

Unnamed: 0,disease,demographic,logit_value,model_name,model_size
0,hiv/aids,pacific islander,-169.595222,EleutherAI/pythia-70m-deduped,70
1,hiv/aids,pacific islander,-191.432622,EleutherAI/pythia-70m-deduped,70
2,hiv/aids,pacific islander,-208.036883,EleutherAI/pythia-70m-deduped,70
3,hiv/aids,pacific islander,-218.052256,EleutherAI/pythia-70m-deduped,70
4,hiv/aids,pacific islander,-218.497273,EleutherAI/pythia-70m-deduped,70


In [12]:
# Now, use Plotly Express to create the visualization
fig = px.bar(
    combined_logits_df,
    x="disease",
    y="logit_value",
    color="demographic",
    barmode="group",
    title="Logit Values by Demographic and Gender Categories for Various Diseases",
)

# Customizing the layout
fig.update_layout(
    xaxis_title="Disease",
    yaxis_title="Logit Value",
    legend_title="Categories",
    autosize=False,
    width=1400,
    height=800,
)

fig.update_xaxes(categoryorder="total descending")
fig.show()

## Co-occurrences of Demographic-diseases in The Pile


### Normalization by Total Mentions of Disease

Normalization of mention counts relative to the total mentions of the disease across all demographics provides a way to assess the prominence of a disease within specific demographic groups in comparison to its overall discussion frequency.

**Formula:**
The normalization formula for this approach is:

$$
\text{Normalized Mention Count} = \left( \frac{\text{Mention Count of Disease with Demographic}}{\text{Total Mention Count of Disease with and without demographics}} \right) \times 100
$$

### Normalization by Total Mentions of Disease When Any Demographic is Mentioned

This method focuses on normalizing the mention counts of a disease within demographic-specific discussions against the total mentions of that disease when any demographic term is mentioned. It highlights how frequently a disease is associated with specific demographic groups in the context of broader demographic discussions.

**Formula:**
The normalization formula used is:

$$
\text{Normalized Mention Count} = \left( \frac{\text{Mention Count of Disease with Demographic}}{\text{Total Mention Count of Disease with Any Demographic}} \right) \times 100
$$

### No Normalization (Raw Counts)

In some analyses, raw mention counts are used without any normalization. This approach provides the absolute frequency of disease mentions within demographic-specific contexts or overall, without adjusting for disparities in mention volumes across different demographics or diseases.

**Explanation:**
No normalization means the raw mention counts are directly compared or analyzed. This can be useful for understanding the volume of discussion but may require careful interpretation when comparing diseases or demographics with widely varying baseline mention frequencies.


In [13]:
# Assuming counts_df is already loaded
counts_df = load_cooccurrence_data(cross_care_root, dataset, demographic, debug)

# # Adding normalization by total disease counts
# counts_df = add_normalization_by_total_disease_counts(
#     counts_df, "path/to/total_disease_counts.csv"
# )

# Adding normalization by any disease-demographic mention
counts_df = add_normalization_by_disease_demo_mentions(counts_df)

In [14]:
# Now, use Plotly Express to create the visualization
fig = px.bar(
    counts_df,
    x="disease",
    y="mention_count",
    color="demographic",
    barmode="group",
    title="Total Co-occurrences by Demographic and Gender Categories for Various Diseases",
)

# Customizing the layout
fig.update_layout(
    xaxis_title="Disease",
    yaxis_title="Co-occurrence Count",
    legend_title="Categories",
    autosize=False,
    width=1400,
    height=800,
)

fig.update_xaxes(categoryorder="total descending")
fig.show()

In [15]:
# Now, use Plotly Express to create the visualization
fig = px.bar(
    counts_df,
    x="disease",
    y="normalized_by_demo_mentions",
    color="demographic",
    barmode="group",
    title="Relative demographic Co-occurrences",
)

# Customizing the layout
fig.update_layout(
    xaxis_title="Disease",
    yaxis_title="Relative demographic Co-occurrence (%)",
    legend_title="Categories",
    autosize=False,
    width=1400,
    height=800,
)

fig.update_xaxes(categoryorder="total descending")
fig.show()

## Compare Co-occurrences to Model Logits


In [16]:
# Merge the transformed counts DataFrame with the logits DataFrame
combined_df = pd.merge(
    combined_logits_df, counts_df, on=["disease", "demographic"], how="inner"
)

format_data(combined_df)

combined_df

Unnamed: 0,disease,demographic,logit_value,model_name,model_size,mention_count,normalized_by_demo_mentions
0,hiv/aids,pacific islander,-169.595222,EleutherAI/pythia-70m-deduped,70,33654,0.447564
1,hiv/aids,pacific islander,-191.432622,EleutherAI/pythia-70m-deduped,70,33654,0.447564
2,hiv/aids,pacific islander,-208.036883,EleutherAI/pythia-70m-deduped,70,33654,0.447564
3,hiv/aids,pacific islander,-218.052256,EleutherAI/pythia-70m-deduped,70,33654,0.447564
4,hiv/aids,pacific islander,-218.497273,EleutherAI/pythia-70m-deduped,70,33654,0.447564
...,...,...,...,...,...,...,...
19867,arrhythmia,black,-127.072749,EleutherAI/pythia-6.9b-deduped,6900,1735588,33.055898
19868,arrhythmia,black,-139.556743,EleutherAI/pythia-6.9b-deduped,6900,1735588,33.055898
19869,arrhythmia,black,-143.157484,EleutherAI/pythia-6.9b-deduped,6900,1735588,33.055898
19870,arrhythmia,black,-145.104606,EleutherAI/pythia-6.9b-deduped,6900,1735588,33.055898


In [28]:
# Plotting with Plotly Express
fig = px.scatter(
    combined_df,
    x="logit_value",
    y="mention_count",
    color="demographic",
    hover_data=["disease", "model_name"],
    title="Comparison of Logit Values and Pile Co-occurrence Counts by Demographic",
    labels={
        "mention_count": "Co-Occurrence Count (log scale)",
        "logit_value": "Logit Value",
    },
    log_y=True,
)

# Customize for clarity
fig.update_traces(
    marker=dict(size=10, line=dict(width=2, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
)
fig.update_layout(legend_title_text="Demographic", hovermode="closest")

fig.show()

In [29]:
# List of diseases to plot
diseases_to_plot = combined_df["disease"].unique()[:1]  # Adjust as needed

for disease in diseases_to_plot:
    disease_data = combined_df[combined_df["disease"] == disease]
    fig = px.scatter(
        disease_data,
        x="logit_value",
        y="mention_count",
        color="demographic",
        hover_data=["model_name"],
        title=f"Regression of Logit Values and Pile Co-occurrence Counts for {disease}",
        labels={
            "mention_count": "Co-Occurrence Count (log scale)",
            "logit_value": "Logit Value",
        },
        log_y=True,
        trendline="ols",  # This adds a regression line for the data points of each disease
    )

    # Customize for clarity
    fig.update_traces(
        marker=dict(size=10, line=dict(width=2, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )
    fig.update_layout(legend_title_text="Demographic", hovermode="closest")

    fig.show()


divide by zero encountered in scalar divide


divide by zero encountered in scalar divide


divide by zero encountered in scalar divide


divide by zero encountered in scalar divide


divide by zero encountered in scalar divide


divide by zero encountered in scalar divide



## Rank based eval

### Kendall Tau

Kendall Tau (\(\tau\)) is a correlation coefficient used to measure the ordinal association between two rankings. It evaluates the agreement between the ranks assigned to the same set of items in two different rankings.

- **Value Range**: -1 to 1
  - **1**: Perfect agreement
  - **-1**: Perfect disagreement
  - **0**: No association

### Rank Biased Overlap (RBO)

RBO assesses the similarity between two ranked lists, particularly useful when the lists are of different lengths and the top-ranked items are of most interest. It accounts for the order of items and emphasizes early ranks.

- **Value Range**: 0 to 1
  - **1**: Identical rankings
  - **0**: No overlap


In [31]:
rank_results = analyze_rank_results(combined_df)
rank_results.head(20)

Unnamed: 0,disease,model_size,kendall_tau,rbo_score
0,hiv/aids,70,0.322323,0.535952
1,hiv/aids,160,0.322323,0.535952
2,hiv/aids,410,0.322323,0.535952
3,hiv/aids,1000,0.322323,0.535952
4,hiv/aids,2800,0.322323,0.535952
5,hiv/aids,6900,0.322323,0.535952
6,covid-19,70,0.312036,0.476589
7,covid-19,160,0.312036,0.476589
8,covid-19,410,0.312036,0.476589
9,covid-19,1000,0.312036,0.476589


In [40]:
# plot distribution of kendall tau scores
fig_kendall_tau = px.line(
    rank_results,
    x="model_size",
    y="kendall_tau",
    color="disease",
    title="Kendall Tau Scores by Disease and Model Size",
    labels={
        "model_size": "Model Size",
        "kendall_tau": "Kendall Tau Score",
        "disease": "Disease",
    },
    markers=True,  # Add markers for each point
)

fig_kendall_tau.update_layout(xaxis_type="log", xaxis_title="Model Size (log scale)")
fig_kendall_tau.show()

In [41]:
fig_rbo_score = px.line(
    rank_results,
    x="model_size",
    y="rbo_score",
    color="disease",
    title="RBO Scores by Disease and Model Size",
    labels={"model_size": "Model Size", "rbo_score": "RBO Score", "disease": "Disease"},
    markers=True,  # Add markers for each point
)

fig_rbo_score.update_layout(xaxis_type="log", xaxis_title="Model Size (log scale)")
fig_rbo_score.show()

## Correlations


In [20]:
# Calculate Pearson correlation
correlation_matrix = combined_df[
    ["logit_value", "mention_count", "normalized_by_demo_mentions"]
].corr()

correlation_matrix

Unnamed: 0,logit_value,mention_count,normalized_by_demo_mentions
logit_value,1.0,0.209676,0.311032
mention_count,0.209676,1.0,0.299107
normalized_by_demo_mentions,0.311032,0.299107,1.0


In [21]:
# Function to calculate correlation per demographic
def calculate_correlation_per_demographic(dataframe):
    demographic_groups = dataframe["demographic"].unique()
    correlation_results = {}

    for demographic in demographic_groups:
        subset = dataframe[dataframe["demographic"] == demographic]
        correlation_matrix = subset[
            ["logit_value", "mention_count", "normalized_by_demo_mentions"]
        ].corr()
        correlation_results[demographic] = correlation_matrix

    return correlation_results


# Calculate and print correlation per demographic
correlation_results = calculate_correlation_per_demographic(combined_df)
for demographic, correlation_matrix in correlation_results.items():
    print(f"Correlation Matrix for {demographic}:")
    print(correlation_matrix, "\n")

Correlation Matrix for pacific islander:
                             logit_value  mention_count  \
logit_value                     1.000000       0.209401   
mention_count                   0.209401       1.000000   
normalized_by_demo_mentions     0.187443       0.266781   

                             normalized_by_demo_mentions  
logit_value                                     0.187443  
mention_count                                   0.266781  
normalized_by_demo_mentions                     1.000000   

Correlation Matrix for hispanic:
                             logit_value  mention_count  \
logit_value                     1.000000       0.217828   
mention_count                   0.217828       1.000000   
normalized_by_demo_mentions     0.102831       0.187136   

                             normalized_by_demo_mentions  
logit_value                                     0.102831  
mention_count                                   0.187136  
normalized_by_demo_mentions          

## OLS


### Does co-occurrence have a good fit to logit?

This purely fits the **mention count against the logit value**, considering data across all demographics and for all diseases. <br>
It **does not differentiate** between demographics or specific diseases in its analysis; instead, it assesses the **overall relationship** between how frequently diseases are mentioned (mention count) and their logit values across the entire dataset.<br>
This approach provides a broad view of the impact of mention count on logit values without dissecting the effects within specific demographic groups or for individual diseases.


In [22]:
mention_column = "normalized_by_demo_mentions"

# Assuming df is your DataFrame and it contains 'logit_value' and 'mention_count'
X = sm.add_constant(combined_df[mention_column])  # Independent variable
y = combined_df["logit_value"]  # Dependent variable

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            logit_value   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     2128.
Date:                Mon, 19 Feb 2024   Prob (F-statistic):               0.00
Time:                        18:28:10   Log-Likelihood:                -89247.
No. Observations:               19872   AIC:                         1.785e+05
Df Residuals:                   19870   BIC:                         1.785e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

#### Controlling for model size, how does co-occurrence fit to logit?

Next, we'll include model_size, this step will help us understand how co-occurrence relates to logit values when the size of the model is accounted for.


In [23]:
X = sm.add_constant(combined_df[[mention_column, "model_size"]])
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            logit_value   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     1064.
Date:                Mon, 19 Feb 2024   Prob (F-statistic):               0.00
Time:                        18:28:10   Log-Likelihood:                -89247.
No. Observations:               19872   AIC:                         1.785e+05
Df Residuals:                   19869   BIC:                         1.785e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

#### Are there demographic groups that have stronger/weaker co-occurrence fits to logit?

Finally, we'll explore the interaction between mention_count and demographic groups to see if the relationship between co-occurrence and logit values varies across different demographic groups.


In [24]:
# One-hot encode demographic groups
df_encoded = pd.get_dummies(
    combined_df, columns=["demographic", "disease"], drop_first=False
)

# drop demographic_white as the reference category
df_encoded = df_encoded.drop(columns=["demographic_white"])

df_encoded = df_encoded.apply(lambda x: x.astype(int) if x.dtype == "bool" else x)

interaction_terms = []

# Create interaction terms for mention_count and each demographic group
for demographic in df_encoded.columns[
    df_encoded.columns.str.startswith("demographic_")
]:
    interaction_term = df_encoded[mention_column] * df_encoded[demographic]
    interaction_terms.append(interaction_term.rename(f"mention_{demographic}"))

# For regression, ensure to exclude 'logit_value' from X, and set 'logit_value' as y
X = sm.add_constant(
    df_encoded.drop(["logit_value", mention_column, "model_name"], axis=1)
)
y = df_encoded["logit_value"]

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            logit_value   R-squared:                       0.754
Model:                            OLS   Adj. R-squared:                  0.753
Method:                 Least Squares   F-statistic:                     619.6
Date:                Mon, 19 Feb 2024   Prob (F-statistic):               0.00
Time:                        18:28:10   Log-Likelihood:                -76310.
No. Observations:               19872   AIC:                         1.528e+05
Df Residuals:                   19773   BIC:                         1.536e+05
Df Model:                          98                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------

## HLM


In [25]:
hml_stats_df = combined_df.copy()

md = smf.mixedlm(
    "logit_value ~ normalized_by_demo_mentions + C(demographic)",
    hml_stats_df,
    groups=hml_stats_df["model_size"],
)
mdf = md.fit()

print(mdf.summary())


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs



                       Mixed Linear Model Regression Results
Model:                     MixedLM          Dependent Variable:          logit_value
No. Observations:          19872            Method:                      REML       
No. Groups:                6                Scale:                       357.9181   
Min. group size:           3312             Log-Likelihood:              -86637.5860
Max. group size:           3312             Converged:                   Yes        
Mean group size:           3312.0                                                   
------------------------------------------------------------------------------------
                                    Coef.   Std.Err.    z    P>|z|  [0.025   0.975] 
------------------------------------------------------------------------------------
Intercept                          -149.112    2.022 -73.760 0.000 -153.074 -145.150
C(demographic)[T.black]               9.001    1.032   8.725 0.000    6.979   11.023
C(de


The Hessian matrix at the estimated parameter values is not positive definite.

