# Import libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the input table from disk, then display its (rows, columns) shape
input_path = "demographic_icd_ML_aditional_f_column.txt"
dataForRegression = pd.read_csv(input_path)
dataForRegression.shape

# Inspect the input data

In [None]:
# Display the first row of the loaded table
dataForRegression.head(1)

In [None]:
# Display the column labels of the loaded table
dataForRegression.columns

In [None]:
# Remove, in place, the columns that none of the model formulas below refer to
columns_to_drop = ['Unnamed: 0', 'Race', 'M_Status', 'State', 'predicted_f_code']
dataForRegression.drop(columns=columns_to_drop, inplace=True)

# Display the first row of the trimmed table
dataForRegression.head(1)

# Reformat the data for the model

In [None]:
# Re-encode the predictor columns for modelling.

# Two-level text columns become 0/1 indicators; any value that is not a key
# of its mapping turns into NaN (behaviour of Series.map with a dict).
binary_encodings = {
    'Hispanic': {'Y': 1, 'N': 0},
    'Sex': {'M': 1, 'F': 0},
}
for column_name, encoding in binary_encodings.items():
    dataForRegression[column_name] = dataForRegression[column_name].map(encoding)

# Age must be numeric; pd.to_numeric raises if a value cannot be parsed
dataForRegression['Age'] = pd.to_numeric(dataForRegression['Age'])

# FPL is stored as text
dataForRegression['FPL'] = dataForRegression['FPL'].astype(str)


In [None]:
# Display the first row of the table (head(1) returns a single row)
dataForRegression.head(1)

In [None]:
# Make sure FPL is stored as text. This repeats the conversion done in the
# encoding cell; re-applying astype(str) to string values leaves them unchanged.
dataForRegression['FPL'] = dataForRegression['FPL'].astype(str)

# Rename the trailing columns, by position, to the SUD outcome names.
# NOTE(review): assumes the last ten columns of the table are the SUD
# indicators in exactly this order - confirm against the input file.
sud_column_names = [
    "Alcohol", "Opioid", "Cannabis", "Sedative_hypnoti_anxiolytic", "Cocaine",
    "OtherStimulant", "Hallucinogen", "NicotineDependence", "Inhalant",
    "Other_psychoactive_substance",
]
trailing_columns = dataForRegression.columns[-len(sud_column_names):]

# Fail with a clear message instead of an IndexError when the table is too narrow
if len(trailing_columns) != len(sud_column_names):
    raise ValueError(
        f"Expected at least {len(sud_column_names)} columns to rename, "
        f"found {len(trailing_columns)}"
    )

dataForRegression.rename(columns=dict(zip(trailing_columns, sud_column_names)), inplace=True)


In [None]:
# Display the first row of the table (head(1) returns a single row)
dataForRegression.head(1)

# Regression analysis

In [None]:
# Display the column labels of the table
dataForRegression.columns

In [None]:
import statsmodels.api as sm

# Outcome columns; one binomial GLM is fitted per entry
sud_list = [
    "Alcohol",
    "Opioid",
    "Cannabis",
    "Sedative_hypnoti_anxiolytic",
    "Cocaine",
    "OtherStimulant",
    "Hallucinogen",
    "NicotineDependence",
    "Inhalant",
    "Other_psychoactive_substance",
]

# Right-hand side shared by every full-population model
predictor_terms = " + ".join(["Sex", "Age", "Hispanic", "FPL", "Lang"])

# Fit each outcome on the full table and print its summary
for sud in sud_list:
    print(sud)
    formula = f"{sud} ~ {predictor_terms}"
    unfitted_model = sm.GLM.from_formula(
        formula,
        data=dataForRegression,
        family=sm.families.Binomial(),
    )
    model = unfitted_model.fit()
    print(model.summary())
    print("####")


In [None]:
# Print, for every SUD outcome column, how many rows hold each distinct value
for sud_column in sud_list:
    print(dataForRegression[sud_column].value_counts())

# Save the results in a table

In [None]:
from statsmodels.stats.multitest import multipletests

# One result table per SUD outcome, keyed by outcome name
results_dict = {}

# Fit one binomial GLM per SUD on the full table and tabulate its terms
for sud in sud_list:
    formula = sud + " ~ Sex + Age + Hispanic + FPL + Lang"
    model = sm.GLM.from_formula(formula, data=dataForRegression, family=sm.families.Binomial()).fit()

    # Benjamini-Hochberg correction on the UNROUNDED p-values: rounding first
    # collapses small p-values to 0 / creates ties and distorts the adjustment.
    # NOTE(review): the correction is applied within each model, across all of
    # its fitted terms, not across the SUD models - confirm this is intended.
    adjusted_p_values = multipletests(model.pvalues, method='fdr_bh')[1]

    # Round only for presentation; the odds ratio is exponentiated from the
    # full-precision coefficient, not from its rounded value.
    result_df = pd.DataFrame({
        # Outcome label stored as a column so it survives to_csv(index=False)
        "SUD": sud,
        "Predictor": model.params.index,
        "Coefficient": model.params.round(3).values,
        "Odds_Ratio": np.exp(model.params).round(3).values,
        "P_value": model.pvalues.round(3).values,
        "Adjusted_P_Value": np.round(adjusted_p_values, 3),
    })

    results_dict[sud] = result_df

# Stack the per-SUD tables; the outcome name is also the outer index level
all_results_df = pd.concat(results_dict.values(), keys=results_dict.keys())

# Write results to CSV (index is dropped; the "SUD" column identifies each row)
all_results_df.to_csv("./regression_analysis_results_fullPopulation.csv", index=False)


# Regression analysis for the Hispanic subpopulation

In [None]:
# Keep only the rows whose Hispanic indicator equals 1
hispanic_mask = dataForRegression['Hispanic'].eq(1)
dataForRegressionHispanic = dataForRegression[hispanic_mask]

In [None]:
# Fit one binomial GLM per SUD on the Hispanic subset and print its summary.
# Hispanic is left out of the formula.
for sud in sud_list:
    print(sud)
    formula = f"{sud} ~ Sex + Age + FPL + Lang"
    unfitted_model = sm.GLM.from_formula(
        formula,
        data=dataForRegressionHispanic,
        family=sm.families.Binomial(),
    )
    model = unfitted_model.fit()
    print(model.summary())
    print("####")

In [None]:
# Remove a leftover `results_list` variable if one exists.
# NOTE(review): nothing visible in this notebook defines `results_list`;
# this guard looks like a leftover and can probably be deleted - confirm.
try:
    del results_list
except NameError:
    pass

# One result table per SUD outcome, keyed by outcome name
results_dict = {}

# Fit one binomial GLM per SUD on the Hispanic subset and tabulate its terms
for sud in sud_list:
    formula = sud + " ~ Sex + Age + FPL + Lang"
    model = sm.GLM.from_formula(formula, data=dataForRegressionHispanic, family=sm.families.Binomial()).fit()

    # Benjamini-Hochberg correction on the UNROUNDED p-values: rounding first
    # collapses small p-values to 0 / creates ties and distorts the adjustment.
    # NOTE(review): the correction is applied within each model, across all of
    # its fitted terms, not across the SUD models - confirm this is intended.
    adjusted_p_values = multipletests(model.pvalues, method='fdr_bh')[1]

    # Round only for presentation; the odds ratio is exponentiated from the
    # full-precision coefficient, not from its rounded value.
    result_df = pd.DataFrame({
        # Outcome label stored as a column so it survives to_csv(index=False)
        "SUD": sud,
        "Predictor": model.params.index,
        "Coefficient": model.params.round(3).values,
        "Odds_Ratio": np.exp(model.params).round(3).values,
        "P_value": model.pvalues.round(3).values,
        "Adjusted_P_Value": np.round(adjusted_p_values, 3),
    })

    results_dict[sud] = result_df

# Stack the per-SUD tables; the outcome name is also the outer index level
hispanic_results_df = pd.concat(results_dict.values(), keys=results_dict.keys())

# Write results to CSV (index is dropped; the "SUD" column identifies each row)
hispanic_results_df.to_csv("./regression_analysis_results_hispanicPopulation.csv", index=False)


# Regression analysis for the non-Hispanic subpopulation

In [None]:
# Keep only the rows whose Hispanic indicator equals 0
non_hispanic_mask = dataForRegression['Hispanic'].eq(0)
dataForRegressionNoHispanic = dataForRegression[non_hispanic_mask]


In [None]:
# Fit one binomial GLM per SUD on the non-Hispanic subset and print its summary.
# Hispanic is left out of the formula.
for sud in sud_list:
    print(sud)
    formula = f"{sud} ~ Sex + Age + FPL + Lang"
    unfitted_model = sm.GLM.from_formula(
        formula,
        data=dataForRegressionNoHispanic,
        family=sm.families.Binomial(),
    )
    model = unfitted_model.fit()
    print(model.summary())
    print("####")


In [None]:
# Remove a leftover `results_list` variable if one exists.
# NOTE(review): nothing visible in this notebook defines `results_list`;
# this guard looks like a leftover and can probably be deleted - confirm.
try:
    del results_list
except NameError:
    pass

# One result table per SUD outcome, keyed by outcome name
results_dict = {}

# Fit one binomial GLM per SUD on the non-Hispanic subset and tabulate its terms
for sud in sud_list:
    formula = sud + " ~ Sex + Age + FPL + Lang"
    model = sm.GLM.from_formula(formula, data=dataForRegressionNoHispanic, family=sm.families.Binomial()).fit()

    # Benjamini-Hochberg correction on the UNROUNDED p-values: rounding first
    # collapses small p-values to 0 / creates ties and distorts the adjustment.
    # NOTE(review): the correction is applied within each model, across all of
    # its fitted terms, not across the SUD models - confirm this is intended.
    adjusted_p_values = multipletests(model.pvalues, method='fdr_bh')[1]

    # Round only for presentation; the odds ratio is exponentiated from the
    # full-precision coefficient, not from its rounded value.
    result_df = pd.DataFrame({
        # Outcome label stored as a column so it survives to_csv(index=False)
        "SUD": sud,
        "Predictor": model.params.index,
        "Coefficient": model.params.round(3).values,
        "Odds_Ratio": np.exp(model.params).round(3).values,
        "P_value": model.pvalues.round(3).values,
        "Adjusted_P_Value": np.round(adjusted_p_values, 3),
    })

    results_dict[sud] = result_df

# Stack the per-SUD tables; the outcome name is also the outer index level
non_hispanic_results_df = pd.concat(results_dict.values(), keys=results_dict.keys())

# Write results to CSV (index is dropped; the "SUD" column identifies each row)
non_hispanic_results_df.to_csv("./regression_analysis_results_non_hispanicPopulation.csv", index=False)


In [None]:
import sys

# Report the version string of the running interpreter
print(f"Python version: {sys.version}")