<a href="https://colab.research.google.com/github/aholloman79/A-Primer-on-Scientific-Programming-with-Python/blob/master/Python_Interactions%2C_Qualitative_Predictors_in_Healthcare_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Setting a random seed for reproducibility
np.random.seed(42)

# Simulating a large healthcare dataset
n_samples = 50000
age = np.random.randint(20, 80, size=n_samples)
cholesterol = np.random.uniform(150, 300, size=n_samples)
income = np.random.uniform(20000, 100000, size=n_samples)
access_to_healthcare = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)
gender = np.random.choice(['Male', 'Female'], size=n_samples)

interaction_term = age * (cholesterol ** 0.5)
cholesterol_squared = cholesterol ** 2

# Response variable
health_outcome = (
    50 - 0.3 * age + 0.02 * cholesterol - 0.00001 * cholesterol_squared +
    0.0005 * income + 5 * np.random.randn(n_samples)
)

# Creating the DataFrame
data = pd.DataFrame({
    'age': age,
    'cholesterol': cholesterol,
    'cholesterol_squared': cholesterol_squared,
    'income': income,
    'access_to_healthcare': access_to_healthcare,
    'gender': gender,
    'health_outcome': health_outcome,
    'interaction_term': interaction_term
})

# Convert categorical variables to dummy variables
data = pd.get_dummies(data, columns=['access_to_healthcare', 'gender'], drop_first=False)

# Ensuring all columns are numeric
# This will address any hidden object-type data issues
data = data.apply(pd.to_numeric, errors='coerce')

# Check for any missing or invalid data
if data.isnull().sum().sum() > 0:
    print("Warning: Missing or invalid data found. Filling with 0.")
    data = data.fillna(0)  # Replace missing values with 0 for simplicity

# Verifying data types
print("Data types of DataFrame columns:")
print(data.dtypes)

# Defining predictors and response variable
predictors = [
    'age', 'cholesterol', 'cholesterol_squared', 'income',
    'access_to_healthcare_Low', 'access_to_healthcare_Medium', 'access_to_healthcare_High',
    'gender_Female', 'gender_Male', 'interaction_term'
]
response = 'health_outcome'

# Preparing X (predictors) and y (response)
X = sm.add_constant(data[predictors])  # Adding constant for intercept
y = data[response]

# Ensuring both X and y are numpy arrays with proper numeric types
X = np.asarray(X, dtype=float)
y = np.asarray(y, dtype=float)

# Fitting the regression model
model = sm.OLS(y, X).fit()

# Displaying the regression summary
print(model.summary())































Data types of DataFrame columns:
age                              int64
cholesterol                    float64
cholesterol_squared            float64
income                         float64
health_outcome                 float64
interaction_term               float64
access_to_healthcare_High         bool
access_to_healthcare_Low          bool
access_to_healthcare_Medium       bool
gender_Female                     bool
gender_Male                       bool
dtype: object
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.864
Model:                            OLS   Adj. R-squared:                  0.864
Method:                 Least Squares   F-statistic:                 3.971e+04
Date:                Fri, 13 Dec 2024   Prob (F-statistic):               0.00
Time:                        17:59:31   Log-Likelihood:            -1.5140e+05
No. Observations:               50000   AIC:      