In [2]:
!pip install pandas scikit-learn matplotlib seaborn statsmodels



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [None]:
path = "data/BYAGE.txt"
op_path = "data/byage_processed.csv"

# Open the text file for reading
with open(path, "r") as file:
    # Read the contents of the file
    file_content = file.read()

# Replace " with empty string
file_content = file_content.replace('"', "")

# Write the modified content to a CSV file
with open(op_path, "w") as csv_file:
    csv_file.write(file_content)

### Create utility functions

In [None]:

def extract_max_age(age_range):
  """Extracts the maximum age from an age range string.

  Args:
      age_range: A string representing the age range (e.g., "1-4", "70+").

  Returns:
      The maximum age as an integer, or None if the format is invalid.
  """
  if '-' in age_range:
    return int(age_range.split('-')[-1])
  elif '+' in age_range:
    return int(age_range[:-1])
  elif '<' in age_range:
    return 1
  else:
    return int(age_range)

def get_dataframe(op_path, low_rate=True):
  """Reads and preprocesses the cancer data.

  Args:
      op_path: Path to the CSV file containing the cancer data.
      low_rate: If True, filter data to include only records with rate less than 100 (default).
                 If False, filter data to include only records with rate greater than 100.

  Returns:
      A pandas DataFrame containing the preprocessed data.
  """
  dtype = {
      'AGE': str,
      'CI_LOWER': np.float64,
      'CI_UPPER': np.float64,
      'COUNT': int,
      'EVENT_TYPE': str,
      'POPULATION': int,
      'RACE': str,
      'RATE': np.float64,
      'SEX': str,
      'SITE': str,
      'YEAR': str,
  }
  cols = {
      'AGE': 'Age',
      'CI_LOWER': 'ci_lower',
      'CI_UPPER': 'ci_upper',
      'COUNT': 'count',
      'EVENT_TYPE': 'event_type',
      'POPULATION': 'population',
      'RACE': 'race',
      'RATE': 'rate',
      'SEX': 'sex',
      'SITE': 'site',
      'YEAR': 'year',
  }

  df = pd.read_csv(op_path, header=1, delimiter='|', dtype=dtype, low_memory=False)
  df.columns = cols

  # Replace special characters with NaN
  df.replace('~|.', np.nan, inplace=True)

  # Handle missing values (consider imputation techniques instead of dropping)
  # df.dropna(inplace=True)

  # Convert columns to appropriate data types
  df['AGE'] = df['AGE'].apply(extract_max_age)
  df['YEAR'] = pd.to_numeric(df['YEAR'], errors='coerce')

  # Filter data based on year and race
  df = df[df['YEAR'] != '2016-2020']
  df = df[df['RACE'] != 'All Races']

  df['RATE'] = pd.to_numeric(df['RATE'], errors='coerce')
  # Filter data based on rate (optional)
  if low_rate:
    df = df[df['RATE'] < 100]
  else:
    df = df[df['RATE'] > 100]

  # Drop unnecessary columns
  df = df.drop(columns=['SITE'])

  return df


### Chi-Square Test

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Read the data using the get_dataframe function (assuming "cancer_data.csv" is your file path)
data = get_dataframe(op_path=op_path)

# Combine Mortality and Incidence (assuming this aligns with research question)
data["EVENT_TYPE"] = data["EVENT_TYPE"].replace({"Mortality": "Having Cancer", "Incidence": "Having Cancer"})

# Consider merging Race categories if necessary (e.g., using a custom function)
# ... (implement logic for merging race categories if needed)

# Filter data focusing on "Having Cancer"
filtered_data = data[data["EVENT_TYPE"] == "Having Cancer"]

# Contingency table (consider using filtered_data["RACE"] if not merging)
contingency_table = pd.crosstab(filtered_data["RACE"], filtered_data["RATE"])

# Chi-Square Test
chi2, pval, dof, expected = chi2_contingency(contingency_table.values)

# Print the chi-square test summary
print("Chi-Square Statistic:", chi2)
print("Degrees of Freedom:", dof)
print("p-value:", pval)

# Optionally, print the expected frequency table
print("Expected Frequencies:\n", expected)

# Interpretation
if pval < 0.05:
  print("There is a statistically significant association between race and having cancer (p-value:", pval, ").")
else:
  print("There is not sufficient evidence to conclude a statistically significant association (p-value:", pval, ").")


### ANOVA Test - OLS Result

In [None]:
import pandas as pd
from statsmodels.formula.api import ols

# Read the data using the get_dataframe function (assuming "cancer_data.csv" is your file path)
data = get_dataframe(op_path)

# Consider combining Mortality and Incidence (if necessary)
# ... (implement your logic for combining mortality and incidence if needed)

# Assuming data has columns "RATE", "RACE", and potentially "SEX" (optional factor)
formula = "RATE ~ RACE"  # Add "+ SEX" if including sex as a factor

# Fit the linear model
model = ols(formula, data=data).fit()

print(model.summary())


The report you provided shows the results of an Ordinary Least Squares (OLS) regression analysis, likely examining the relationship between race and cancer rates (assuming "RATE" is the dependent variable). Here's a breakdown of the key elements:

Model Summary:

Dep. Variable: "RATE" - This is the variable being predicted by the model.
R-squared: 0.043 - This indicates a weak positive association between the independent variables and the dependent variable. Only 4.3% of the variance in "RATE" is explained by the model.
Adj. R-squared: 0.043 - Adjusted R-squared is similar to R-squared but accounts for the number of independent variables, providing a slightly more reliable estimate of the model's explanatory power.
F-statistic: 1490. (highly significant with p-value < 0.00) - This statistic tests the null hypothesis that all regression coefficients are zero (no relationship between race and rate). The very high F-statistic and low p-value strongly reject the null hypothesis, indicating a statistically significant relationship between race and cancer rates.
Prob (F-statistic): 0.00 - This confirms the significance of the F-statistic.
No. Observations: 131640 - This is the sample size used for the analysis.
Coefficient Estimates:

The table shows the estimated coefficients for each independent variable (RACE categories) and the intercept.
Intercept: 19.2589 - This represents the predicted average cancer rate when all RACE categories are zero (which might not be a meaningful value in reality).
RACE Coefficients: Each coefficient represents the difference in average cancer rate compared to the reference category (likely "Non-Hispanic White").
Non-Hispanic American Indian/Alaska Native: 20.2190 higher average rate.
Non-Hispanic Asian/Pacific Islander: 3.6173 higher average rate.
Non-Hispanic Black: 1.4015 higher average rate.
Non-Hispanic White: This category is used as the reference, so its coefficient is -2.5872 (shown as zero but interpreted as no difference compared to itself). This implies Non-Hispanic Whites have a lower average cancer rate compared to the other categories in the model.
Diagnostics:

Omnibus, Prob(Omnibus), Jarque-Bera (JB), Prob(JB): These tests assess normality of the error terms (residuals) in the model. The highly significant p-values (0.00) indicate the residuals are not normally distributed. This might be a concern if normality assumptions are crucial for the chosen model.
Skew: 1.523, Kurtosis: 4.653 - These values suggest the residuals are positively skewed and have a higher kurtosis (more peaked) than a normal distribution.
Cond. No.: 6.04 - This condition number indicates there might be some multicollinearity among the independent variables. However, the value is not excessively high, so it might not be a major concern.
Notes:

The note clarifies that the standard errors (shown in the coefficient table) assume a correctly specified covariance matrix for the errors.
Overall Interpretation:

While the model statistically demonstrates a relationship between race and cancer rates, the R-squared value is low, suggesting other factors significantly influence cancer rates. The diagnostics highlight potential issues with normality and multicollinearity. Here are some additional considerations:

Explore alternative model specifications or transformations to improve normality and address multicollinearity (if necessary).
Investigate the role of other factors that could influence cancer rates.
Consider visualizations like boxplots to explore the relationship between race and cancer rates in more detail.
Remember that statistical significance doesn't necessarily imply causality. Other factors might explain the observed associations between race and cancer rates.

In [None]:
import pandas as pd
from statsmodels.formula.api import ols

def calculate_eta_squared_r_squared(formula, data):
  """
  Calculates eta-squared from R-squared and Adjusted R-squared (linear regression).

  Args:
      formula (str): The formula for the linear regression model.
      data (pandas.DataFrame): The data used for the regression.

  Returns:
      float: The estimated eta-squared value.
  """
  # Fit the linear regression model
  model = ols(formula, data=data).fit()

  # Get R-squared and adjusted R-squared
  r_squared = model.rsquared
  adjusted_r_squared = model.rsquared_adj

  # Calculate eta-squared using adjusted R-squared (more reliable)
  eta_squared = adjusted_r_squared / (1 - adjusted_r_squared)

  return eta_squared

# Example usage (replace with your actual formula and data)
formula = "RATE ~ RACE"
data = get_dataframe(op_path=op_path)

eta_squared = calculate_eta_squared_r_squared(formula, data)

print("Eta-squared (adjusted R-squared):", eta_squared)


The calculated Eta-squared value of 0.0452 (using adjusted R-squared) indicates a small effect size. Here's a breakdown of the interpretation:

Effect size: Eta-squared values closer to 0 signify a smaller effect. In this case, the model explains only about 4.52% of the variance in the dependent variable ("RATE") based on the independent variable(s) (likely "RACE").
Interpretation: This suggests that racial categories alone don't explain a substantial portion of the variation in cancer rates. There are likely other significant factors influencing cancer risk.
Considerations:

Eta-squared is sensitive to sample size. A larger sample size can inflate the value.
It doesn't tell you the direction of the effect (positive or negative relationship between race and cancer rates).

Chi Square Method

Anova Method

ETA-Squared Method

Linear Regression and Coefficients

EDA (Viz)