In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# pandas options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

| Variable Name                 | Description                                                               |
|-------------------------------|---------------------------------------------------------------------------|
| avganncount                   | Average number of cancer cases diagnosed annually.                       |
| avgdeathsperyear              | Average number of deaths due to cancer per year.                         |
| target_deathrate              | Cancer death rate; the target variable this notebook predicts.           |
| incidencerate                 | Incidence rate of cancer.                                                |
| medincome                     | Median income in the region.                                             |
| popest2015                    | Estimated population in 2015.                                            |
| povertypercent                | Percentage of population below the poverty line.                         |
| studypercap                   | Per capita number of cancer-related clinical trials conducted.           |
| binnedinc                     | Binned median income.                                                    |
| medianage                     | Median age in the region.                                                |
| pctprivatecoveragealone       | Percentage of population covered by private health insurance alone.      |
| pctempprivcoverage            | Percentage of population covered by employee-provided private health insurance. |
| pctpubliccoverage             | Percentage of population covered by public health insurance.             |
| pctpubliccoveragealone        | Percentage of population covered by public health insurance only.        |
| pctwhite                      | Percentage of White population.                                          |
| pctblack                      | Percentage of Black population.                                          |
| pctasian                      | Percentage of Asian population.                                          |
| pctotherrace                  | Percentage of population belonging to other races.                       |
| pctmarriedhouseholds          | Percentage of married households.                                        |
| birthrate                     | Birth rate in the region.                                                |


In [5]:
# Read the two source tables; `regional_stats` supplies the household-size data
# that is attached to the main table below.
regional_stats = pd.read_csv('data/cancer_avg-household-size.csv')
cancer_stats = (
    pd.read_csv('data/cancer_reg_stats.csv')
    # inner join: only geographies present in both files survive
    .merge(regional_stats, on='geography', how='inner')
    # these two columns have many missing values, so they are removed entirely
    .drop(columns=['pctsomecol18_24', 'pctprivatecoveragealone'])
    # pctemployed16_over is kept, but rows where it is missing are discarded
    .dropna(subset=['pctemployed16_over'])
)

In [6]:
# Lookup from numeric state FIPS code (int key) to two-letter postal abbreviation.
# Built from (code, abbreviation, full name) records so the full state name stays
# next to each entry; only the code and abbreviation end up in the dict.
# 51 entries, listed in ascending code order.
statefips_dict = {
    fips_code: postal_abbr
    for fips_code, postal_abbr, _state_name in (
        (1, "AL", "Alabama"),
        (2, "AK", "Alaska"),
        (4, "AZ", "Arizona"),
        (5, "AR", "Arkansas"),
        (6, "CA", "California"),
        (8, "CO", "Colorado"),
        (9, "CT", "Connecticut"),
        (10, "DE", "Delaware"),
        (11, "DC", "District of Columbia"),
        (12, "FL", "Florida"),
        (13, "GA", "Georgia"),
        (15, "HI", "Hawaii"),
        (16, "ID", "Idaho"),
        (17, "IL", "Illinois"),
        (18, "IN", "Indiana"),
        (19, "IA", "Iowa"),
        (20, "KS", "Kansas"),
        (21, "KY", "Kentucky"),
        (22, "LA", "Louisiana"),
        (23, "ME", "Maine"),
        (24, "MD", "Maryland"),
        (25, "MA", "Massachusetts"),
        (26, "MI", "Michigan"),
        (27, "MN", "Minnesota"),
        (28, "MS", "Mississippi"),
        (29, "MO", "Missouri"),
        (30, "MT", "Montana"),
        (31, "NE", "Nebraska"),
        (32, "NV", "Nevada"),
        (33, "NH", "New Hampshire"),
        (34, "NJ", "New Jersey"),
        (35, "NM", "New Mexico"),
        (36, "NY", "New York"),
        (37, "NC", "North Carolina"),
        (38, "ND", "North Dakota"),
        (39, "OH", "Ohio"),
        (40, "OK", "Oklahoma"),
        (41, "OR", "Oregon"),
        (42, "PA", "Pennsylvania"),
        (44, "RI", "Rhode Island"),
        (45, "SC", "South Carolina"),
        (46, "SD", "South Dakota"),
        (47, "TN", "Tennessee"),
        (48, "TX", "Texas"),
        (49, "UT", "Utah"),
        (50, "VT", "Vermont"),
        (51, "VA", "Virginia"),
        (53, "WA", "Washington"),
        (54, "WV", "West Virginia"),
        (55, "WI", "Wisconsin"),
        (56, "WY", "Wyoming"),
    )
}

In [None]:
# Summary statistics for every numeric column of the merged, cleaned table.
# NOTE(review): the recorded output shows a `medianage` maximum of 624.0 (mean ~45, std ~44.5),
# which is not a plausible age - likely bad source values worth cleaning before modelling.
cancer_stats.describe()

Unnamed: 0,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,medianage,medianagemale,medianagefemale,percentmarried,pctnohs18_24,pcths18_24,pctbachdeg18_24,pcths25_over,pctbachdeg25_over,pctemployed16_over,pctunemployed16_over,pctprivatecoverage,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate,statefips,countyfips,avghouseholdsize
count,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0,2895.0
mean,611.23772,187.461485,178.638964,447.96813,47001.894301,103678.6,16.91544,157.692163,45.104387,39.598135,42.183282,51.77019,18.23171,35.032919,6.173713,34.8162,13.243592,54.152642,7.85772,64.305216,41.155302,36.313955,19.273506,83.610678,9.133871,1.260685,2.005906,51.226933,5.636896,30.26563,101.901554,2.528235
std,1442.306706,513.45378,27.464991,52.737861,12047.178175,336204.1,6.425357,541.188736,44.495505,5.231846,5.276953,6.878744,8.101923,9.09973,4.557746,7.017115,5.359881,8.315064,3.451343,10.645841,9.439102,7.837938,6.116058,16.352919,14.555143,2.64829,3.568495,6.522579,2.00147,15.072375,104.545806,0.245812
min,6.0,3.0,59.7,201.3,22640.0,827.0,3.2,0.0,22.3,22.4,22.3,23.1,0.0,0.0,0.0,7.5,2.5,17.6,0.4,22.3,13.5,11.2,2.6,10.199155,0.0,0.0,0.0,22.99249,0.0,1.0,1.0,1.86
25%,75.5,28.0,161.3,420.3,38751.0,11652.5,12.15,0.0,37.7,36.3,39.1,47.75,12.8,29.2,3.1,30.5,9.4,48.6,5.5,57.15,34.4,30.8,14.9,77.211147,0.633417,0.249003,0.296176,47.790762,4.514261,18.0,35.0,2.38
50%,171.0,61.0,178.1,453.549422,45132.0,26521.0,15.9,0.0,41.0,39.6,42.4,52.4,17.2,34.7,5.3,35.3,12.3,54.5,7.6,65.0,41.0,36.4,18.8,90.02888,2.313188,0.545589,0.830184,51.67364,5.382482,29.0,79.0,2.5
75%,515.0,148.0,195.2,480.5,52488.0,68140.5,20.5,81.759464,44.0,42.6,45.4,56.4,22.7,40.7,8.2,39.6,16.1,60.3,9.7,72.0,47.6,41.7,23.1,95.364834,10.658813,1.221638,2.204819,55.335684,6.493677,45.0,133.0,2.64
max,38150.0,14010.0,293.9,1014.2,125635.0,10170290.0,47.4,9762.308998,624.0,64.7,65.7,72.5,62.7,72.5,51.8,54.8,42.2,80.1,29.4,92.3,70.7,65.1,46.6,100.0,85.947799,42.619425,41.930251,71.703057,21.326165,56.0,840.0,3.97


In [7]:
# Numeric-only view of the data: 'geography' and 'binnedinc' are the non-numeric columns.
cancer_stats_reduced = cancer_stats.drop(labels=['geography', 'binnedinc'], axis='columns')
# Disabled experiment: combine statefips and countyfips into one per-county identifier.
# NOTE(review): plain string concatenation is not unique without zero-padding
# (state 1 + county 11 and state 11 + county 1 both give "111") - pad before re-enabling.
# cancer_stats_reduced['regionfips'] = cancer_stats_reduced['statefips'].astype(str) + cancer_stats_reduced['countyfips'].astype(str)
# cancer_stats_reduced['regionfips'] = cancer_stats_reduced['regionfips'].astype(int)
# cancer_stats_reduced.drop(columns=['statefips', 'countyfips'], inplace=True)
cancer_stats_reduced

Unnamed: 0,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,medianage,medianagemale,medianagefemale,percentmarried,pctnohs18_24,pcths18_24,pctbachdeg18_24,pcths25_over,pctbachdeg25_over,pctemployed16_over,pctunemployed16_over,pctprivatecoverage,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate,statefips,countyfips,avghouseholdsize
0,1397.000000,469,164.9,489.800000,61898,260131,11.2,499.748204,39.3,36.9,41.7,52.5,11.5,39.5,6.9,23.2,19.6,51.9,8.0,75.1,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831,53,35,2.54
1,173.000000,70,161.3,411.600000,48127,43269,18.6,23.111234,33.0,32.2,33.7,44.5,6.1,22.4,7.5,26.0,22.7,55.9,7.8,70.2,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.372500,4.333096,53,37,2.34
2,102.000000,50,174.7,349.700000,49348,21026,14.6,47.560164,45.0,44.0,45.8,54.2,24.0,36.6,9.5,29.0,16.0,45.9,7.0,63.7,34.9,42.1,21.1,90.922190,0.739673,0.465898,2.747358,54.444868,3.729488,53,39,2.62
3,427.000000,202,194.8,430.400000,44243,75882,17.1,342.637253,42.8,42.2,43.4,52.7,20.2,41.2,2.5,31.6,9.3,48.3,12.1,58.4,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841,53,41,2.52
4,57.000000,26,144.4,350.100000,49955,10321,12.5,0.000000,48.3,47.8,48.9,57.8,14.9,43.0,2.0,33.4,15.0,48.2,4.8,61.6,35.1,44.0,22.7,94.104024,0.270192,0.665830,0.492135,54.027460,6.796657,53,43,2.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3041,1962.667684,48,146.4,453.549422,49508,29029,13.0,723.414516,32.2,31.0,33.8,45.1,2.9,17.7,9.8,27.7,21.9,68.6,4.5,80.0,52.9,24.3,9.5,94.291726,1.145104,1.386542,1.014038,43.276779,3.977583,20,51,2.35
3042,1962.667684,15,149.6,453.549422,46961,6343,12.4,0.000000,44.2,41.1,48.8,51.0,20.9,27.9,8.4,32.2,15.2,51.7,4.3,78.3,44.6,31.7,13.2,90.280811,3.837754,0.327613,1.700468,51.063830,7.773512,20,53,2.08
3043,1962.667684,43,150.1,453.549422,48609,37118,18.8,377.175494,30.4,29.3,31.4,52.6,26.7,33.9,3.8,23.1,12.4,70.1,4.6,64.5,48.6,28.8,17.7,75.706245,2.326771,4.044920,14.130288,52.007937,8.186470,20,55,2.90
3044,1962.667684,46,153.9,453.549422,51144,34536,15.0,1968.959926,30.9,30.5,31.2,54.8,19.7,44.5,2.5,23.0,12.8,64.8,6.4,62.0,47.8,26.6,16.8,87.961629,2.313188,1.316472,5.680705,55.153949,7.809192,20,57,3.04


In [None]:
# Pairwise Pearson correlations between all columns of the numeric-only frame.
corr = cancer_stats_reduced.corr(method='pearson')
# Optional heatmap of the full matrix (disabled):
# plt.figure(figsize=(20, 20))
# sns.heatmap(corr, annot=True, fmt=".2f")
# plt.show()

In [None]:
# 'target_deathrate' is the variable we want to predict.
# Take its column of the correlation matrix and rank the other variables
# from most positively to most negatively correlated with it.
corr_target = (
    corr['target_deathrate']
    .sort_values(ascending=False)
)
corr_target

target_deathrate          1.000000
pctpubliccoveragealone    0.448087
incidencerate             0.428787
povertypercent            0.425083
pcths25_over              0.403842
pctpubliccoverage         0.402141
pctunemployed16_over      0.373750
pcths18_24                0.261246
pctblack                  0.252774
pctnohs18_24              0.080009
countyfips                0.078749
medianagefemale           0.010088
medianage                -0.001208
studypercap              -0.020402
medianagemale            -0.025385
statefips                -0.033173
avghouseholdsize         -0.038687
birthrate                -0.088058
avgdeathsperyear         -0.091365
popest2015               -0.119804
avganncount              -0.144066
pctwhite                 -0.165104
pctasian                 -0.186464
pctotherrace             -0.189770
percentmarried           -0.257639
pctempprivcoverage       -0.264430
pctbachdeg18_24          -0.285941
pctmarriedhouseholds     -0.286233
pctprivatecoverage  

In [8]:
# Build the feature matrix X and target vector y, hold out a test set,
# standardise the features, and fit a baseline linear regression.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The target itself plus three other cancer count/rate columns are excluded from X.
# NOTE(review): presumably these are dropped because they are too closely tied to the
# target - confirm the intent.
# NOTE(review): X still contains `statefips` and `countyfips`, which are identifier codes
# rather than quantities; consider dropping or encoding them.
X = cancer_stats_reduced.drop(columns=['target_deathrate', 'incidencerate', 'avganncount', 'avgdeathsperyear'])
y = cancer_stats_reduced['target_deathrate']

# 80/20 train/test split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

col_names = X.columns

# The scaler is fitted on the training rows only and then applied to the test rows,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames. The original row index is carried over so
# the scaled features stay label-aligned with y_train / y_test; without it the frames get
# a fresh 0..n-1 index and any pandas-level join with the targets would silently misalign.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=col_names, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=col_names, index=X_test.index)

# linear regression baseline, evaluated on the held-out test set
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2: {r2}')

Mean Squared Error: 433.6866027447603
R^2: 0.4360547976740947


In [15]:
# xgboost: gradient-boosted trees with default hyperparameters,
# trained and evaluated on the same scaled split as the other models.
from xgboost import XGBRegressor

# Seed fixed to 42, consistent with train_test_split and the random forest, so the
# result stays reproducible even if stochastic options (e.g. subsampling) are enabled later.
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train_scaled, y_train)
y_pred = xgb.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2: {r2}')

Mean Squared Error: 434.75545051067013
R^2: 0.434664919439935


In [3]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt

# import statsmodels.api as sm
# from statsmodels.stats.outliers_influence \
#      import variance_inflation_factor as VIF
# from statsmodels.stats.anova import anova_lm
# from ISLP.models import (ModelSpec as MS,
#                          summarize,
#                          poly)

# X_X = MS(X.columns).fit_transform(X)
# y_y = y
# model = sm.OLS(y_y, X_X)
# results = model.fit()
# summarize(results)

ModuleNotFoundError: No module named 'ISLP'

In [None]:
# # create a list of all predictors with p < 0.05
# significant_predictors = results.pvalues[results.pvalues < 0.05].index.tolist()
# significant_predictors = significant_predictors[1:] # remove intercept
# significant_predictors

['medincome',
 'medianagemale',
 'medianagefemale',
 'percentmarried',
 'pctnohs18_24',
 'pcths18_24',
 'pcths25_over',
 'pctbachdeg25_over',
 'pctemployed16_over',
 'pctunemployed16_over',
 'pctempprivcoverage',
 'pctpubliccoveragealone',
 'pctotherrace',
 'pctmarriedhouseholds',
 'birthrate',
 'countyfips',
 'avghouseholdsize']

In [None]:
# # keep only significant predictors
# X = X[significant_predictors]

# # split data into training and test sets
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# col_names = X.columns

# # scale data
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# X_train_scaled = pd.DataFrame(X_train_scaled, columns=col_names)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=col_names)

In [9]:
# random forest: 100 trees, seeded for reproducibility
from sklearn.ensemble import RandomForestRegressor

# fit() returns the estimator itself, so construction and training are chained
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# score on the held-out test set
y_pred = rf.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2: {r2}')

Mean Squared Error: 407.1180223108808
R^2: 0.47060330199373224


In [10]:
# Table of random-forest feature importances, one row per feature in X,
# ordered from most to least important.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importance

Unnamed: 0,feature,importance
12,pctbachdeg25_over,0.208708
18,pctpubliccoveragealone,0.068313
11,pcths25_over,0.065582
22,pctotherrace,0.062846
20,pctblack,0.060488
1,popest2015,0.035587
23,pctmarriedhouseholds,0.03541
0,medincome,0.032879
2,povertypercent,0.032756
27,avghouseholdsize,0.03057


In [None]:
# import pandas as pd
# import json
# import folium

# # Load the us-states.json file
# with open('data/us-states.json') as f:
#     us_states = json.load(f)

# # Map the statefips codes to the corresponding states in the cancer_stats DataFrame
# cancer_stats['state'] = cancer_stats['statefips'].map(statefips_dict)

# # Create a choropleth map
# m = folium.Map(location=[48, -102], zoom_start=3,
#                zoom_control=False,
#                scrollWheelZoom=False,
#                dragging=False)
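# # List of metrics (NOTE: 'pctprivatecoveragealone' is dropped from cancer_stats in the data-loading cell, so remove it from this list before re-enabling this cell)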
# metrics = ['avganncount', 'avgdeathsperyear', 'target_deathrate', 'incidencerate', 'medincome', 'popest2015', 'povertypercent', 'studypercap', 'medianage', 'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage', 'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian', 'pctotherrace', 'pctmarriedhouseholds', 'birthrate']

# # For each metric, create a Choropleth layer and add it to the Map object
# for i, metric in enumerate(metrics):
#     choropleth = folium.Choropleth(
#         geo_data=us_states,
#         name=metric,
#         data=cancer_stats,
#         columns=['state', metric],
#         key_on='feature.id',
#         fill_color='YlGn',
#         fill_opacity=0.7,
#         line_opacity=0.2,
#         legend_name=metric,
#         show=(i==0)  # Only show the first layer
#     ).add_to(m)

# # Add a LayerControl object to the Map object
# folium.LayerControl().add_to(m)

# # limit the map to the US
# m.fit_bounds([[49, -67], [24, -125]])

# m

In [12]:
# auto-sklearn: time-boxed automated search over regression pipelines
import autosklearn.regression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

# Overall budget of 120 with at most 30 per candidate run (seconds, per the auto-sklearn API);
# intermediate files go to the given tmp folder.
automl = autosklearn.regression.AutoSklearnRegressor(
    tmp_folder='output/autosklearn_regression_example_tmp',
    per_run_time_limit=30,
    time_left_for_this_task=120,
)

# run the search on the scaled training data
automl.fit(X_train_scaled, y_train)

# score the resulting ensemble on the held-out test set
y_pred = automl.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2: {r2}')

# describe the models that make up the final ensemble
print(automl.show_models())

Mean Squared Error: 384.3969480086714
R^2: 0.5001486943653812
{3: {'model_id': 3, 'rank': 1, 'cost': 0.6827396964421949, 'ensemble_weight': 0.2, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x76e7bee85130>, 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x76e7bef23ac0>, 'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x76e7bef232e0>, 'sklearn_regressor': SVR(C=194.03096694114694, cache_size=1897.2161458333333,
    epsilon=0.0010214279074797082, gamma=0.20113065159176252,
    tol=0.0206281932709369, verbose=0)}, 4: {'model_id': 4, 'rank': 2, 'cost': 0.6788554380793582, 'ensemble_weight': 0.04, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x76e7bee82ee0>, 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object a