In [3]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [5]:
# Load the Medicaid vision-claims table and take a first look at it
data = pd.read_csv('medicaid-vision-claims.csv')
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line)
data.info()
print(data.describe())


   YearStart  YearEnd LocationAbbr  LocationDesc DataSource  \
0       2019     2019           ND  North Dakota   Medicaid   
1       2019     2019           ND  North Dakota   Medicaid   
2       2019     2019           NE      Nebraska   Medicaid   
3       2019     2019           WI     Wisconsin   Medicaid   
4       2019     2019           WY       Wyoming   Medicaid   

                   Topic          Category  \
0  Eye Health Conditions  Cornea Disorders   
1  Eye Health Conditions  Cornea Disorders   
2  Eye Health Conditions  Cornea Disorders   
3  Eye Health Conditions  Cornea Disorders   
4  Eye Health Conditions  Cornea Disorders   

                                          Question     Response  \
0  Annual prevalence of diagnosed cornea disorders  Keratoconus   
1  Annual prevalence of diagnosed cornea disorders  Keratoconus   
2  Annual prevalence of diagnosed cornea disorders  Keratoconus   
3  Annual prevalence of diagnosed cornea disorders  Keratoconus   
4  Annual

In [8]:
# Column the model is trained to predict
target = 'Data_Value'

# Columns kept out of the feature set: the target itself plus columns that
# are not useful or could leak information about it
exclude_cols = [
    'Data_Value',
    'Data_Value_Footnote_Symbol',
    'Data_Value_Footnote',
    'Low_Confidence_limit',
    'High_Confidence_Limit',
    'Numerator',
    'Sample_Size',
    'Geolocation',
]

# Every remaining column, in its original order, becomes a feature
features = data.columns[~data.columns.isin(exclude_cols)].tolist()


In [11]:
# Report how many values are missing in each column
print(data.isnull().sum())

# Rows without a target value cannot be used for training or evaluation
data = data.dropna(subset=[target])

# Fill gaps only in non-numeric columns with an explicit 'Unknown' label.
# Filling every column with a string would turn any numeric column that still
# contains NaN into a mixed-type object column; numeric gaps are therefore left
# as NaN.
non_numeric_cols = data.select_dtypes(exclude='number').columns
data = data.fillna({col: 'Unknown' for col in non_numeric_cols})


YearStart                          0
YearEnd                            0
LocationAbbr                       0
LocationDesc                       0
DataSource                         0
Topic                              0
Category                           0
Question                           0
Response                           0
Age                                0
Gender                             0
RaceEthnicity                      0
RiskFactor                         0
RiskFactorResponse                 0
Data_Value_Unit                    0
Data_Value_Type                    0
Data_Value                    172238
Data_Value_Footnote_Symbol    527762
Data_Value_Footnote           527762
Low_Confidence_limit          172238
High_Confidence_Limit         172238
Numerator                     435273
Sample_Size                   172238
LocationID                         0
TopicID                            0
CategoryID                         0
QuestionID                         0
R

In [13]:
# Names of the columns that are passed to CatBoost as categorical features
categorical_features = [
    'LocationAbbr',
    'LocationDesc',
    'DataSource',
    'Topic',
    'Category',
    'Question',
    'Response',
    'Age',
    'Gender',
    'RaceEthnicity',
    'RiskFactor',
    'RiskFactorResponse',
    'Data_Value_Unit',
    'Data_Value_Type',
    'LocationID',
    'TopicID',
    'CategoryID',
    'QuestionID',
    'ResponseID',
    'DataValueTypeID',
    'AgeID',
    'GenderID',
    'RaceEthnicityID',
    'RiskFactorID',
    'RiskFactorResponseID',
    'Geographic Level',
]

# Cast all categorical columns to string in one assignment
data[categorical_features] = data[categorical_features].astype(str)


In [15]:
# Same seed for both splits so the partition is reproducible
split_seed = 42

# Stage 1: hold out 20% of all rows as the final test set
train_valid_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=split_seed,
)

# Stage 2: take 25% of the remaining rows for validation
# (0.25 * 0.8 = 0.2, i.e. roughly a 60/20/20 train/valid/test split)
train_data, valid_data = train_test_split(
    train_valid_data,
    test_size=0.25,
    random_state=split_seed,
)


In [17]:
def _build_pool(frame):
    """Wrap one data split in a CatBoost Pool.

    Uses the shared `features`, `target` and `categorical_features`
    definitions so all three pools are constructed the same way.
    """
    return Pool(
        data=frame[features],
        label=frame[target],
        cat_features=categorical_features,
    )


# One Pool per split: training, validation and test
train_pool = _build_pool(train_data)
valid_pool = _build_pool(valid_data)
test_pool = _build_pool(test_data)


In [28]:
# Hyperparameters collected in one place so they are easy to read and tweak
model_params = dict(
    iterations=2000,
    learning_rate=0.1,
    depth=6,
    eval_metric='RMSE',
    random_seed=42,
    early_stopping_rounds=50,
    verbose=100,  # progress is logged every 100 iterations
)
model = CatBoostRegressor(**model_params)

# Fit on the training pool while monitoring the validation pool
model.fit(train_pool, eval_set=valid_pool)


0:	learn: 1.1591795	test: 1.1726463	best: 1.1726463 (0)	total: 134ms	remaining: 4m 27s
100:	learn: 0.5972258	test: 0.5962188	best: 0.5962188 (100)	total: 9.56s	remaining: 2m 59s
200:	learn: 0.5527863	test: 0.5500570	best: 0.5500570 (200)	total: 20.4s	remaining: 3m 2s
300:	learn: 0.5287587	test: 0.5266849	best: 0.5266849 (300)	total: 31.7s	remaining: 2m 58s
400:	learn: 0.5122409	test: 0.5121201	best: 0.5121201 (400)	total: 43.9s	remaining: 2m 55s
500:	learn: 0.5001092	test: 0.5018438	best: 0.5018438 (500)	total: 56.2s	remaining: 2m 48s
600:	learn: 0.4913421	test: 0.4944024	best: 0.4944024 (600)	total: 1m 8s	remaining: 2m 38s
700:	learn: 0.4829785	test: 0.4884421	best: 0.4884421 (700)	total: 1m 20s	remaining: 2m 28s
800:	learn: 0.4754038	test: 0.4818323	best: 0.4818323 (800)	total: 1m 32s	remaining: 2m 18s
900:	learn: 0.4678089	test: 0.4749700	best: 0.4749700 (900)	total: 1m 44s	remaining: 2m 7s
1000:	learn: 0.4614344	test: 0.4694898	best: 0.4694363 (999)	total: 1m 56s	remaining: 1m 56s


<catboost.core.CatBoostRegressor at 0x3168b1490>

In [30]:
# Score the held-out test pool with the fitted model
predictions = model.predict(test_pool)

# RMSE is the square root of the mean squared error against the true targets
test_mse = mean_squared_error(test_data[target], predictions)
rmse = np.sqrt(test_mse)
print(f'Test RMSE: {rmse}')


Test RMSE: 0.4262603731619408


In [26]:
# Importance scores as computed by the model for the training pool; they are
# paired below with the names of the feature columns the pool was built from
feature_importances = model.get_feature_importance(train_pool)
feature_names = features

# Rank (score, name) pairs from most to least important; equal scores fall
# back to reverse-alphabetical order of the name
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)

# Print one "name: score" line per feature
for score, name in ranked_features:
    print(f'{name}: {score}')


Response: 29.31030866726171
Age: 18.48199419960317
AgeID: 14.900697441779435
ResponseID: 6.08740475803995
QuestionID: 4.989450068400426
LocationAbbr: 4.940145158191024
LocationDesc: 4.15197878662
RaceEthnicity: 2.7978336723114943
LocationID: 2.7083412885295823
Category: 2.661444443120409
CategoryID: 2.376477267120653
RaceEthnicityID: 2.130532253705109
Question: 1.978276720765153
Gender: 1.2652755820502697
GenderID: 1.219839692501692
YearStart: 0.0
YearEnd: 0.0
TopicID: 0.0
Topic: 0.0
RiskFactorResponseID: 0.0
RiskFactorResponse: 0.0
RiskFactorID: 0.0
RiskFactor: 0.0
Geographic Level: 0.0
Data_Value_Unit: 0.0
Data_Value_Type: 0.0
DataValueTypeID: 0.0
DataSource: 0.0
