In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/equity-in-healthcare-clean-datasets/train_clean.csv
/kaggle/input/equity-in-healthcare-clean-datasets/test_clean.csv
/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv
/kaggle/input/widsdatathon2024-challenge1/training.csv
/kaggle/input/widsdatathon2024-challenge1/test.csv


# This notebook makes use of the clean and wrangled version of the original dataset. You can find the data sets here:
https://www.kaggle.com/datasets/anopsy/equity-in-healthcare-clean-datasets

# If you like the notebook don't hestitate to give it a thumb up. ALso any suggestions on how to handle tokenization in a better and more informative way or how to perform Stacking Ensemble are welcome! Happy coding!

In [8]:
train = pd.read_csv('/kaggle/input/equity-in-healthcare-clean-datasets/train_clean.csv')
test = pd.read_csv('/kaggle/input/equity-in-healthcare-clean-datasets/test_clean.csv')

In [9]:
for col in train.columns:
    print(col)

index
patient_id
patient_state
patient_zip3
patient_age
breast_cancer_diagnosis_code
breast_cancer_diagnosis_desc
metastatic_cancer_diagnosis_code
Region
Division
population
density
age_median
age_under_10
age_10_to_19
age_20s
age_30s
age_40s
age_50s
age_60s
age_70s
age_over_80
male
female
married
divorced
never_married
widowed
family_size
family_dual_income
income_household_median
income_household_under_5
income_household_5_to_10
income_household_10_to_15
income_household_15_to_20
income_household_20_to_25
income_household_25_to_35
income_household_35_to_50
income_household_50_to_75
income_household_75_to_100
income_household_100_to_150
income_household_150_over
income_household_six_figure
income_individual_median
home_ownership
housing_units
home_value
rent_median
rent_burden
education_less_highschool
education_highschool
education_some_college
education_bachelors
education_graduate
education_college_or_above
education_stem_degree
labor_force_participation
unemployment_rate
self_empl

In [10]:
train.drop(columns = ['female', 'male'], inplace= True)
test.drop(columns = ['female', 'male'], inplace= True)

# Let's combine the dataframes to further work on the features

In [11]:
whole_df = pd.concat([train.drop(columns=['DiagPeriodL90D']), test])

# I'll start with tokenizing the 'breast_cancer_diagnosis_desc' column. I'll will use nltk and TfidVectorizer. That's my first iteration, so I 'll keep it simple, but I'm consdiering using n-grams later on

In [12]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


# I'm replacing some of the shorter words and can proceed with toneizing and changing the tokens into vectors

In [13]:
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
whole_df['processed_description'] = whole_df['breast_cancer_diagnosis_desc'].apply(lambda x: ' '.join(word for word in x.lower().split() if word not in stop_words))


mapping_dict = {
    'malig': 'malignant',
    'unsp': 'unspecified',
    'ovrlp': 'overlapping',
    'neoplm' : 'neoplasm',
    
}


def replace_words(text):
    return ' '.join(mapping_dict.get(word, word) for word in text.split())


whole_df['processed_description'] = whole_df['processed_description'].apply(replace_words)
whole_df['tokenized_description'] = whole_df['processed_description'].apply(nltk.word_tokenize)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
whole_df['tokenized_description']

0       [malignant, neoplasm, unspecified, site, unspe...
1       [malignant, neoplasm, upper-outer, quadrant, r...
2       [malignant, neoplasm, central, portion, left, ...
3       [malignant, neoplasm, upper-inner, quadrant, l...
4       [malignant, neoplasm, breast, (, female, ), ,,...
                              ...                        
5787    [malignant, neoplasm, upper-outer, quadrant, r...
5788    [malignant, neoplasm, unspecified, site, left,...
5789    [malignant, neoplasm, upper-outer, quadrant, r...
5790    [malignant, neoplasm, breast, (, female, ), ,,...
5791    [malignant, neoplasm, central, portion, right,...
Name: tokenized_description, Length: 18698, dtype: object

In [15]:
whole_df['tokenized_description'] = whole_df['tokenized_description'].str.join(' ')


tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(whole_df['tokenized_description'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Now I can look at my dataframe with tokens and combine it with the other features.

In [16]:
tfidf_df.head()

Unnamed: 0,areola,axillary,breast,central,female,inner,left,lower,male,malignant,...,portion,quadrant,right,secondary,site,sites,specified,tail,unspecified,upper
0,0.0,0.0,0.236507,0.0,0.236913,0.0,0.0,0.0,0.0,0.236495,...,0.0,0.0,0.0,0.0,0.46011,0.0,0.0,0.0,0.75125,0.0
1,0.0,0.0,0.196357,0.0,0.196693,0.0,0.0,0.0,0.0,0.196346,...,0.0,0.436085,0.416451,0.0,0.0,0.0,0.0,0.0,0.0,0.483763
2,0.0,0.0,0.150693,0.633317,0.150952,0.0,0.326976,0.0,0.0,0.150685,...,0.633317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.176346,0.0,0.176648,0.622059,0.382636,0.0,0.0,0.176336,...,0.0,0.391643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.434462
4,0.0,0.0,0.391461,0.0,0.392131,0.0,0.0,0.0,0.0,0.39144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.621724,0.0


In [17]:
whole_df.reset_index(inplace=True)

In [18]:
tokenized_df = pd.concat([whole_df, tfidf_df], axis = 1)

# I remove the intermediate columns I created in the process

In [19]:
tokenized_df.drop(columns=['processed_description', 'tokenized_description'], inplace=True)

# It's time to get the dtypes right.

In [20]:
tokenized_df_types = tokenized_df.dtypes.to_dict()

for key in tokenized_df_types:
    if tokenized_df_types[key] =='O':
        tokenized_df[key] = tokenized_df[key].astype('category')

In [21]:
tokenized_df['patient_zip3']=tokenized_df['patient_zip3'].astype('category')
tokenized_df.dtypes

level_0             int64
index               int64
patient_id          int64
patient_state    category
patient_zip3     category
                   ...   
sites             float64
specified         float64
tail              float64
unspecified       float64
upper             float64
Length: 114, dtype: object

In [22]:
num_col = tokenized_df.select_dtypes(exclude=['category']).columns
cat_col = tokenized_df.select_dtypes(include=['category']).columns


In [23]:
tokenized_df.head()

Unnamed: 0,level_0,index,patient_id,patient_state,patient_zip3,patient_age,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,metastatic_cancer_diagnosis_code,Region,...,portion,quadrant,right,secondary,site,sites,specified,tail,unspecified,upper
0,0,0,475714,CA,924,84,C50919,Malignant neoplasm of unsp site of unspecified...,C7989,West,...,0.0,0.0,0.0,0.0,0.46011,0.0,0.0,0.0,0.75125,0.0
1,1,1,349367,CA,928,62,C50411,Malig neoplm of upper-outer quadrant of right ...,C773,West,...,0.0,0.436085,0.416451,0.0,0.0,0.0,0.0,0.0,0.0,0.483763
2,2,2,138632,TX,760,43,C50112,Malignant neoplasm of central portion of left ...,C773,South,...,0.633317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,617843,CA,926,45,C50212,Malig neoplasm of upper-inner quadrant of left...,C773,West,...,0.0,0.391643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.434462
4,4,4,817482,ID,836,55,1749,"Malignant neoplasm of breast (female), unspeci...",C773,West,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.621724,0.0


In [26]:

train_new = tokenized_df[:12906]
test_new = tokenized_df[12906:]

In [27]:
train_new['DiagPeriodL90D'] = train['DiagPeriodL90D']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_new['DiagPeriodL90D'] = train['DiagPeriodL90D']


In [28]:
train_new.to_csv('train_clean.csv', index = False)
test_new.to_csv('test_clean.csv', index = False)

In [None]:
print(cat_col)

In [None]:
for col in tokenized_df.columns:
    print(col)

# I noticed in my previous notebooks, that I get better results when I strip the data frame from some of the demographic data columns. This is the selection of features I want to use in the current notebook.

In [None]:
num_col = ['patient_age',
 'population',
 'density',
 'age_median',
 'income_household_median',
 'income_individual_median',
 'home_ownership',
 'housing_units',
 'home_value',
 'race_white',
 'bmi',
 'young_ind',
 'desolated',
 'home_wealth',
 'air_quality',
 'wealth_index',
 'education_ratio',
 'hh_income_ratio',
 'age_ratio',
 'race_ration', 'areola',
 'axillary', 'breast','central','female','inner','left','lower','male','malignant','neoplasm','nipple',
 'outer','overlapping','portion','quadrant','right','secondary','site','sites',
 'specified','tail','unspecified','upper']

# Now I can scale my data, I use for this purpose the StandardScaler since the data seems not to contain any outliers.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

for col in num_col:

    scaler.fit(tokenized_df[[col]])
    tokenized_df[col] = scaler.transform(tokenized_df[[col]])

In [None]:
tokenized_df[num_col]

# I also create dummies for my categorical columns. I tried before Oridnal Encoder, but it seems that onehotencoding works a tiny bit better.

In [None]:
dummy_df = pd.concat([tokenized_df[num_col], pd.get_dummies(tokenized_df[cat_col], dtype=int)], axis = 1)

# After the initial runs LGBM gave mee some errors due to "special JSON" characters in the columns descriptions. That's why I use regex to remove them from the column names.

In [None]:
import re
dummy_df = dummy_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# I split my dataframes back into two separate train and test df's.

In [None]:
train_df = dummy_df[:12906]
test_df = dummy_df[12906:]

In [None]:
train_df['DiagPeriodL90D']=train['DiagPeriodL90D'].astype('category')
train_df.dtypes

# I will tune hyperparameters of 4 classifiers. And then stack them. You can also find code for voting, but the results were the same. I also added Logistic Regression so the number of clf's is odd.

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
X = train_df.drop(columns=['DiagPeriodL90D' ])
y = le.fit_transform(train_df['DiagPeriodL90D'])

In [None]:
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True, random_state = 2137)

# This is my parameters search space. I had hard time finding out some suitable parameters ranges especially for catBoost and adaBoost. So if you have any suggestions please let  me know.

In [None]:
params_xgb = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8],
        'max_depth': [3, 4, 5],
        'learning_rate' : [0.001, 0.01,  0.05],
        'n_estimators' : [2000]
        }

params_lgb = {
        'bagging_fraction': [0.2, 0.5, 0.8],
        'bagging_freq': [3, 5, 8],
        'objective' :['binary'],
        'metric': ['AUC'],
        'feature_fraction': [0.5, 0.8],
        'max_depth': [8, 10, 13],
        'min_data_in_leaf': [40, 60],
        'num_leaves': [100, 200, 500],
        'sample_pos_weight' : [0.6],
        'num_iterations' : [100, 200, 500]
}

params_cat = {
        'iterations': [100, 200, 400],
        'learning_rate': [0.01, 0.1 , 0.5],
        'depth': [4, 5, 6],    
        'min_data_in_leaf' : [50, 100, 150, 200]
    
}

params_ada = {

        'learning_rate' : [0.001, 0.01, 0.02, 0.05],
        'n_estimators' : [20, 500, 1500]
    
}

In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [None]:
skf = StratifiedKFold(n_splits=3, shuffle = True, random_state = 2137)

In [None]:
tune_xgb = xgb.XGBClassifier(objective='binary:logistic', enable_categorical=True)
rand_xgb = RandomizedSearchCV(tune_xgb, param_distributions=params_xgb, n_iter=60, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=1, random_state=2137)
rand_xgb.fit(X_train, y_train)
rand_xgb_pred = rand_xgb.predict_proba(X_test)

In [None]:
print(roc_auc_score(y_test, rand_xgb_pred[:, 1]))
print(confusion_matrix(y_test, np.round(rand_xgb_pred[:, 1])))

In [None]:
print(rand_xgb.best_params_)

In [None]:
best = rand_xgb.best_estimator_
best_features = best.feature_importances_
for idx, feat in enumerate(X.columns):
    print(feat, best_features[idx])

In [None]:
tune_lgb = lgb.LGBMClassifier()
rand_lgb = RandomizedSearchCV(tune_lgb, param_distributions=params_lgb, n_iter=40, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=1, random_state = 2137 )
rand_lgb.fit(X_train, y_train)
rand_lgb_pred = rand_lgb.predict_proba(X_test)
print(roc_auc_score(y_test, rand_lgb_pred[:, 1]))
print(confusion_matrix(y_test, np.round(rand_lgb_pred[:, 1])))

In [None]:
tune_cat = cb.CatBoostClassifier()
rand_cat = RandomizedSearchCV(tune_cat, param_distributions=params_cat, n_iter=40, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=1, random_state = 2137 )
rand_cat.fit(X_train, y_train)
rand_cat_pred = rand_cat.predict_proba(X_test)
print(roc_auc_score(y_test, rand_cat_pred[:, 1]))
print(confusion_matrix(y_test, np.round(rand_cat_pred[:, 1])))

In [None]:
tune_ada = AdaBoostClassifier()
rand_ada = RandomizedSearchCV(tune_ada, param_distributions=params_ada, n_iter=40, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=1, random_state = 2137 )
rand_ada.fit(X_train, y_train)
rand_ada_pred = rand_cat.predict_proba(X_test)
print(roc_auc_score(y_test, rand_ada_pred[:, 1]))
print(confusion_matrix(y_test, np.round(rand_ada_pred[:, 1])))

# I use best params to create classifiers for my Ensemble

In [None]:
clf_xgb = xgb.XGBClassifier(**rand_xgb.best_params_,enable_categorical =  True, objective = 'binary:logistic')
clf_lgb = lgb.LGBMClassifier(**rand_lgb.best_params_)
clf_cat = cb.CatBoostClassifier(**rand_cat.best_params_)
clf_ada = AdaBoostClassifier(**rand_ada.best_params_)
clf_lr = LogisticRegression(class_weight = 'balanced', random_state = 2137)

# And if you're interested in trying Voting Ensemble here you can find the code for it.

In [None]:
#clf_voting = VotingClassifier(
#    estimators=[
#        ('xgb', clf_xgb),
#        ('lgb', clf_lgb),
#        ('lr', clf_lr),
#        ('cat', clf_cat),
#        ('ada', clf_ada)],
#    voting = 'soft',
#    verbose=False
#   
#)

In [None]:
#clf_voting.fit(X_train, y_train)
#y_pred = clf_voting.predict_proba(X_test)

In [None]:
#print(roc_auc_score(y_test, y_pred[:, 1]))

In [None]:
#test_df.columns

In [None]:
#sub_proba = clf_voting.predict_proba(test_df)



In [None]:
#sub = test[['patient_id']]
#sub['DiagPeriodL90D'] = sub_proba[:, 1]

#sub.head()


In [None]:
#sub.to_csv('submission.csv', index=False)

In [None]:
classifiers=[('xgb', clf_xgb),
        ('lgb', clf_lgb),
        ('lr', clf_lr),
        ('cat', clf_cat),
        ('ada', clf_ada)]

In [None]:
clf_meta = xgb.XGBClassifier(**rand_xgb.best_params_, enable_categorical =  True, objective = 'binary:logistic', random_state=2137 )

# I tried both passthrough=False and passthrough=True and the latter one seems to work a bit better, but yeah it takes some time.

In [None]:
clf_stack= StackingClassifier(
    estimators = classifiers,
    final_estimator = clf_meta,
    cv =3, 
    stack_method = 'predict_proba',
    passthrough = True, 
    verbose = 3) 

In [None]:
clf_stack.fit(X_train, y_train)

# The scores on the X_test set don't differ that much from the scores of single classifiers.

In [None]:
stack_pred = clf_stack.predict_proba(X_test)
print(roc_auc_score(y_test, stack_pred[:, 1]))

In [None]:
print(confusion_matrix(y_test, np.round(stack_pred[:, 1])))

In [None]:
test_df.columns

In [None]:
stack_proba = clf_stack.predict_proba(test_df)


In [None]:
sub_stack = test[['patient_id']]
sub_stack['DiagPeriodL90D'] = stack_proba[:, 1]
sub_stack.head()

In [None]:
sub_stack.to_csv('submission.csv', index = False)

# As we can see also in confusion matrix there is still a group of missclassified patients. It's usually around 400. In my next notebook I'll try to perform residual analysis, maybe it will help to get that number down.