In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition test and training CSVs from the Kaggle input directory.
# `train` carries the 'DiagPeriodL90D' column that later cells use as the prediction target.
test = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/test.csv')
train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')

# In this notebook I'll implement NLP on the description column of the dataset.
First, I drop the columns that contain too many NaNs.

In [None]:
# One shared list, so exactly the same columns are removed from both frames.
unwanted_columns = [
    'metastatic_first_novel_treatment',
    'metastatic_first_novel_treatment_type',
    'patient_race',
    'payer_type',
    'bmi',
    'female',
    'male',
]
for frame in (train, test):
    # In-place drop: `train` and `test` themselves are modified, as later cells expect.
    frame.drop(columns=unwanted_columns, inplace=True)

# I concatenate train and test dataset so I can perform tokenization on all of the data

In [None]:
# Stack the training rows (target column removed) on top of the test rows so the text
# preprocessing below runs once over all of the data. The original row labels are kept,
# so index values repeat between the two parts until the index is reset in a later cell.
whole_df = pd.concat([train.drop(columns=['DiagPeriodL90D']), test])

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


# I lowercase the description and split it into words, remove stopwords, and expand some of the shortened words

In [None]:

nltk.download('stopwords')

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def _strip_stop_words(description):
    """Lower-case `description` and drop every whitespace-separated word found in `stop_words`."""
    kept_words = [word for word in description.lower().split() if word not in stop_words]
    return ' '.join(kept_words)


# Lower-cased, stop-word-free version of the diagnosis description
whole_df['processed_description'] = whole_df['breast_cancer_diagnosis_desc'].apply(_strip_stop_words)

# Shortened word -> full spelling used when normalising the descriptions
mapping_dict = {
    'malig': 'malignant',
    'unsp': 'unspecified',
    'ovrlp': 'overlapping',
    'neoplm': 'neoplasm',
}


def replace_words(text):
    """Return `text` with each word that is a key of `mapping_dict` replaced by its value.

    The text is split on whitespace and re-joined with single spaces, so runs of
    whitespace collapse. Words without a mapping are kept unchanged.
    """
    expanded = []
    for token in text.split():
        if token in mapping_dict:
            expanded.append(mapping_dict[token])
        else:
            expanded.append(token)
    return ' '.join(expanded)

# Expand the shortened words in every processed description
whole_df['processed_description'] = whole_df['processed_description'].apply(replace_words)

# Split each processed description into a list of tokens with NLTK.
# NOTE(review): only the 'stopwords' corpus is downloaded in this notebook; word_tokenize
# presumably needs NLTK tokenizer data to already be available in the environment — confirm.
whole_df['tokenized_description'] = whole_df['processed_description'].apply(nltk.word_tokenize)

In [None]:
# Inspect the token lists produced for each description
whole_df['tokenized_description']

In [None]:

# Join the tokens back into a single space-separated string per row
# (this overwrites the token lists stored in the same column)
whole_df['tokenized_description'] = whole_df['tokenized_description'].str.join(' ')

# Vectorizer with default settings
tfidf = TfidfVectorizer()

# Fit on, and transform, the descriptions of all rows in whole_df (train and test together)
tfidf_matrix = tfidf.fit_transform(whole_df['tokenized_description'])

# Convert the matrix into a dense DataFrame with one column per vocabulary term.
# No index is passed, so tfidf_df gets a default 0..n-1 index.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Now tfidf_df is a DataFrame where each word is a separate column and the value is its TF-IDF score

In [None]:
# Preview the first rows of the TF-IDF feature table
tfidf_df.head()

In [None]:
# Replace the repeated row labels left by the train/test concatenation with a fresh
# 0..n-1 index so whole_df lines up row-by-row with tfidf_df. The old labels are kept
# as a new 'index' column, which later cells drop before modelling.
# NOTE: in-place and not idempotent — re-running this cell adds another index column.
whole_df.reset_index(inplace=True)

# I combine the tokenized dataframe with my original dataframe

In [None]:
# Column-wise concat: rows are matched on index label, so this relies on whole_df
# having been reset to the same default 0..n-1 index that tfidf_df carries.
tokenized_df = pd.concat([whole_df, tfidf_df], axis = 1)

In [None]:
# Snapshot of column name -> dtype, taken before any column is converted
tokenized_df_types = tokenized_df.dtypes.to_dict()

# Cast every object-dtype column to pandas' 'category' dtype
object_columns = [name for name, dtype in tokenized_df_types.items() if dtype == 'O']
for name in object_columns:
    tokenized_df[name] = tokenized_df[name].astype('category')

In [None]:
# patient_zip3 is treated as categorical too, so it is cast explicitly
# (presumably it is read as a number and therefore missed by the object-dtype pass — confirm).
tokenized_df['patient_zip3']=tokenized_df['patient_zip3'].astype('category')
# Display the resulting dtype of every column
tokenized_df.dtypes

In [None]:
# Partition the column names by dtype: every column that is not 'category' goes to
# num_col; the 'category' columns go to cat_col (the ones that get ordinal-encoded).
num_col = tokenized_df.select_dtypes(exclude=['category']).columns
cat_col = tokenized_df.select_dtypes(include=['category']).columns


In [None]:
# Names of the categorical columns
print(cat_col)

In [None]:
# Names of all remaining (non-categorical) columns
print(num_col)

# In my previous notebooks I used one-hot encoding for categorical data. This time I want to try the OrdinalEncoder, since I saw somebody use it. I was curious about the results, since the categorical data are not of an ordinal nature.

In [None]:
from sklearn.preprocessing import OrdinalEncoder


# One shared encoder object; categories unseen at transform time would be encoded as -1
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Encode the categorical columns one at a time. The encoder is re-fitted on each column
# of the combined train+test frame, and that column is overwritten with its codes.
for cat_name in cat_col:
    tokenized_df[cat_name] = encoder.fit_transform(tokenized_df[[cat_name]])

# I split the dataframe back into train and test sets

In [None]:
# The combined frame was built as [train rows, test rows], so the first len(train) rows
# are the training part. Deriving the boundary from `train` avoids a hard-coded row count
# that would silently mis-split if the input data changes.
n_train_rows = len(train)

# .copy() makes both parts independent frames, so adding the target column to train_df
# afterwards does not write into a slice of tokenized_df (SettingWithCopyWarning).
train_df = tokenized_df.iloc[:n_train_rows].copy()
test_df = tokenized_df.iloc[n_train_rows:].copy()

In [None]:
# Attach the target to the training part. `assign` returns a new frame instead of writing
# into train_df in place; train_df is a slice of tokenized_df, and writing into a slice
# triggers pandas' SettingWithCopyWarning. Values are still aligned on index label.
train_df = train_df.assign(DiagPeriodL90D=train['DiagPeriodL90D'].astype('category'))
train_df.dtypes

In [None]:
# List the columns of the training frame (features plus the target)
train_df.columns

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
# Feature matrix: drop 'index' (the row labels saved by reset_index), 'patient_id',
# the target itself, and 'patient_gender'.
X = train_df.drop(columns=['index', 'patient_id','DiagPeriodL90D', 'patient_gender' ])
# Target encoded as integer class labels
y = le.fit_transform(train_df['DiagPeriodL90D'])

In [None]:
# Stratified 80/20 split of the labelled data. Note that X_test / y_test are a hold-out
# part of the training data, not the competition test set (that one is test_df).
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True, random_state = 2137)

# I set parameters for the estimators in my ensemble

In [None]:
# Candidate values sampled by the randomised search for XGBoost
params_xgb = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8],
        'max_depth': [3, 4, 5],
        'learning_rate' : [0.001, 0.01,  0.05],
        'n_estimators' : [500, 1000, 2000],
        'scale_pos_weight' : [0.6]
        }

# Candidate values sampled by the randomised search for LightGBM
params_lgb = {
        'bagging_fraction': [0.5, 0.8],
        'bagging_freq': [3, 5, 8],
        'feature_fraction': [0.5, 0.8],
        'max_depth': [8, 10, 13],
        'min_data_in_leaf': [60, 90, 120],
        'num_leaves': [100, 1200, 1550]   
}

# Candidate values sampled by the randomised search for CatBoost
# NOTE(review): CatBoost's docs appear to tie min_data_in_leaf to the Depthwise/Lossguide
# grow policies — confirm it has an effect with the default policy used here.
params_cat = {
        'iterations': [100, 200, 400],
        'learning_rate': [0.01, 0.1 , 0.5],
        'depth': [4, 5, 6],    
        'min_data_in_leaf' : [50, 100, 150, 200]
}

# Candidate values for AdaBoost; only referenced by the commented-out AdaBoost search cell
params_ada = {
        #'max_depth': [3, 4, 5],
        'learning_rate' : [0.001, 0.01, 0.02, 0.05],
        'n_estimators' : [500, 1000, 1500]
    
}

In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [None]:
# 3-fold stratified, shuffled CV splitter with a fixed seed; reused by every hyper-parameter search
skf = StratifiedKFold(n_splits=3, shuffle = True, random_state = 2137)

In [None]:
# Base XGBoost model whose hyper-parameters are searched
tune_xgb = xgb.XGBClassifier(objective='binary:logistic', enable_categorical=True)

# Randomised search: 40 sampled settings, scored by ROC AUC on the stratified folds
rand_xgb = RandomizedSearchCV(
    tune_xgb,
    param_distributions=params_xgb,
    n_iter=40,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf.split(X_train, y_train),
    verbose=1,
    random_state=2137,
)
rand_xgb.fit(X_train, y_train)

# Class probabilities for the hold-out split (one column per class)
rand_xgb_pred = rand_xgb.predict_proba(X_test)

In [None]:
# Hold-out performance of the tuned XGBoost model
xgb_positive_proba = rand_xgb_pred[:, 1]
print(roc_auc_score(y_test, xgb_positive_proba))
# Rounding turns the probabilities into hard 0/1 predictions for the confusion matrix
print(confusion_matrix(y_test, np.round(xgb_positive_proba)))

In [None]:
# Print each feature name next to its importance in the best XGBoost model
best = rand_xgb.best_estimator_
best_features = best.feature_importances_
for feat, importance in zip(X.columns, best_features):
    print(feat, importance)

In [None]:
# Base LightGBM model whose hyper-parameters are searched
tune_lgb = lgb.LGBMClassifier()

# Randomised search: 40 sampled settings, scored by ROC AUC on the stratified folds
rand_lgb = RandomizedSearchCV(
    tune_lgb,
    param_distributions=params_lgb,
    n_iter=40,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf.split(X_train, y_train),
    verbose=1,
    random_state=2137,
)
rand_lgb.fit(X_train, y_train)

# Hold-out class probabilities and metrics for the tuned LightGBM model
rand_lgb_pred = rand_lgb.predict_proba(X_test)
lgb_positive_proba = rand_lgb_pred[:, 1]
print(roc_auc_score(y_test, lgb_positive_proba))
print(confusion_matrix(y_test, np.round(lgb_positive_proba)))

In [None]:
# Base CatBoost model whose hyper-parameters are searched
tune_cat = cb.CatBoostClassifier()

# Randomised search: 40 sampled settings, scored by ROC AUC on the stratified folds
rand_cat = RandomizedSearchCV(
    tune_cat,
    param_distributions=params_cat,
    n_iter=40,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf.split(X_train, y_train),
    verbose=1,
    random_state=2137,
)
rand_cat.fit(X_train, y_train)

# Hold-out class probabilities and metrics for the tuned CatBoost model
rand_cat_pred = rand_cat.predict_proba(X_test)
cat_positive_proba = rand_cat_pred[:, 1]
print(roc_auc_score(y_test, cat_positive_proba))
print(confusion_matrix(y_test, np.round(cat_positive_proba)))

In [None]:
#tune_ada = AdaBoostClassifier()
#rand_ada = RandomizedSearchCV(tune_ada, param_distributions=params_ada, n_iter=40, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=1, random_state = 2137 )
#rand_ada.fit(X_train, y_train)
#rand_ada_pred = rand_ada.predict_proba(X_test)
#print(roc_auc_score(y_test, rand_ada_pred[:, 1]))
#print(confusion_matrix(y_test, np.round(rand_ada_pred[:, 1])))

# I use 3 estimators (xgb, lgb and catboost) and combine them in a soft-voting ensemble (no separate meta estimator is trained)

In [None]:
# Re-create each model with the best hyper-parameters found by its randomised search
clf_xgb = xgb.XGBClassifier(**rand_xgb.best_params_,enable_categorical =  True, objective = 'binary:logistic')
clf_lgb = lgb.LGBMClassifier(**rand_lgb.best_params_)
clf_cat = cb.CatBoostClassifier(**rand_cat.best_params_)
#clf_ada = AdaBoostClassifier(**rand_ada.best_params_)
# NOTE(review): clf_lr is created here but is not passed to the voting ensemble in this notebook
clf_lr = LogisticRegression(class_weight = 'balanced', random_state = 2137)

In [None]:
# Soft-voting ensemble of the three tuned gradient-boosting models
clf_voting = VotingClassifier(
    estimators=[
        ('xgb', clf_xgb),
        ('lgb', clf_lgb),
        ('cat', clf_cat)],
    voting = 'soft',
    verbose=False
    
)

In [None]:
# Fit the ensemble on the 80% training split and predict class probabilities for the hold-out split
clf_voting.fit(X_train, y_train)
y_pred = clf_voting.predict_proba(X_test)

In [None]:
# Hold-out ROC AUC of the ensemble, using the probability column for class 1
print(roc_auc_score(y_test, y_pred[:, 1]))

In [None]:
# Drop the same non-feature columns that were removed from the training features
submission_features = test_df.drop(columns=['patient_gender','index', 'patient_id'])
sub_proba = clf_voting.predict_proba(submission_features)

# Store the predicted probability of class 1 on the original test frame
test['DiagPeriodL90D'] = sub_proba[:, 1]

# Submission table: patient id plus predicted probability
sub = test[['patient_id','DiagPeriodL90D']]
sub.head()

In [None]:
# Write the submission file to the current working directory, without the row index
sub.to_csv('submission.csv', index=False)