## **Import Depedencies**

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split

In [None]:
!pip install catboost
!pip install lightgbm
!pip install mlextend

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import EnsembleVoteClassifier

Dataset

In [5]:
'''input the dataset here (.csv)'''

fraud_final = pd.read_csv('fraud_detection_train.csv')

In [6]:
fraud_final.shape

(200217, 53)

## **Feature Engineering**

In [7]:
secondary_list = list(fraud_final.columns[11:33])
procedure_list = list(fraud_final.columns[33:52])

### New Feature

In [8]:
#@title Encode Binning
'''
Change value if more than 0.
'''
def encode_biner(x):
  if x != 0:
    return 1
  else:
    return 0

In [9]:
'''binary encoding secondary diagnose and sum of procedure'''

# secondary diagnose
for cols in secondary_list:
  fraud_final[f'{cols}'] = fraud_final[f'{cols}'].apply(encode_biner)

# procedure
for cols in procedure_list:
  fraud_final[f'{cols}'] = fraud_final[f'{cols}'].apply(encode_biner)

In [10]:
'''sum of secondary diagnose and sum of procedure'''

fraud_final['total_secondary'] = fraud_final[secondary_list].sum(axis=1)
fraud_final['total_procedure'] = fraud_final[procedure_list].sum(axis=1)

In [11]:
#@title dati2_kdkc
def concat_dati2_kdkc(row):
  row["dati2_kdkc"] = str(row["dati2"]) + "_" + str(row["kdkc"])
  return row

def apply_row1(row):
  row = concat_dati2_kdkc(row)
  return row

In [12]:
fraud_final = fraud_final.apply(lambda x: apply_row1(x), axis=1)

In [13]:
#@title diag_sev
def concat_diag_sev(row):
  row["diag_sev"] = str(row["diagprimer"]) + "_" + str(row["severitylevel"])
  return row

def apply_row2(row):
  row = concat_diag_sev(row)
  return row

In [14]:
fraud_final = fraud_final.apply(lambda x: apply_row2(x), axis=1)

In [15]:
#@title sev_los
def concat_sev_los(row):
  row["sev_los"] = str(row["severitylevel"]) + "_" + str(row["los"])
  return row

def apply_row3(row):
  row = concat_sev_los(row)
  return row

In [16]:
fraud_final = fraud_final.apply(lambda x: apply_row3(x), axis=1)

In [17]:
'''log los'''

fraud_final.drop(fraud_final[(fraud_final.los > 40)].index, inplace=True)
fraud_final['los_log'] = np.log(fraud_final.los + 2)

### Change Data Types

In [18]:
fraud_final['dati2'] = fraud_final['dati2'].astype(object)
fraud_final['severitylevel'] = fraud_final['severitylevel'].astype(object)

fraud_final['los_log'] = fraud_final['los_log'].astype(int)

In [19]:
'''binary encoding secondary diagnose and sum of procedure'''

# secondary diagnose
for cols in secondary_list:
  fraud_final[f'{cols}'] = fraud_final[f'{cols}'].astype(bool)

# procedure
for cols in procedure_list:
  fraud_final[f'{cols}'] = fraud_final[f'{cols}'].astype(bool)

### Drop Features

In [56]:
# fraud_final.drop(secondary_list, axis=1, inplace=True)
# fraud_final.drop(procedure_list, axis=1, inplace=True)

In [20]:
'''add more columns to drop if necessary'''

to_drop = ['visit_id', 'jnspelsep', 'los', 'jkpst', 'umur', 'kdkc']

fraud_final.drop(to_drop, axis=1, inplace=True)

In [21]:
fraud_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199980 entries, 0 to 200216
Data columns (total 53 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   dati2            199980 non-null  object
 1   typeppk          199980 non-null  object
 2   cmg              199980 non-null  object
 3   severitylevel    199980 non-null  object
 4   diagprimer       199980 non-null  object
 5   dx2_a00_b99      199980 non-null  bool  
 6   dx2_c00_d48      199980 non-null  bool  
 7   dx2_d50_d89      199980 non-null  bool  
 8   dx2_e00_e90      199980 non-null  bool  
 9   dx2_f00_f99      199980 non-null  bool  
 10  dx2_g00_g99      199980 non-null  bool  
 11  dx2_h00_h59      199980 non-null  bool  
 12  dx2_h60_h95      199980 non-null  bool  
 13  dx2_i00_i99      199980 non-null  bool  
 14  dx2_j00_j99      199980 non-null  bool  
 15  dx2_koo_k93      199980 non-null  bool  
 16  dx2_l00_l99      199980 non-null  bool  
 17  dx2_m00_m9

In [22]:
fraud_final.to_csv('grr.csv', index=False)

## **Model Splitting**

In [60]:
'''uncomment if used pre-cleaned dataset'''

# fraud_final = pd.read_csv('training.csv')

In [23]:
#@title Splitting Function
train_size =  0.8#@param {type:"number"}
rand_state =  10#@param {type:"number"}
def train_val_test_split(X, y):
    """Split dataset into data splits."""
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size, stratify=y, random_state=rand_state)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_, random_state=rand_state)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [24]:
X_final = fraud_final.drop("label", axis=1)
y_final = fraud_final.label.copy()

In [25]:
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X_final, y_final)

In [26]:
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_val: {X_test.shape}, y_val: {y_test.shape}")

X_train: (159984, 52), y_train: (159984,)
X_val: (19998, 52), y_val: (19998,)
X_val: (19998, 52), y_val: (19998,)


## **Modeling**

### Base Model

#### CatBoost

In [36]:
cat_features = [index for index, column in enumerate(X_final.columns) if X_final[column].dtype == "object"]

In [37]:
params = {
    "loss_function": "CrossEntropy",
    "eval_metric": "F1",
    "random_seed": 10,
    "task_type": "GPU",
    "cat_features": cat_features,
    "early_stopping_rounds": 200,
    "iterations": 2000,
    "verbose": 100,
    "max_depth": 6,
    "learning_rate": 0.10,
}

In [38]:
cat = CatBoostClassifier(**params)

In [39]:
cat.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6305692	test: 0.6691287	best: 0.6691287 (0)	total: 79.7ms	remaining: 2m 39s
100:	learn: 0.7150253	test: 0.7318851	best: 0.7319896 (94)	total: 6.09s	remaining: 1m 54s
200:	learn: 0.7201205	test: 0.7355559	best: 0.7358842 (195)	total: 12s	remaining: 1m 47s
300:	learn: 0.7239337	test: 0.7362903	best: 0.7367626 (289)	total: 17.9s	remaining: 1m 40s
400:	learn: 0.7262392	test: 0.7368847	best: 0.7370492 (323)	total: 23.7s	remaining: 1m 34s
500:	learn: 0.7283180	test: 0.7357927	best: 0.7370492 (323)	total: 29.5s	remaining: 1m 28s
bestTest = 0.7370491803
bestIteration = 323
Shrink model to first 324 iterations.


<catboost.core.CatBoostClassifier at 0x7f95520040d0>

In [40]:
cat_pred = cat.predict(X_test)
print(classification_report(y_test, cat_pred, labels=[0, 1], digits=5))

              precision    recall  f1-score   support

           0    0.73158   0.74529   0.73837      9980
           1    0.74143   0.72759   0.73445     10018

    accuracy                        0.73642     19998
   macro avg    0.73651   0.73644   0.73641     19998
weighted avg    0.73652   0.73642   0.73641     19998



In [74]:
from sklearn.model_selection import cross_validate

scoring = ['precision', 'recall', 'accuracy']
cv = StratifiedKFold(n_splits=5, random_state=10)
score = cross_validate(cat, X_final, y_final, scoring=scoring, cv=cv, n_jobs=-1, return_train_score=False)



In [75]:
avg_precision = score["test_precision"].mean()
avg_recall = score["test_recall"].mean()
avg_acc = score["test_accuracy"].mean()

print(f'Precision avg: {avg_precision}')
print(f'Recall avg: {avg_recall}')
print(f'Acc avg: {avg_acc}')

print(f'Possible score : {(avg_precision*0.4)+(avg_recall*0.3)+(avg_acc*0.3)}')

Precision avg: 0.7411903475319476
Recall avg: 0.7300785790248909
Acc avg: 0.7370737073707371
Possible score : 0.7366218249314674


#### LightGBM

In [33]:
X_lg = X_final.copy()
y_lg = y_final.copy()

catfeat = [column for column in list(X_lg.columns) if X_lg[column].dtype == "object"]

for col in catfeat:
  X_lg[col] = X_lg[col].astype('category')

X_train_lg, X_val_lg, X_test_lg, y_train_lg, y_val_lg, y_test_lg = train_val_test_split(X_lg, y_lg)

In [41]:
params = {
    "loss_function": "CrossEntropy",
    "eval_metric": "F1",
    "random_seed": 10,
    "categorical_feature": catfeat,
    "verbose": 100,
    "max_depth": 6,
    "learning_rate": 0.10,
    "num_iterations": 2000,
    "objective": 'binary'
}

In [None]:
lgb = LGBMClassifier(**params)
lgb.fit(X_train_lg, y_train_lg, eval_set=(X_val_lg, y_val_lg), feature_name='auto', categorical_feature='auto')

In [43]:
lgb_pred = lgb.predict(X_test_lg)
print(classification_report(y_test_lg, lgb_pred, labels=[0, 1], digits=5))

              precision    recall  f1-score   support

           0    0.73874   0.74088   0.73981      9980
           1    0.74112   0.73897   0.74004     10018

    accuracy                        0.73992     19998
   macro avg    0.73993   0.73993   0.73992     19998
weighted avg    0.73993   0.73992   0.73992     19998



In [71]:
from sklearn.model_selection import cross_validate

scoring = ['precision', 'recall', 'accuracy']
cv = StratifiedKFold(n_splits=5, random_state=10)
score = cross_validate(lgb, X_lg, y_lg, scoring=scoring, cv=cv, n_jobs=-1, return_train_score=False)



In [72]:
avg_precision = score["test_precision"].mean()
avg_recall = score["test_recall"].mean()
avg_acc = score["test_accuracy"].mean()

print(f'Precision avg: {avg_precision}')
print(f'Recall avg: {avg_recall}')
print(f'Acc avg: {avg_acc}')

print(f'Possible score : {(avg_precision*0.4)+(avg_recall*0.3)+(avg_acc*0.3)}')

Precision avg: 0.7351023504613979
Recall avg: 0.7428255546165528
Acc avg: 0.737068706870687
Possible score : 0.7380092186307311


### Voting

In [45]:
ensemble = EnsembleVoteClassifier(
    clfs=[cat, lgb],
    weights=[1.,1.5],
    voting='soft',
)

In [46]:
ensemble.fit(X_train_lg, y_train_lg)

0:	learn: 0.6305692	total: 72.4ms	remaining: 2m 24s
100:	learn: 0.7150253	total: 5.69s	remaining: 1m 47s
200:	learn: 0.7201125	total: 11s	remaining: 1m 38s
300:	learn: 0.7239508	total: 16.3s	remaining: 1m 31s
400:	learn: 0.7262312	total: 21.5s	remaining: 1m 25s
500:	learn: 0.7283180	total: 26.7s	remaining: 1m 19s
600:	learn: 0.7297431	total: 32s	remaining: 1m 14s
700:	learn: 0.7313349	total: 37.2s	remaining: 1m 8s
800:	learn: 0.7329001	total: 42.4s	remaining: 1m 3s
900:	learn: 0.7344385	total: 47.6s	remaining: 58s
1000:	learn: 0.7353789	total: 52.7s	remaining: 52.6s
1100:	learn: 0.7362452	total: 57.9s	remaining: 47.3s
1200:	learn: 0.7370377	total: 1m 3s	remaining: 41.9s
1300:	learn: 0.7381450	total: 1m 8s	remaining: 36.6s
1400:	learn: 0.7388627	total: 1m 13s	remaining: 31.3s
1500:	learn: 0.7398884	total: 1m 18s	remaining: 26.1s
1600:	learn: 0.7409328	total: 1m 23s	remaining: 20.8s
1700:	learn: 0.7416948	total: 1m 28s	remaining: 15.6s
1800:	learn: 0.7427954	total: 1m 33s	remaining: 10.4

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


EnsembleVoteClassifier(clfs=[<catboost.core.CatBoostClassifier object at 0x7f95520040d0>,
                             LGBMClassifier(boosting_type='gbdt',
                                            categorical_feature=['dati2',
                                                                 'typeppk',
                                                                 'cmg',
                                                                 'severitylevel',
                                                                 'diagprimer',
                                                                 'dati2_kdkc',
                                                                 'diag_sev',
                                                                 'sev_los'],
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            eval_metric='F1',
                                     

In [47]:
y_ensemble = ensemble.predict(X_test_lg)

In [48]:
print(classification_report(y_test_lg, y_ensemble, labels=[0, 1], digits=5))

              precision    recall  f1-score   support

           0    0.74135   0.74699   0.74416      9980
           1    0.74603   0.74037   0.74319     10018

    accuracy                        0.74367     19998
   macro avg    0.74369   0.74368   0.74367     19998
weighted avg    0.74369   0.74367   0.74367     19998



In [49]:
from sklearn.model_selection import cross_validate

scoring = ['precision', 'recall', 'accuracy']
cv = StratifiedKFold(n_splits=5, random_state=10)
score = cross_validate(ensemble, X_lg, y_lg, scoring=scoring, cv=cv, n_jobs=-1, return_train_score=False)



In [50]:
avg_precision = score["test_precision"].mean()
avg_recall = score["test_recall"].mean()
avg_acc = score["test_accuracy"].mean()

print(f'Precision avg: {avg_precision}')
print(f'Recall avg: {avg_recall}')
print(f'Acc avg: {avg_acc}')

print(f'Possible score : {(avg_precision*0.4)+(avg_recall*0.3)+(avg_acc*0.3)}')

Precision avg: 0.7411928932582599
Recall avg: 0.7436839610744878
Acc avg: 0.7415091509150915
Possible score : 0.7420350909001777


## Submit

In [51]:
validation = pd.read_csv('valbest.csv')

In [52]:
catt = [column for column in list(validation.columns) if validation[column].dtype == "object"]

for col in catt:
  validation[col] = validation[col].astype('category')

In [54]:
validation['dati2'] = validation['dati2'].astype('category')
validation['severitylevel'] = validation['severitylevel'].astype('category')

In [55]:
validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49762 entries, 0 to 49761
Data columns (total 52 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   dati2            49762 non-null  category
 1   typeppk          49762 non-null  category
 2   cmg              49762 non-null  category
 3   severitylevel    49762 non-null  category
 4   diagprimer       49762 non-null  category
 5   dx2_a00_b99      49762 non-null  bool    
 6   dx2_c00_d48      49762 non-null  bool    
 7   dx2_d50_d89      49762 non-null  bool    
 8   dx2_e00_e90      49762 non-null  bool    
 9   dx2_f00_f99      49762 non-null  bool    
 10  dx2_g00_g99      49762 non-null  bool    
 11  dx2_h00_h59      49762 non-null  bool    
 12  dx2_h60_h95      49762 non-null  bool    
 13  dx2_i00_i99      49762 non-null  bool    
 14  dx2_j00_j99      49762 non-null  bool    
 15  dx2_koo_k93      49762 non-null  bool    
 16  dx2_l00_l99      49762 non-null  bool   

In [56]:
validation_id = pd.read_csv('val_id.csv')

In [57]:
val_pred = ensemble.predict(validation)

In [64]:
lgb_predd = lgb.predict(validation)

In [62]:
sub = pd.DataFrame({"visit_id": validation_id.visit_id, "predict_label": val_pred})
sub.head()

Unnamed: 0,visit_id,predict_label
0,1,0
1,2,1
2,3,1
3,4,1
4,5,0


In [63]:
sub.to_csv('SEMOGABAGUS.csv', index=False)

In [70]:
sub.shape

(49762, 2)