In [1]:
import numpy as np
import pandas as pd


**Data Preparation**

In [2]:
# Load the main training table (it contains the `label` column that is used
# as the prediction target further down).
# The CSV lives on Google Drive. Mount the drive first when it is not there
# yet; otherwise a fresh "Restart & Run All" fails on this cell, because the
# only mount call of the notebook sits in its very last cell.
import os

if not os.path.isdir("/content/drive/MyDrive"):
    from google.colab import drive
    drive.mount("/content/drive")

data = pd.read_csv("/content/drive/MyDrive/Healthkathon_2022/sampling_healtkathon2022/sampling_healtkathon2022.csv")
data.head()

Unnamed: 0,id,id_peserta,dati2,typefaskes,usia,jenkel,pisat,tgldatang,tglpulang,jenispel,...,biaya,jenispulang,cbg,kelasrawat,kdsa,kdsp,kdsr,kdsi,kdsd,label
0,165666,486,17,KL,48,P,1.0,2018-07-25T17:00:00.000Z,2018-07-25T17:00:00.000Z,2,...,184300.0,1.0,Q-5-42-0,3,,,,,,0
1,1010828,520,17,A,63,L,1.0,2019-05-27T17:00:00.000Z,2019-05-30T17:00:00.000Z,1,...,10628400.0,1.0,D-4-13-III,1,,,,,,0
2,166042,523,17,KL,53,P,1.0,2019-07-16T17:00:00.000Z,2019-07-16T17:00:00.000Z,2,...,187300.0,1.0,Q-5-44-0,3,,,,,,0
3,168937,549,17,KL,54,P,1.0,2019-10-17T17:00:00.000Z,2019-10-17T17:00:00.000Z,2,...,187300.0,1.0,Q-5-44-0,3,,,,,,0
4,1005899,549,17,A,53,P,1.0,2018-04-18T17:00:00.000Z,2018-04-18T17:00:00.000Z,2,...,381600.0,1.0,Q-5-44-0,3,,,,,,0


In [None]:
# (rows, columns) of the training table.
data.shape

In [None]:
# Load the diagnosis table; it is later counted per `id` to get the number of
# diagnoses for each row of the training table.
DIAGNOSA_CSV = "/content/drive/MyDrive/Healthkathon_2022/sampling_healthkathon2022_diagnosa/sampling_healthkathon2022_diagnosa.csv"
diagnosa = pd.read_csv(DIAGNOSA_CSV)
diagnosa.head()

In [None]:
# (rows, columns) of the diagnosis table.
diagnosa.shape

In [None]:
# Load the procedure table; it is later counted per `id` to get the number of
# procedures for each row of the training table.
PROCEDURE_CSV = "/content/drive/MyDrive/Healthkathon_2022/sampling_healthkathon2022_procedure/sampling_healthkathon2022_procedure.csv"
proc = pd.read_csv(PROCEDURE_CSV)
proc.head()

In [None]:
# (rows, columns) of the procedure table.
proc.shape

**Data Preprocessing**

In [None]:
# Compute the number of days the patient stayed in hospital.
# Step 1: parse the arrival / discharge columns into pandas datetimes.
from datetime import datetime
for date_column in ("tgldatang", "tglpulang"):
    data[date_column] = pd.to_datetime(data[date_column])

In [None]:
# Length of stay in whole calendar days (discharge date minus arrival date).
# Both columns are normalised to midnight first, so the subtraction stays a
# datetime64 operation and always yields a timedelta64 Series. Subtracting
# the object-dtype `.dt.date` values instead relies on pandas inferring a
# timedelta dtype from Python objects; when the result stays object dtype,
# the `.dt.days` accessor raises.
discharge_day = data["tglpulang"].dt.normalize()
arrival_day = data["tgldatang"].dt.normalize()
lama = (discharge_day - arrival_day).dt.days
# One-column frame so it can be concatenated next to the other features.
delta = lama.to_frame(name="lama")

In [None]:
# Feature table: the training table without the target and the two raw date
# columns, with the length-of-stay column appended on the right.
without_target_and_dates = data.drop(columns=["label", "tgldatang", "tglpulang"])
data1 = pd.concat([without_target_and_dates, delta], axis="columns")

In [None]:
# Count the number of diagnoses given to the patient:
# one row per `id`, with the number of diagnosis rows in `jml_diag`.
diag = diagnosa.groupby("id").size().reset_index(name="jml_diag")


In [None]:
# Attach the diagnosis count; the left join keeps every row of `data1`
# (ids without any diagnosis row get NaN here and are filled later).
df = data1.merge(diag, how="left", on="id")
df.shape

In [None]:
# Count the number of procedures given to the patient:
# one row per `id`, with the number of procedure rows in `jml_procc`.
# Note: this rebinds `proc` from the raw table to the aggregated one, so the
# cell must not be re-run without reloading `proc` first.
proc = proc.groupby("id").size().reset_index(name="jml_procc")

In [None]:
# Keep only the first row for every `id` before joining.
is_repeated_id = proc.duplicated(subset=["id"])
proc = proc.loc[~is_repeated_id]
proc.shape

In [None]:
# Attach the procedure count; the left join keeps every row of `df`.
df = df.merge(proc, how="left", on="id")
df.shape

In [None]:
# Number of missing values per column, to decide what needs filling.
df.isna().sum()

In [None]:
# Fill the missing values, column by column.
# NOTE(review): the rows displayed for `jenkel` use upper-case "P"/"L", so the
# lower-case "p" fill becomes a category of its own - confirm this is intended.
# NOTE(review): 237150.0 / 1.0 / "P03.4" are hard-coded replacements whose
# origin is not visible here (presumably typical values) - TODO confirm.
fill_values = {
    "jenkel": "p",
    "pisat": 0,
    "politujuan": "tidak",
    "biaya": 237150.0,
    "jenispulang": 1.0,
    "kdsa": "None",
    "kdsp": "None",
    "kdsr": "None",
    "kdsi": "None",
    "kdsd": "None",
    "jml_diag": 0,
    "diagfktp": "P03.4",
    "jml_procc": 0,
}
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)

In [None]:
# Column dtypes and non-null counts after filling.
df.info()

In [None]:
# Convert the string attributes to numbers: every distinct value of a column
# is replaced by an integer code, numbered in order of first appearance
# (pd.factorize).
text_columns = ["typefaskes", "jenkel", "politujuan", "diagfktp", "cbg", "kdsa", "kdsp", "kdsr", "kdsi", "kdsd"]
df_str = pd.DataFrame(
    {name: pd.factorize(df[name])[0] for name in text_columns},
    index=df.index,
)
df_str.head()

In [None]:
# Separate out the columns that are already numeric (float and int).
numeric_columns = ["dati2", "usia", "jenispel", "pisat", "biaya", "jenispulang", "kelasrawat", "lama", "jml_diag"]
df_int = df.loc[:, numeric_columns]
df_int.head()

In [None]:
# Assemble the training feature matrix: the numeric columns, the
# integer-coded text columns and, last, the procedure count `jml_procc`.
# `jml_procc` is computed and NaN-filled for the training data, but it was
# never added to the features, while the prediction section passes it to the
# model as the last of 20 columns. The model was therefore fitted on 19
# columns and prediction failed on the feature mismatch. It is appended at the
# end so the training column order equals the order used for prediction.
data_pre = pd.concat([df_int, df_str, df[["jml_procc"]]], axis = 1)
data_pre.head()

In [None]:
# Put the target column back next to the prepared features.
# Note: `df` is rebound here from the merged table to the encoded table.
label = data.loc[:, ["label"]]
df = pd.concat([data_pre, label], axis="columns")
df.shape

In [None]:
# Split the table into the target (`label`) and the features (`atribut`).
label = df["label"]
atribut = df.loc[:, df.columns != "label"]

In [None]:
# Standardise the data: fit a StandardScaler on the training features and
# keep the result as a DataFrame with the original index and column names.
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
scaler = StandardScaler()
atribut1 = pd.DataFrame(
    scaler.fit_transform(atribut),
    index=atribut.index,
    columns=atribut.columns,
)
atribut1.head()

**Data Processing**

In [None]:
# Select the attributes and the label used for training.
x_train, y_train = atribut1, label

In [None]:
# Model the data: fit a decision tree on the standardised training features.
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
tree_settings = dict(
    random_state=0,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0,
    max_leaf_nodes=None,
    min_impurity_decrease=0,
)
model = DecisionTreeClassifier(**tree_settings)
# `fit` is called on `model`, so `model` itself is the fitted estimator used below.
clf = model.fit(atribut1, label)

In [None]:
# Predict on the same rows the model was fitted on, then show how many
# predictions were produced.
pred = model.predict(atribut1)
pred.shape[0]

In [None]:
# Wrap the predictions in a one-column frame named like the target.
hasil = pd.DataFrame({"label": pred})
hasil.head()

**Data Evaluation**

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score 

In [None]:
# Check the accuracy and validity of the model.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first transposes the confusion matrix and swaps precision and
# recall in the classification report.
# Caveat: `hasil` holds predictions for the very rows the tree was fitted on,
# so these numbers describe the fit on the training data, not performance on
# unseen data.
y_true = label
y_pred = hasil["label"]

cm1 = confusion_matrix(y_true, y_pred)
print("confusion matrix")
print(cm1)

report = classification_report(y_true, y_pred)
# The fitted model is a decision tree, not Naive Bayes.
print("tingkat akurasi algoritma Decision Tree")
print("Akurasi :", report)

akurasi1 = accuracy_score(y_true, y_pred)
# Two decimals instead of %d, which silently truncated e.g. 99.9 to 99.
print("Tingkat Akurasi :%.2f persen" % (akurasi1 * 100))

**Data Prediction**

Merupakan tahap prediksi data terhadap data yang sudah diberi pada tahap 2 kompetisi

In [None]:
# Load the stage-2 table for which labels have to be predicted.
STAGE2_SEP_CSV = "/content/drive/MyDrive/Healthkathon_2022/Healthkathon 2022 - Machine Learning Tahap 2/sampling2_healthkathon2022_sep.csv"
data2 = pd.read_csv(STAGE2_SEP_CSV)
data2.head()

In [None]:
# (rows, columns) of the stage-2 table.
data2.shape

In [None]:
# Load the stage-2 diagnosis table (counted per `id` further down).
STAGE2_DIAGNOSA_CSV = "/content/drive/MyDrive/Healthkathon_2022/Healthkathon 2022 - Machine Learning Tahap 2/sampling2_healthkathon2022_diagnosa.csv"
diag2 = pd.read_csv(STAGE2_DIAGNOSA_CSV)
diag2.head()

In [None]:
# Load the stage-2 procedure table (counted per `id` further down).
STAGE2_PROCEDURE_CSV = "/content/drive/MyDrive/Healthkathon_2022/Healthkathon 2022 - Machine Learning Tahap 2/sampling2_healthkathon_2022_procedure.csv"
proc2 = pd.read_csv(STAGE2_PROCEDURE_CSV)
proc2.head()

In [None]:
# Parse the arrival / discharge columns of the stage-2 table into datetimes.
from datetime import datetime
for date_column in ("tgldatang", "tglpulang"):
    data2[date_column] = pd.to_datetime(data2[date_column])

In [None]:
# Length of stay in whole calendar days for the stage-2 rows.
# Both columns are normalised to midnight first, so the subtraction stays a
# datetime64 operation and always yields a timedelta64 Series. Subtracting
# the object-dtype `.dt.date` values instead relies on pandas inferring a
# timedelta dtype from Python objects; when the result stays object dtype,
# the `.dt.days` accessor raises.
discharge_day = data2["tglpulang"].dt.normalize()
arrival_day = data2["tgldatang"].dt.normalize()
lama = (discharge_day - arrival_day).dt.days
# One-column frame so it can be concatenated next to the other features.
delta = lama.to_frame(name="lama")

In [None]:
# Replace the two raw date columns by the length-of-stay column.
# Note: `data2` is rebound here, so re-running this cell without reloading
# `data2` fails on the already-dropped columns.
without_dates = data2.drop(columns=["tgldatang", "tglpulang"])
data2 = pd.concat([without_dates, delta], axis="columns")

In [None]:
# Preview the stage-2 table after the date columns were replaced by `lama`.
data2.head()

In [None]:
# Count the number of diagnoses given to the patient (stage-2 data):
# one row per `id`, with the number of diagnosis rows in `jml_diag`.
# Note: this rebinds `diag2` from the raw table to the aggregated one.
diag2 = diag2.groupby("id").size().reset_index(name="jml_diag")


In [None]:
# Attach the diagnosis count; the left join keeps every row of `data2`.
df2 = data2.merge(diag2, how="left", on="id")
df2.shape

In [None]:
# Count the number of procedures given to the patient (stage-2 data):
# one row per `id`, with the number of procedure rows in `jml_procc`.
# Note: this rebinds `proc2` from the raw table to the aggregated one.
proc2 = proc2.groupby("id").size().reset_index(name="jml_procc")

In [None]:
# Keep only the first row for every `id` before joining.
is_repeated_id = proc2.duplicated(subset=["id"])
proc2 = proc2.loc[~is_repeated_id]
proc2.shape

In [None]:
# Attach the procedure count; the left join keeps every row of `df2`.
df2 = df2.merge(proc2, how="left", on="id")
df2.shape

In [None]:
# Number of missing values per column in the stage-2 table.
df2.isna().sum()

In [None]:
# Fill the missing values of the stage-2 table, column by column, with the
# same replacement values that are used for the training table.
fill_values_stage2 = {
    "jenkel": "p",
    "pisat": 0,
    "politujuan": "tidak",
    "diagfktp": "P03.4",
    "biaya": 237150.0,
    "kdsa": "None",
    "kdsp": "None",
    "kdsr": "None",
    "kdsi": "None",
    "kdsd": "None",
    "jml_diag": 0,
    "jml_procc": 0,
    "jenispulang": 1.0,
}
for column_name, replacement in fill_values_stage2.items():
    df2[column_name] = df2[column_name].fillna(replacement)

In [None]:
# Integer-code the text columns of the stage-2 table with the SAME codes the
# training data received.
# pd.factorize numbers values in order of first appearance, so factorizing the
# stage-2 table on its own gives one and the same category a different code
# than in training, and the fitted tree then misreads every text feature.
# The training categories are rebuilt from `data` (the training table loaded
# at the top, which still holds the raw text values) after applying the same
# fill values the training section uses, and each stage-2 value is mapped to
# its position in that list. Values never seen in training get -1, the code
# factorize itself uses for "no category".
categorical_columns = ["typefaskes", "jenkel", "politujuan", "diagfktp", "cbg", "kdsa", "kdsp", "kdsr", "kdsi", "kdsd"]
# Fill values applied to the training table before it was factorized.
training_fill_values = {
    "jenkel": "p",
    "politujuan": "tidak",
    "diagfktp": "P03.4",
    "kdsa": "None",
    "kdsp": "None",
    "kdsr": "None",
    "kdsi": "None",
    "kdsd": "None",
}
stage2_codes = {}
for column_name in categorical_columns:
    training_column = data[column_name]
    if column_name in training_fill_values:
        training_column = training_column.fillna(training_fill_values[column_name])
    # factorize()[1] is the list of distinct values; position == training code.
    training_categories = pd.factorize(training_column)[1]
    stage2_codes[column_name] = pd.Categorical(
        df2[column_name], categories=training_categories
    ).codes.astype("int64")
df_str2 = pd.DataFrame(stage2_codes, index=df2.index)
df_str2.head()

In [None]:
# Columns of the stage-2 table that are already numeric.
numeric_columns_stage2 = ["dati2", "usia", "jenispel", "pisat", "biaya", "jenispulang", "kelasrawat", "lama", "jml_diag", "jml_procc"]
df_int2 = df2.loc[:, numeric_columns_stage2]
df_int2.head()

In [None]:
# Stage-2 feature matrix: numeric columns followed by the integer-coded text columns.
data_pre2 = pd.concat([df_int2, df_str2], axis="columns")
data_pre2.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

# Standardise the stage-2 features with the scaler that was fitted on the
# TRAINING features (`scaler`, fitted on `atribut` in the preprocessing
# section). Fitting a fresh StandardScaler on the stage-2 table centres and
# scales with different statistics, so the split thresholds learned by the
# tree no longer line up with the values it is asked to classify.
# Selecting `atribut.columns` hands the scaler and the model exactly the
# columns, in exactly the order, they were fitted on, instead of relying on a
# hand-written column list that can drift away from the training features.
train_columns = list(atribut.columns)
atribut2 = pd.DataFrame(
    scaler.transform(data_pre2[train_columns]),
    index=data_pre2.index,
    columns=train_columns,
)
atribut2.shape

In [None]:
# Predict the label of every stage-2 row and show how many predictions were made.
pred2 = model.predict(atribut2)
pred2.shape[0]

In [None]:
# Wrap the stage-2 predictions in a one-column frame named `label`.
hasil2 = pd.DataFrame({"label": pred2})
hasil2.head()

In [None]:
# Answer table: the `id` of each stage-2 row next to its predicted label
# (the two pieces are aligned on their row index).
stage2_ids = data2["id"]
result2 = pd.concat([stage2_ids, hasil2], axis="columns")
result2.shape

In [None]:
# Preview the answer table before it is written out.
result2.head()

In [None]:
# Write the answer file to Google Drive.
# The drive is mounted at the absolute path that the output path below
# expects. The former relative mount point 'drive' only coincided with
# '/content/drive' while the working directory happened to be '/content'.
from google.colab import drive
drive.mount('/content/drive')
result2.to_csv('/content/drive/MyDrive/Healthkathon_2022/Healthkathon 2022 - Machine Learning Tahap 2/answer.csv', encoding='utf-8', index=False)