In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, QuantileTransformer, RobustScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Read the labelled training split and the unlabelled test split from the
# working directory.
training, testing = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Report (rows, columns) for both splits.
train_shape, test_shape = training.shape, testing.shape
print("Size of training dataset:", train_shape)
print("Size of testing dataset:", test_shape)

In [None]:
# Summary statistics for every training column (include='all' also covers
# non-numeric columns, if any).
training_summary = training.describe(include='all')
print('Training dataset:\n', training_summary)

In [None]:
# Summary statistics for every testing column (include='all' also covers
# non-numeric columns, if any).
testing_summary = testing.describe(include='all')
print('Testing dataset:\n', testing_summary)

In [None]:
# One boxplot per measurement column of the raw training data, each in its
# own figure, to eyeball outliers before any cleaning.
for feature in ['KetebalanKulit', 'Insulin', 'Glukosa', 'TekananDarah', 'IMT']:
    plt.figure()
    sns.boxplot(x=training[feature])
    plt.show()

In [None]:
# One boxplot per measurement column of the raw testing data, each in its
# own figure, mirroring the training-set plots.
for feature in ['KetebalanKulit', 'Insulin', 'Glukosa', 'TekananDarah', 'IMT']:
    plt.figure()
    sns.boxplot(x=testing[feature])
    plt.show()

In [None]:
# Fill the zero values in the training dataset with the column MEDIAN.
#
# The median is taken over the non-zero entries only: the zeros are the
# placeholders being replaced, so letting them take part in the median drags
# the fill value down (and makes it 0 -- i.e. a no-op -- whenever at least
# half of a column is zero).
for column in ['Glukosa', 'TekananDarah', 'KetebalanKulit', 'Insulin', 'IMT']:
    observed = training.loc[training[column] != 0, column]
    if observed.empty:
        # Nothing but zeros: there is no usable median, leave the column as is.
        continue
    training[column] = training[column].replace(0, observed.median())

In [None]:
# Fill the zero values in the testing dataset with the column MEDIAN.
#
# The median is taken over the non-zero entries only: the zeros are the
# placeholders being replaced, so letting them take part in the median drags
# the fill value down (and makes it 0 -- i.e. a no-op -- whenever at least
# half of a column is zero).
for column in ['Glukosa', 'TekananDarah', 'KetebalanKulit', 'Insulin', 'IMT']:
    observed = testing.loc[testing[column] != 0, column]
    if observed.empty:
        # Nothing but zeros: there is no usable median, leave the column as is.
        continue
    testing[column] = testing[column].replace(0, observed.median())

In [None]:
# Cap outliers in the selected columns at the 1.5 * IQR fences
# (values above Q3 + 1.5*IQR or below Q1 - 1.5*IQR are pulled to the fence).
outlier_cols = ['Glukosa', 'TekananDarah', 'KetebalanKulit', 'Insulin']

Q1 = training[outlier_cols].quantile(0.25)
Q3 = training[outlier_cols].quantile(0.75)

IQR = Q3 - Q1

max_val = Q3 + (1.5 * IQR)
min_val = Q1 - (1.5 * IQR)

# Compare only the columns the fences were computed for. Comparing the whole
# frame against a Series that covers a subset of its columns relies on
# implicit alignment, which pandas 2.x rejects with "Operands are not aligned".
# The masks are then widened back to the full frame (all False elsewhere) so
# they keep the same shape as `training`.
upper_bound = (
    training[outlier_cols]
    .gt(max_val, axis=1)
    .reindex(columns=training.columns, fill_value=False)
)
lower_bound = (
    training[outlier_cols]
    .lt(min_val, axis=1)
    .reindex(columns=training.columns, fill_value=False)
)

# Work on a copy so `training` itself is left untouched; columns outside
# `outlier_cols` are carried over unchanged.
train_clean = training.copy()
train_clean[outlier_cols] = training[outlier_cols].clip(lower=min_val, upper=max_val, axis=1)

In [None]:
# Re-plot two of the capped columns to check the effect of the IQR capping.
for feature in ['KetebalanKulit', 'Insulin']:
    plt.figure()
    sns.boxplot(x=train_clean[feature])
    plt.show()

In [None]:
# Kernel-density estimate of every column of the cleaned training data,
# laid out on a 9 x 3 subplot grid inside a single tall figure.
plt.figure(figsize=(20, 30))
for position, column_name in enumerate(train_clean.columns, start=1):
    plt.subplot(9, 3, position)
    sns.kdeplot(x=train_clean[column_name])
plt.show()

In [None]:
# Shape of the cleaned frame. Note: train_clean is built with DataFrame.mask,
# which replaces values rather than dropping rows, so outliers are capped,
# not removed, despite the wording of the message below.
print('Size of training dataset after removing outliers:', train_clean.shape)

In [None]:
# Annotated heatmap of the pairwise Spearman rank correlations between all
# columns of the cleaned training data.
spearman_corr = train_clean.corr(method='spearman')

plt.figure(figsize=(13, 8))
sns.heatmap(spearman_corr, annot=True)
plt.show()

In [None]:
# Separate the label column from the predictors.
y_training = train_clean['Target']
X_training = train_clean.drop(columns='Target')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned training data for validation; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_training, y_training, test_size=0.2, random_state=0
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with library defaults. The trailing expression displays its
# parameter dictionary as the cell output, as a reference for the grid below.
dtc = DecisionTreeClassifier()
dtc.get_params()

In [None]:
# Grid-search the decision tree over pruning strength and split criterion,
# with the search wrapped behind a standard-scaling step.
dtc_param_grid = {
    'ccp_alpha': np.linspace(0, 10, 10),
    'criterion': ['gini', 'entropy'],
}
grid = GridSearchCV(dtc, param_grid=dtc_param_grid)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('cv', grid),
])
model = pipe.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

# Random forest with library defaults except for n_jobs=3. The trailing
# expression displays its parameter dictionary as the cell output.
rfc = RandomForestClassifier(n_jobs=3)
rfc.get_params()

In [None]:
# Grid-search the random forest over pruning strength and split criterion,
# with the search wrapped behind a standard-scaling step.
#
# `n_jobs` is intentionally NOT part of the grid: it only sets how many
# workers the forest uses, so searching over it multiplies the number of fits
# without comparing different models, and nests extra parallelism under the
# already-parallel search (n_jobs=-1 below).
grid = GridSearchCV(rfc, param_grid=dict(
                                        ccp_alpha=np.linspace(0, 10, 10),
                                        criterion=['gini', 'entropy']),
                                        n_jobs=-1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('cv', grid)])
model = pipe.fit(X_train, y_train)

In [None]:
# Gradient boosting classifier with library defaults. The trailing expression
# displays its parameter dictionary as the cell output.
gradient = GradientBoostingClassifier()
gradient.get_params()

In [None]:
# Grid-search gradient boosting over pruning strength and split criterion,
# with the search wrapped behind a standard-scaling step.
#
# Criterion names follow current scikit-learn: 'mse' was renamed to
# 'squared_error' and 'mae' is no longer accepted, so the old names make
# those candidates fail to fit.
# NOTE(review): 'squared_error' needs scikit-learn >= 1.0 -- confirm the
# environment's version.
grid = GridSearchCV(gradient, param_grid=dict(
                                        ccp_alpha=np.linspace(0, 10, 10),
                                        criterion=['friedman_mse', 'squared_error'],
                                        learning_rate=[1.0]),
                                        n_jobs=-1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('cv', grid)])
model = pipe.fit(X_train, y_train)

In [None]:
# AdaBoost classifier with library defaults. The trailing expression displays
# its parameter dictionary as the cell output.
ada_boost = AdaBoostClassifier()
ada_boost.get_params()

In [None]:
# Fit AdaBoost (no hyper-parameter search) behind a standard-scaling step.
ada_steps = [
    ('scaler', StandardScaler()),
    ('model', ada_boost),
]
pipe = Pipeline(ada_steps)
model = pipe.fit(X_train, y_train)

In [None]:
# Extra-trees classifier with library defaults. The trailing expression
# displays its parameter dictionary as the cell output.
ex_tree = ExtraTreesClassifier()
ex_tree.get_params()

In [None]:
# Grid-search extra-trees over pruning strength, split criterion and forest
# size, with the search wrapped behind a standard-scaling step.
#
# The ccp_alpha range starts at 0 (no pruning), matching the other grids in
# this notebook. Starting at 1 left the unpruned model out of the search.
# NOTE(review): alphas this large are expected to prune every tree down to
# its root; a much finer range near 0 is probably what is wanted -- confirm.
grid = GridSearchCV(ex_tree, param_grid=dict(
                                            ccp_alpha=np.linspace(0, 10, 10),
                                            criterion=['gini', 'entropy'],
                                            n_estimators=[10, 100, 1000]
), n_jobs=-1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('cv', grid)])
model = pipe.fit(X_train, y_train)

In [None]:
# Predict on the held-out split. `model` is whichever pipeline was fitted
# last; when the cells run top to bottom that is the extra-trees grid search.
predict = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Hold-out accuracy of the most recently fitted pipeline.
# accuracy_score expects (y_true, y_pred); the arguments were previously
# passed the other way round.
acc = accuracy_score(y_test, predict)
print("Model score: {}".format(acc))

In [None]:
# Predict labels for the test split with the most recently fitted pipeline.
# The 'index' column is dropped before predicting. Unlike train_clean, the
# testing frame has not been through the IQR capping step.
finale_pred = model.predict(testing.drop('index', axis=1))

In [None]:
# Assemble the output table: the test rows' 'index' values next to the
# predicted 'Target'.
hasil = pd.DataFrame({
    'index': testing['index'],
    'Target': finale_pred,
})

In [None]:
# Write the predictions to HASIL.csv in the working directory, without the
# DataFrame's row index.
hasil.to_csv("HASIL.csv",  index=False)