In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


def find_best_model(X_train, X_test, y_train, y_test):
    X_train = X_train._get_numeric_data()
    X_test = X_test._get_numeric_data()
    y_train = y_train._get_numeric_data()
    y_test = y_test._get_numeric_data()
    # Logistic Regression
    logreg = LogisticRegression(max_iter = 600, random_state = 20)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    logreg_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
    # Decision Tree
    decision_tree = DecisionTreeClassifier(random_state = 42)
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    decision_tree_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
    # Random Forest
    random_forest = RandomForestClassifier(random_state = 42)
    random_forest.fit(X_train, y_train)
    y_pred = random_forest.predict(X_test)
    random_forest_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
    # XGBoost
    xgb = XGBClassifier(random_state = 42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    xgb_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
    # GBM
    gbm = GradientBoostingClassifier(random_state = 42)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)
    gbm_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
    # LightGBM
    lgbm = LGBMClassifier(random_state = 42)
    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_test)
    lgbm_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
        
    # Catboost
    catb = CatBoostClassifier(verbose = 0, random_state = 42)
    catb.fit(X_train, y_train)
    y_pred = catb.predict(X_test)
    catb_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
    # Histogram-based Gradient Boosting Classification Tree
    hgb = HistGradientBoostingClassifier(random_state = 42)
    hgb.fit(X_train, y_train)
    y_pred = hgb.predict(X_test)
    hgb_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
    model_df = pd.DataFrame({'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'GBM', 'LightGBM', 'Catboost', 'HistBoost'],
                       'Score': [logreg_acc, decision_tree_acc, random_forest_acc, xgb_acc, gbm_acc, lgbm_acc, catb_acc, hgb_acc]})
    print(model_df.sort_values('Score', ascending = False).reset_index(drop = True))
    print()
    print('The Best Model:')
    print(model_df.sort_values('Score', ascending = False).reset_index(drop = True).iloc[0]['Model'])
    return model_df.sort_values('Score', ascending = False).reset_index(drop = True).iloc[0]['Model']



  "Since version 1.0, "


#Heart_data

In [None]:
import pandas as pd
import numpy as np

In [None]:
heart_df = pd.read_csv('heart-all.csv')
heart_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,4,140,260,0,1,112,1,3,2,?,?,2
1,44,1,4,130,209,0,1,127,0,0,?,?,?,0
2,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2
3,55,1,4,142,228,0,1,149,1,2.5,1,?,?,1
4,66,1,3,110,213,1,2,99,1,1.3,2,?,?,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,45,1,1,110,264,0,0,132,0,1.2,2,0,7,1
916,68,1,4,144,193,1,0,141,0,3.4,2,2,7,2
917,57,1,4,130,131,0,0,115,1,1.2,2,1,7,3
918,57,0,2,130,236,0,2,174,0,0,2,1,3,1


In [None]:
def data_check(dataframe):
    print(dataframe.info())
    print()
    print('Data_unique:')
    for i in dataframe.columns:
        if dataframe[i].dtype =='object':
            print(i, pd.unique(dataframe[i]))

In [None]:
data_check(heart_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       920 non-null    int64 
 1   sex       920 non-null    int64 
 2   cp        920 non-null    int64 
 3   trestbps  920 non-null    object
 4   chol      920 non-null    object
 5   fbs       920 non-null    object
 6   restecg   920 non-null    object
 7   thalach   920 non-null    object
 8   exang     920 non-null    object
 9   oldpeak   920 non-null    object
 10  slope     920 non-null    object
 11  ca        920 non-null    object
 12  thal      920 non-null    object
 13  num       920 non-null    int64 
dtypes: int64(4), object(10)
memory usage: 100.8+ KB
None

Data_unique:
trestbps ['140' '130' '132' '142' '110' '120' '150' '180' '160' '126' '?' '128'
 '170' '152' '116' '124' '0' '122' '144' '154' '125' '104' '136' '134'
 '138' '178' '146' '135' '158' '106' '112' '102' '96' '172' '155

In [None]:
heart_df['num']  = heart_df['num'].apply(lambda x: 1 if x>=1 else 0) #'num' is target set 0/1
cp_df = pd.get_dummies(heart_df['cp'], prefix='cp')
slope_df = pd.get_dummies(heart_df['slope'], prefix='slope')
thal_df = pd.get_dummies(heart_df['thal'], prefix='thal')
heart_df = pd.concat([heart_df, cp_df, slope_df, thal_df], axis=1)
heart_df = heart_df.drop(columns=['cp', 'thal', 'slope'])
heart_df.rename(columns={'num':'target'},inplace=True)
heart_df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_3,cp_4,slope_1,slope_2,slope_3,slope_?,thal_3,thal_6,thal_7,thal_?
0,63,1,140,260,0,1,112,1,3,?,...,0,1,0,1,0,0,0,0,0,1
1,44,1,130,209,0,1,127,0,0,?,...,0,1,0,0,0,1,0,0,0,1
2,60,1,132,218,0,1,140,1,1.5,?,...,0,1,0,0,1,0,0,0,0,1
3,55,1,142,228,0,1,149,1,2.5,?,...,0,1,1,0,0,0,0,0,0,1
4,66,1,110,213,1,2,99,1,1.3,?,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,45,1,110,264,0,0,132,0,1.2,0,...,0,0,0,1,0,0,0,0,1,0
916,68,1,144,193,1,0,141,0,3.4,2,...,0,1,0,1,0,0,0,0,1,0
917,57,1,130,131,0,0,115,1,1.2,1,...,0,1,0,1,0,0,0,0,1,0
918,57,0,130,236,0,2,174,0,0,1,...,0,0,0,1,0,0,1,0,0,0


In [None]:
#填入中位數
def fillna_median(dataframe, target_column):
    for i in dataframe:
        for x in range(len(dataframe[i])):
            try:
                dataframe[i][x] = float(dataframe[i][x])
            except:
                dataframe[i][x] = np.nan
    #平均 .mean() 中位數 .median() 眾數 .mode())
    for i in dataframe:
        dataframe.loc[target_column==1, i] = dataframe.loc[target_column==1, i].fillna(dataframe.loc[target_column==1, i].median(), inplace=False)
        dataframe.loc[target_column==0, i] = dataframe.loc[target_column==0, i].fillna(dataframe.loc[target_column==0, i].median(), inplace=False)
    return dataframe

In [None]:
#填入平均數
def fillna_mean(dataframe, target_column):
    for i in dataframe:
        for x in range(len(dataframe[i])):
            try:
                dataframe[i][x] = float(dataframe[i][x])
            except:
                dataframe[i][x] = np.nan
    #平均 .mean() 中位數 .median() 眾數 .argmax(np.bincount())
    for i in dataframe:
        dataframe.loc[target_column==1, i] = dataframe.loc[target_column==1, i].fillna(dataframe.loc[target_column==1, i].mean(), inplace=False)
        dataframe.loc[target_column==0, i] = dataframe.loc[target_column==0, i].fillna(dataframe.loc[target_column==0, i].mean(), inplace=False)
    return dataframe

In [None]:
#填入眾數
def fillna_mode(dataframe, target_column):
    for i in dataframe:
        for x in range(len(dataframe[i])):
            try:
                dataframe[i][x] = float(dataframe[i][x])
            except:
                dataframe[i][x] = np.nan
    #平均 .mean() 中位數 .median() 眾數 .argmax(np.bincount())
    for i in dataframe:
        dataframe.loc[target_column==1, i] = dataframe.loc[target_column==1, i].fillna(dataframe.loc[target_column==1, i].mode().iloc[0], inplace=False)
        dataframe.loc[target_column==0, i] = dataframe.loc[target_column==0, i].fillna(dataframe.loc[target_column==0, i].mode().iloc[0], inplace=False)
    return dataframe

In [None]:
fillna_mode(heart_df, heart_df['target'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_3,cp_4,slope_1,slope_2,slope_3,slope_?,thal_3,thal_6,thal_7,thal_?
0,63,1,140.0,260.0,0.0,1.0,112.0,1.0,3.0,0.0,...,0,1,0,1,0,0,0,0,0,1
1,44,1,130.0,209.0,0.0,1.0,127.0,0.0,0.0,0.0,...,0,1,0,0,0,1,0,0,0,1
2,60,1,132.0,218.0,0.0,1.0,140.0,1.0,1.5,0.0,...,0,1,0,0,1,0,0,0,0,1
3,55,1,142.0,228.0,0.0,1.0,149.0,1.0,2.5,0.0,...,0,1,1,0,0,0,0,0,0,1
4,66,1,110.0,213.0,1.0,2.0,99.0,1.0,1.3,0.0,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,45,1,110.0,264.0,0.0,0.0,132.0,0.0,1.2,0.0,...,0,0,0,1,0,0,0,0,1,0
916,68,1,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,...,0,1,0,1,0,0,0,0,1,0
917,57,1,130.0,131.0,0.0,0.0,115.0,1.0,1.2,1.0,...,0,1,0,1,0,0,0,0,1,0
918,57,0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,1.0,...,0,0,0,1,0,0,1,0,0,0


In [None]:
print(heart_df.isnull().sum())

age         0
sex         0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
ca          0
target      0
cp_1        0
cp_2        0
cp_3        0
cp_4        0
slope_1     0
slope_2     0
slope_3     0
slope_?     0
thal_3      0
thal_6      0
thal_7      0
thal_?      0
dtype: int64


In [None]:
x_data = heart_df.drop(['target'], axis=1)
y_data = heart_df['target']

In [None]:
from sklearn.model_selection import train_test_split
heart_x_train, heart_x_test, heart_y_train, heart_y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 0)

In [None]:
best_model = find_best_model(heart_x_train, heart_x_test, heart_y_train, heart_y_test)


                 Model  Score
0                  GBM  79.35
1              XGBoost  78.26
2             Catboost  77.72
3  Logistic Regression  77.17
4             LightGBM  76.09
5            HistBoost  76.09
6        Random Forest  72.83
7        Decision Tree  67.39

The Best Model:
GBM


In [None]:
from sklearn.model_selection import GridSearchCV

clf = GradientBoostingClassifier(random_state = 42)

param_grid = {      'max_depth': [10, 15, 20, 25, 30], # 
              'learning_rate': [0.005, 0.01, 0.02, 0.05, 0.1]}

grid = GridSearchCV(clf, param_grid, cv=10, scoring='roc_auc')
grid.fit(heart_x_train,heart_y_train)

print("Best parameters ",grid.best_params_)
print("roc_auc :",grid.best_score_)

grid_best = grid.best_estimator_
grid_best.fit(heart_x_train, heart_y_train)
acc = grid.score(heart_x_test,heart_y_test)*100

print("GBM Algorithm Accuracy Score : {:.2f}%".format(acc))

Best parameters  {'learning_rate': 0.1, 'max_depth': 10}
roc_auc : 0.8388996674057649
GBM Algorithm Accuracy Score : 87.32%


In [None]:
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(random_state = 42)
param_grid = {'n_estimators':[7,8,9,10,11,15,20,50,100,200],
        'class_weight':[None,{0: 0.33,1:0.67}]}

grid = GridSearchCV(clf, param_grid, cv=10, scoring='roc_auc')
grid.fit(heart_x_train,heart_y_train)

print("Best parameters ",grid.best_params_)
print("roc_auc :",grid.best_score_)

grid_best = grid.best_estimator_
grid_best.fit(heart_x_train, heart_y_train)
acc = grid.score(heart_x_test,heart_y_test)*100

#accuracies['Random Forest'] = acc
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(acc))


Best parameters  {'class_weight': {0: 0.33, 1: 0.67}, 'n_estimators': 100}
roc_auc : 0.904620288248337
Random Forest Algorithm Accuracy Score : 89.13%


In [52]:
from sklearn.metrics import roc_auc_score, confusion_matrix
clf = RandomForestClassifier(n_estimators=100, max_features= 'log2', max_depth = 8, criterion = 'entropy' ).fit(heart_x_train, heart_y_train)
    
y_pred = clf.predict(heart_x_test)
y_score= clf.predict_proba(heart_x_test)

roc_auc =  roc_auc_score(heart_y_test, y_score[:,1])

print('roc_auc: ', roc_auc)
print('CM: ', confusion_matrix(heart_y_test, y_pred))

roc_auc:  0.8996763754045307
CM:  [[67 14]
 [14 89]]


In [53]:
import joblib
joblib.dump(clf,'heart_model.pkl')

['heart_model.pkl']

#Kidney_data

In [None]:
kidney_df = pd.read_csv('kidney_disease.csv')
kidney_df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [None]:
data_check(kidney_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [None]:
#yes/no; abnormal/normal;present/notpresent;good/poor都转换为0/1
kidney_df[['htn','dm','cad','pe','ane']] = kidney_df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
kidney_df[['rbc','pc']] = kidney_df[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
kidney_df[['pcc','ba']] = kidney_df[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
kidney_df[['appet']] = kidney_df[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})
kidney_df['classification'] = kidney_df['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})
kidney_df = kidney_df.drop('id', axis=1)
kidney_df.rename(columns={'classification':'target'},inplace=True)
kidney_df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,target
0,48.0,80.0,1.020,1.0,0.0,,0.0,0.0,0.0,121.0,...,44,7800,5.2,1.0,1,0,1.0,0.0,0.0,1.0
1,7.0,50.0,1.020,4.0,0.0,,0.0,0.0,0.0,,...,38,6000,,0.0,0,0,1.0,0.0,0.0,1.0
2,62.0,80.0,1.010,2.0,3.0,0.0,0.0,0.0,0.0,423.0,...,31,7500,,0.0,1,0,0.0,0.0,1.0,1.0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,...,32,6700,3.9,1.0,0,0,0.0,1.0,1.0,1.0
4,51.0,80.0,1.010,2.0,0.0,0.0,0.0,0.0,0.0,106.0,...,35,7300,4.6,0.0,0,0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,140.0,...,47,6700,4.9,0.0,0,0,1.0,0.0,0.0,0.0
396,42.0,70.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,75.0,...,54,7800,6.2,0.0,0,0,1.0,0.0,0.0,0.0
397,12.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,100.0,...,49,6600,5.4,0.0,0,0,1.0,0.0,0.0,0.0
398,17.0,60.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,114.0,...,51,7200,5.9,0.0,0,0,1.0,0.0,0.0,0.0


In [None]:
kidney_df['pe'] = kidney_df['pe'].replace(to_replace='good',value=0)
kidney_df['appet'] = kidney_df['appet'].replace(to_replace='no',value=0)
kidney_df['cad'] = kidney_df['cad'].replace(to_replace='\tno',value=0)
kidney_df['dm'] = kidney_df['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
kidney_df = kidney_df.replace(to_replace={'.*?.*':np.nan})
kidney_df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,target
0,48.0,80.0,1.020,1.0,0.0,,0.0,0.0,0.0,121.0,...,44,7800,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,7.0,50.0,1.020,4.0,0.0,,0.0,0.0,0.0,,...,38,6000,,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,62.0,80.0,1.010,2.0,3.0,0.0,0.0,0.0,0.0,423.0,...,31,7500,,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,...,32,6700,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1.0
4,51.0,80.0,1.010,2.0,0.0,0.0,0.0,0.0,0.0,106.0,...,35,7300,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,140.0,...,47,6700,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0.0
396,42.0,70.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,75.0,...,54,7800,6.2,0.0,0.0,0.0,1.0,0.0,0.0,0.0
397,12.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,100.0,...,49,6600,5.4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
398,17.0,60.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,114.0,...,51,7200,5.9,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#查缺值
print(kidney_df.isnull().sum())
print(kidney_df[kidney_df['age'].isnull()])
#刪除缺值
kidney_df.dropna(how='any').shape  #how='all'

In [None]:
for i in kidney_df:
    #print(i)
    for x in range(len(kidney_df[i])):
        #print(kidney_df[i][x])
        try:
            kidney_df[i][x] = float(kidney_df[i][x])
            #print(i ,kidney_df[i][x], type(kidney_df[i][x]))
        except:
            kidney_df[i][x] = np.nan
            #pass


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
for i in kidney_df:
    kidney_df.loc[kidney_df['target']==1, i] = kidney_df.loc[kidney_df['target']==1, i].fillna(kidney_df.loc[kidney_df['target']==1, i].mean(), inplace=False)
    kidney_df.loc[kidney_df['target']==0, i] = kidney_df.loc[kidney_df['target']==0, i].fillna(kidney_df.loc[kidney_df['target']==0, i].mean(), inplace=False)

In [None]:
print(kidney_df.isnull().sum())

age       0
bp        0
sg        0
al        0
su        0
rbc       0
pc        0
pcc       0
ba        0
bgr       0
bu        0
sc        0
sod       0
pot       0
hemo      0
pcv       0
wc        0
rc        0
htn       0
dm        0
cad       0
appet     0
pe        0
ane       0
target    0
dtype: int64


In [None]:
x_data = kidney_df.drop(['target'], axis=1)
y_data = kidney_df['target']
kidney_x_train, kidney_x_test, kidney_y_train,  kidney_y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 0)

In [None]:
kidney_x_train

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane
336,25.000000,60.0,1.020000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,119.000000,...,15.200000,40.0,9200.0,5.2,0.0,0.0,0.0,1.0,0.0,0.0
64,55.000000,80.0,1.010000,0.000000,0.00000,0.439252,0.000000,0.0,0.0,146.000000,...,9.800000,32.939891,9069.536424,3.945238,0.0,0.0,0.0,1.0,0.0,0.0
55,35.000000,80.0,1.005000,3.000000,0.00000,1.000000,0.000000,0.0,0.0,175.419811,...,9.500000,28.0,9069.536424,3.945238,0.0,0.0,0.0,1.0,1.0,0.0
106,50.000000,90.0,1.013918,1.722488,0.76699,0.439252,0.391753,0.0,0.0,89.000000,...,6.000000,17.0,6500.0,3.945238,1.0,1.0,0.0,1.0,1.0,1.0
300,45.000000,60.0,1.020000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,114.000000,...,15.000000,43.0,9200.0,5.8,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,43.000000,80.0,1.025000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,130.000000,...,15.900000,45.0,7800.0,4.5,0.0,0.0,0.0,1.0,0.0,0.0
192,46.000000,110.0,1.015000,0.000000,0.00000,0.439252,0.000000,0.0,0.0,130.000000,...,10.647549,32.939891,9069.536424,3.945238,0.0,0.0,0.0,1.0,0.0,0.0
117,54.541322,70.0,1.020000,0.000000,0.00000,0.439252,0.391753,0.0,0.0,219.000000,...,12.500000,37.0,9800.0,4.4,0.0,0.0,0.0,1.0,0.0,0.0
47,11.000000,80.0,1.010000,3.000000,0.00000,0.439252,0.000000,0.0,0.0,175.419811,...,15.000000,45.0,8600.0,3.945238,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
find_best_model(kidney_x_train, kidney_x_test, kidney_y_train,  kidney_y_test)

                 Model  Score
0        Decision Tree  100.0
1        Random Forest  100.0
2              XGBoost  100.0
3                  GBM  100.0
4             LightGBM  100.0
5             Catboost  100.0
6            HistBoost  100.0
7  Logistic Regression   97.5

The Best Model:
Decision Tree


'Decision Tree'

In [None]:
tuned_parameters= [{'n_estimators':[7,8,9,10,11,12,13,14,15,16],'max_depth':[2,3,4,5,6,None],'class_weight':[None,{0: 0.33,1:0.67},'balanced'],'random_state':[42]}]
clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=10,scoring='f1')
clf.fit(kidney_x_train, kidney_y_train)
print('Best parameters:')
print(clf.best_params_)

accuracies = {}
rf = RandomForestClassifier(class_weight=None, max_depth= 6,n_estimators = 7, random_state = 42)
rf.fit(kidney_x_train, kidney_y_train)
acc = rf.score(kidney_x_test, kidney_y_test)*100
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(acc))


Best parameters:
{'class_weight': None, 'max_depth': 4, 'n_estimators': 16, 'random_state': 42}
Random Forest Algorithm Accuracy Score : 100.00%


In [50]:
from sklearn.metrics import roc_auc_score, confusion_matrix
clf = RandomForestClassifier(n_estimators=100, max_features= 'log2', max_depth = 8, criterion = 'entropy' ).fit(kidney_x_train, kidney_y_train)
    
y_pred = clf.predict(kidney_x_test)
y_score= clf.predict_proba(kidney_x_test)

roc_auc =  roc_auc_score(kidney_y_test, y_score[:,1])

print('roc_auc: ', roc_auc)
print('CM: ', confusion_matrix(kidney_y_test, y_pred))

roc_auc:  1.0
CM:  [[28  0]
 [ 0 52]]


In [51]:
import joblib
joblib.dump(clf,'kidney_model.pkl')

['kidney_model.pkl']