In [63]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, normalize, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [64]:
#Read and Print diabes.csv out screen
df = pd.read_csv('diabetes.csv')
print(df.head().to_string())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66             23       94  28.1                     0.167   21        0
4            0      137             40             35      168  43.1                     2.288   33        1


In [65]:
#Drop Columns
x = df.drop(columns=['Outcome'], axis=1)
x_copy=x.copy()
y = df['Outcome']
y_copy=y.copy()

In [66]:
#Print missing data
print("Print the missing value contains ",df.isnull().sum())
feature_mising_value = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
x[feature_mising_value] = x[feature_mising_value].replace(0,np.nan)
print("Print null data",x.isnull().sum())

Print the missing value contains  Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Print null data Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64


In [67]:
#Imputer
imputer = SimpleImputer(missing_values=np.nan,strategy='median')
x=imputer.fit_transform(x)
x=pd.DataFrame(x,columns=x_copy.columns)
print(x.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64


In [68]:
#outliner
for col in x.columns:
    lower_limit=x[col].mean()-3*x[col].std()
    upper_limit=x[col].mean()+3*x[col].std()
    print(f'lower_limit of {col} is: {lower_limit} ')
    print(f'upper_limit of {col} is: {upper_limit} ')
    for index in range(len(x.index) - 1):
        if x.loc[index,col] > upper_limit or x.loc[index,col] < lower_limit:
            x.loc[index, col] = np.nan
    num_outlier = x[col].isnull().sum()
    print(f'num of outlier for {col} is: {num_outlier}')
imputer=SimpleImputer(missing_values=np.nan,strategy='median')
x=imputer.fit_transform(x)
x=pd.DataFrame(x,columns=x_copy.columns)

lower_limit of Pregnancies is: -6.263682104763275 
upper_limit of Pregnancies is: 13.953786271429943 
num of outlier for Pregnancies is: 4
lower_limit of Glucose is: 30.341392532754483 
upper_limit of Glucose is: 212.97110746724553 
num of outlier for Glucose is: 0
lower_limit of BloodPressure is: 36.096793548065584 
upper_limit of BloodPressure is: 108.67664395193441 
num of outlier for BloodPressure is: 8
lower_limit of SkinThickness is: 2.734409847397547 
upper_limit of SkinThickness is: 55.481735985935785 
num of outlier for SkinThickness is: 4
lower_limit of Insulin is: -118.47730407954299 
upper_limit of Insulin is: 399.821054079543 
num of outlier for Insulin is: 20
lower_limit of BMI is: 11.829677879090347 
upper_limit of BMI is: 53.080738787576315 
num of outlier for BMI is: 5
lower_limit of DiabetesPedigreeFunction is: -0.5221094829549915 
upper_limit of DiabetesPedigreeFunction is: 1.4658620871216579 
num of outlier for DiabetesPedigreeFunction is: 11
lower_limit of Age is: 

In [69]:
#Feature Extraction
x['Pregnancies/Age']= x['Pregnancies']/x['Age']

x.loc[(x['BMI']<18.5), 'BMI_Range'] = 'underweight'
x.loc[(x["BMI"]>= 18.5) & (x['BMI']<24.9), 'BMI_Range'] = 'HealthyWeight'
x.loc[(x["BMI"]>=24.9) & (x['BMI']<29.9), 'BMI_Range']= 'overweight'
x.loc[(x["BMI"]>=29.9), 'BMI_Range']= 'obese'

x.loc[(x['Age']<25), 'Age_Range']= 'young'
x.loc[(x["Age"]>= 25) & (x['Age']<40), 'Age_Range']= 'middle'
x.loc[(x["Age"]>=40), 'Age_Range']= 'old'

x.loc[(x["Glucose"] < 70), 'Glucose_Range'] ="Hipoglisemi"
x.loc[(x["Glucose"] >= 70) & (x['Glucose'] < 100) , 'Glucose_Range'] ="Normal"
x.loc[(x["Glucose"] >= 100) & (x['Glucose'] < 125) , 'Glucose_Range'] ="Imparied_Glucose"
x.loc[(x["Glucose"] >= 125), 'Glucose_Range'] ="Hiperglisemi"

x['BMI/Glucose']= x['BMI']/x['Glucose']
x['Insulin/Glucose']= x['Insulin']/x['Glucose']

x_extraction= x

  x.loc[(x['BMI']<18.5), 'BMI_Range'] = 'underweight'
  x.loc[(x['Age']<25), 'Age_Range']= 'young'
  x.loc[(x["Glucose"] < 70), 'Glucose_Range'] ="Hipoglisemi"


In [70]:
print(x_extraction)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6.0    148.0           72.0           35.0    125.0  33.6   
1            1.0     85.0           66.0           29.0    125.0  26.6   
2            8.0    183.0           64.0           29.0    125.0  23.3   
3            1.0     89.0           66.0           23.0     94.0  28.1   
4            0.0    137.0           40.0           35.0    168.0  43.1   
..           ...      ...            ...            ...      ...   ...   
763         10.0    101.0           76.0           48.0    180.0  32.9   
764          2.0    122.0           70.0           27.0    125.0  36.8   
765          5.0    121.0           72.0           23.0    112.0  26.2   
766          1.0    126.0           60.0           29.0    125.0  30.1   
767          1.0     93.0           70.0           31.0    125.0  30.4   

     DiabetesPedigreeFunction   Age  Pregnancies/Age      BMI_Range Age_Range  \
0                       0.627 

In [71]:
#Encoding
orinal_encoder = OrdinalEncoder()
x[['BMI_Range', 'Age_Range', 'Glucose_Range']]=orinal_encoder.fit_transform(x[['BMI_Range', 'Age_Range', 'Glucose_Range' ]])
print("Orinal Encoder\n",x)

In [72]:
#Normalization
x=normalize(x,norm='l1',axis=0)
x=pd.DataFrame(x,columns=x_extraction.columns)
print("Normalization\n",x)

In [73]:
#Split the test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [74]:
#Standarization
scaler_ti = StandardScaler()
x_train[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']] = scaler_ti.fit_transform(x_train[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']])
x_test[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']] = scaler_ti.fit_transform(x_test[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']])

In [75]:
ppn_clf = Perceptron(eta0=0.01,max_iter=100,random_state=1)
ppn_clf.fit(x_train,y_train)

ValueError: could not convert string to float: 'obese'

In [76]:
cross_val_score(ppn_clf, x_train, y_train, cv=3, scoring='accuracy')
y_pred = ppn_clf.predict(x_test)
y_score_ppn = cross_val_predict(ppn_clf, x_train, y_train, cv=3, method='decision_function')
report = classification_report(y_test,y_pred)

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 907, in fit
    return self._fit(
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 694, in _fit
    self._partial_fit(
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 586, in _partial_fit
    X, y = self._validate_data(
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1146, in check_X_y
    X = check_array(
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/home/anhdung/.local/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/home/anhdung/.local/lib/python3.10/site-packages/pandas/core/generic.py", line 2084, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'obese'


In [77]:
print("Report",report)

Report               precision    recall  f1-score   support

           0       0.87      0.62      0.72       146
           1       0.56      0.84      0.67        85

    accuracy                           0.70       231
   macro avg       0.71      0.73      0.69       231
weighted avg       0.75      0.70      0.70       231



In [62]:
print(f'roc_auc_score of perceptron {roc_auc_score(y_train, y_score_ppn)}')
print(f'accuracy score of perceptron {accuracy_score(y_test,y_pred)}')

roc_auc_score of perceptron 0.6496712049643419
accuracy score of perceptron 0.696969696969697
