In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# !pip install missingno
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

def _format_three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Notebook-wide pandas display settings: no limit on the number of columns or
# rows shown, floats printed with three decimals, and a 500-character width.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.float_format': _format_three_decimals,
    'display.width': 500,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

The dataset is part of a larger dataset held at the National Institute of Diabetes and Digestive and Kidney Diseases in the USA. It was collected for diabetes research on Pima Indian women living in Phoenix, the 5th largest city in the State of Arizona in the USA. The target variable is `Outcome`: 1 indicates a positive diabetes test result, 0 indicates a negative one.

In [4]:
def read_data(path="diabetes.csv"):
    """Load the diabetes dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "diabetes.csv"
        Location of the CSV file. The default keeps the original behaviour
        of reading ``diabetes.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(path)

In [5]:
# Load the dataset and preview its first five rows.
df = read_data()
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Summary statistics, transposed so that each variable is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845,3.37,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.895,31.973,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105,19.356,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536,15.952,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799,115.244,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.993,7.884,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.472,0.331,0.078,0.244,0.372,0.626,2.42
Age,768.0,33.241,11.76,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.349,0.477,0.0,0.0,0.0,1.0,1.0


In [11]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Capture the numeric and categorical variables. Perform target variable analysis (the mean of the target variable for each categorical variable, and the mean of each numeric variable for each value of the target variable).

In [28]:
def grab_colname(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into numeric and categorical names.

    A column is treated as numeric when pandas reports a numeric dtype for it
    (checked with ``pd.api.types.is_numeric_dtype`` rather than by comparing
    the dtype to ``"O"``, so category, string and datetime columns are not
    mistaken for numbers).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        Numeric columns with fewer than ``cat_th`` distinct values are
        reported as categorical.
    car_th : int, default 20
        Non-numeric columns with more than ``car_th`` distinct values are
        considered high-cardinality and are left out of both returned lists.

    Returns
    -------
    num_cols : list
        Numeric columns with at least ``cat_th`` distinct values.
    cat_cols : list
        Non-numeric columns (up to ``car_th`` distinct values) followed by
        the low-cardinality numeric columns, each in column order.
    """
    num_cols, num_but_cats = [], []
    cat_cols, cat_but_car = [], []
    for col in dataframe.columns:
        n_unique = dataframe[col].nunique()
        if pd.api.types.is_numeric_dtype(dataframe[col]):
            # Few distinct numeric values (e.g. a 0/1 flag) behave like categories.
            if n_unique < cat_th:
                num_but_cats.append(col)
            else:
                num_cols.append(col)
        elif n_unique > car_th:
            cat_but_car.append(col)
        else:
            cat_cols.append(col)
    cat_cols = cat_cols + num_but_cats

    print(f'Observation:{dataframe.shape[0]}')
    print(f'Variables:{dataframe.shape[1]}')
    print(f'num_cols:{len(num_cols)}')
    print(f'cat_cols:{len(cat_cols)}')
    if cat_but_car:
        # Only reported when something was excluded, so the usual output is unchanged.
        print(f'cat_but_car:{len(cat_but_car)}')
    return num_cols, cat_cols

In [30]:
# Classify the columns once; `num_cols` is reused below for the per-Outcome
# means and for standardisation.
num_cols , cat_cols = grab_colname(df)

Observation:768
Variables:9
num_cols:8
cat_cols:1


In [33]:
# Mean of every numeric variable within each Outcome class, one table per variable.
outcome_groups = df.groupby("Outcome")
for col in num_cols:
    print(outcome_groups[[col]].mean())

         Pregnancies
Outcome             
0              3.298
1              4.866
         Glucose
Outcome         
0        109.980
1        141.257
         BloodPressure
Outcome               
0               68.184
1               70.825
         SkinThickness
Outcome               
0               19.664
1               22.164
         Insulin
Outcome         
0         68.792
1        100.336
           BMI
Outcome       
0       30.304
1       35.143
         DiabetesPedigreeFunction
Outcome                          
0                           0.430
1                           0.550
           Age
Outcome       
0       31.190
1       37.067


Perform outlier observation analysis.

In [55]:
def outlier_tresholds(dataframe,columns,q1=0.25,q3=0.75):
    """Return the lower and upper outlier limits for ``columns``.

    The limits are the ``q1`` and ``q3`` quantiles pushed outwards by 1.5
    times the distance between them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data.
    columns : label or list of labels
        Column selection passed to ``dataframe[...]``.
    q1, q3 : float
        Lower and upper quantile levels (defaults 0.25 and 0.75).

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    selected = dataframe[columns]
    lower_quantile = selected.quantile(q1)
    upper_quantile = selected.quantile(q3)
    margin = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - margin, upper_quantile + margin

In [56]:
def check_outlier(dataframe,col_name):
    """Tell whether ``col_name`` holds any value outside its outlier limits.

    The limits come from ``outlier_tresholds`` with its default quantiles.
    The answer is based on the comparison mask itself rather than on the
    truthiness of the selected rows, so an outlying value of 0 (or a row made
    up of zeros) is still detected. Missing values compare as False and are
    never counted as outliers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data.
    col_name : label
        Column to inspect.

    Returns
    -------
    bool
        True when at least one value is below the lower limit or above the
        upper limit, otherwise False.
    """
    low, up = outlier_tresholds(dataframe, col_name)
    values = dataframe[col_name]
    outside = (values < low) | (values > up)
    # to_numpy() lets the same reduction work for a Series or a DataFrame mask.
    return bool(outside.to_numpy().any())


In [57]:
# Report, column by column, whether check_outlier finds a value outside the limits.
for col in df.columns:
    has_outlier = check_outlier(df, col)
    print(f'columns:{col}', has_outlier)

columns:Pregnancies True
columns:Glucose True
columns:BloodPressure True
columns:SkinThickness True
columns:Insulin True
columns:BMI True
columns:DiabetesPedigreeFunction True
columns:Age True
columns:Outcome False


Perform a missing observation analysis.

In [63]:
def missing_values(dataframe):
    """Print a missing-value summary and return the affected column names.

    The printed table has one row per column that contains at least one
    null, with the null count (``n_miss``) and the percentage of rows that
    are null (``ratio``, rounded to two decimals), each sorted in descending
    order before being combined.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns containing nulls, in column order.
    """
    na_cols = []
    for col in dataframe.columns:
        if dataframe[col].isnull().sum() > 0:
            na_cols.append(col)

    null_counts = dataframe[na_cols].isnull().sum()
    n_rows = dataframe.shape[0]
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / n_rows * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")
    return na_cols
    


In [64]:
# Print the missing-value table; the returned list of affected columns is the cell output.
missing_values(df)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


[]

In [65]:
# Preview the first rows again; Insulin and SkinThickness contain 0 entries here.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Take the necessary actions for missing and outlier values. There are no missing observations in the data set, but observations with a value of 0 in variables such as Glucose and Insulin may represent missing values. For example, a person's glucose or insulin value cannot be 0. With this in mind, replace the zero values in the relevant variables with NaN and then apply the missing-value operations.

In [135]:
# Variables in which a value of 0 is treated as "missing". Pregnancies is not
# in the list, so its zeros are kept. DiabetesPedigreeFunction and Age have no
# zeros in the summary statistics above, so only their dtype changes.
ZERO_AS_MISSING_COLS = ["Glucose", "BloodPressure", "SkinThickness", "Insulin",
                        "DiabetesPedigreeFunction", "BMI", "Age"]

# Cast to float before inserting NaN: writing NaN into an int64 column through
# .loc relies on implicit upcasting, which recent pandas versions deprecate.
# replace() handles all listed columns at once, and re-running the cell is harmless.
df[ZERO_AS_MISSING_COLS] = df[ZERO_AS_MISSING_COLS].astype(float).replace(0, np.nan)

In [137]:
# Check the replacement: the former 0 entries now appear as missing values.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0
2,8,183.0,64.0,,,23.3,0.672,32.0,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


#### Create new variables.

In [147]:
# Ages from 21 up to (but not including) 51 are labelled "Mature".
is_mature = (df["Age"] >= 21) & (df["Age"] < 51)
df.loc[is_mature, "NEW_AGE"] = "Mature"



In [149]:
# Ages of 51 and above are labelled "Senior".
is_senior = df["Age"] >= 51
df.loc[is_senior, "NEW_AGE"] = "Senior"

In [150]:
# Show the first ten rows, including the new NEW_AGE column.
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NEW_AGE
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1,Mature
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0,Mature
2,8,183.0,64.0,,,23.3,0.672,32.0,1,Mature
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0,Mature
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1,Mature
5,5,116.0,74.0,,,25.6,0.201,30.0,0,Mature
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1,Mature
7,10,115.0,,,,35.3,0.134,29.0,0,Mature
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53.0,1,Senior
9,8,125.0,96.0,,,,0.232,54.0,1,Senior


#### Perform encoding operations

In [152]:
# Encoder that is fitted on the NEW_AGE labels in the next cell.
le = LabelEncoder()


In [155]:
# Fit the encoder on the age groups and store the integer codes in a new
# column, so the encoding is kept instead of being computed and thrown away.
# NEW_AGE itself is left untouched, which keeps the cell safe to re-run.
df["NEW_AGE_ENCODED"] = le.fit_transform(df["NEW_AGE"])
# Show which label each of the codes 0 and 1 stands for.
le.inverse_transform([0,1])

array(['Mature', 'Senior'], dtype=object)

#### Standardize for numeric variables

In [158]:
# Standardise every column in num_cols, overwriting the values in df.
ss = StandardScaler()
ss.fit(df[num_cols])
df[num_cols] = ss.transform(df[num_cols])
df[num_cols].head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.64,0.862,-0.033,0.559,,0.165,0.468,1.426
1,-0.845,-1.202,-0.518,-0.015,,-0.846,-0.365,-0.191
2,1.234,2.009,-0.679,,,-1.323,0.604,-0.106
3,-0.845,-1.071,-0.518,-0.588,-0.519,-0.63,-0.921,-1.042
4,-1.142,0.502,-2.619,0.559,0.105,1.538,5.485,-0.02


In [159]:
# NOTE(review): "Age" is one of the num_cols columns overwritten with
# standardised values by the previous cell, so this column is derived from
# already-scaled ages rather than raw ones. Confirm whether raw ages were intended.
ss = StandardScaler()
ss.fit(df[["Age"]])
df["Age_standard_Scaler"] = ss.transform(df[["Age"]])