In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 1.파일불러오기

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df_count = df[df != 0]
df_count.apply(lambda x: x.count())

Pregnancies                 657
Glucose                     763
BloodPressure               733
SkinThickness               541
Insulin                     394
BMI                         757
DiabetesPedigreeFunction    768
Age                         768
Outcome                     268
dtype: int64

# 2.전처리
#### 2-1.Pregnancies와 Outcome을 제외한 0값들을 NaN으로 대체하기

In [4]:
list_columns = list(df.columns)
list_columns.remove('Pregnancies')
list_columns.remove('Outcome')
for target in list_columns:
    df.replace({target:0}, {target:np.nan}, inplace=True)

#### 2-2.NaN값이 2개 이상의 칼럼에 있을경우 삭제

In [5]:
allow_nan = 1
df = df.dropna(thresh=len(df.columns) - allow_nan)

#### 2-3.인덱스 구멍 메우기

In [6]:
df.index = range(1, len(df) + 1)

In [7]:
df.head(100)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148.0,72.0,35.0,,33.6,0.627,50,1
2,1,85.0,66.0,29.0,,26.6,0.351,31,0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
6,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
7,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1
8,5,166.0,72.0,19.0,175.0,25.8,0.587,51,1
9,0,118.0,84.0,47.0,230.0,45.8,0.551,31,1
10,1,103.0,30.0,38.0,83.0,43.3,0.183,33,0


In [8]:
df.apply(lambda x : x.isnull().sum())

Pregnancies                   0
Glucose                       1
BloodPressure                 0
SkinThickness                 0
Insulin                     140
BMI                           1
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

# 3.BloodPressure 의 0인 값처리

In [9]:
category1 = df[["BloodPressure", "Age"]]
category1_dropna = category1.dropna()
lm1 = LinearRegression()
# lm1.fit(np.array(category1_dropna.Age).reshape(-1, 1), np.array(category1_dropna.BloodPressure).reshape(-1, 1))
X = category1_dropna[["Age"]]
y = category1_dropna[["BloodPressure"]]
X_train, X_test, y_train, y_test = train_test_split(X, y)
lm1.fit(X_train, y_train)
print("처리전 :", df["BloodPressure"].isnull().sum())
for index in range(1, len(df) + 1):
    check_row = df.loc[index].isna()
    if (check_row.BloodPressure == True) & (check_row.Age == False):
        df.loc[index, "BloodPressure"] = lm1.predict([[df.loc[index, "Age"]]])[0][0]
print("처리후 :", df["BloodPressure"].isnull().sum())

처리전 : 0
처리후 : 0


# 4.BMI의 0인 값처리

In [10]:
#BMI의 0인 값처리
category2 = df[["BMI", "Age", "BloodPressure"]]
category2_dropna = category2.dropna()
lm2 = LinearRegression()
X = category2_dropna[["Age", "BloodPressure"]]
y = category2_dropna[["BMI"]]
X_train, X_test, y_train, y_test = train_test_split(X, y)
lm2.fit(X_train, y_train)
print("처리전 :", df["BMI"].isnull().sum())
for index in range(1, len(df) + 1):
    check_row = df.loc[index].isna()
    if (check_row.BMI == True) & (check_row.BloodPressure == False) & (check_row.Age == False):
        df.loc[index, "BMI"] = lm2.predict([[df.loc[index, "Age"], df.loc[index, "BloodPressure"]]])[0]
print("처리후 :", df["BMI"].isnull().sum())

처리전 : 1
처리후 : 0


# 5. Insulin 의 0인 값처리

In [11]:
#Insulin 의 0인 값처리
category3 = df[["DiabetesPedigreeFunction", "Insulin"]]
category3_dropna = category3.dropna()
lm3 = LinearRegression()
X = category1_dropna[["Age"]]
y = category1_dropna[["BloodPressure"]]
X_train, X_test, y_train, y_test = train_test_split(X, y)
lm3.fit(X_train, y_train)
print("처리전 :", df["Insulin"].isnull().sum())
for index in range(1, len(df) + 1):
    check_row = df.loc[index].isna()
    if (check_row.Insulin == True) & (check_row.DiabetesPedigreeFunction == False):
        df.loc[index, "Insulin"] = lm3.predict([[df.loc[index, "DiabetesPedigreeFunction"]]])[0][0]
print("처리후 :", df["Insulin"].isnull().sum())

처리전 : 140
처리후 : 0


# 6.이외의  컬럼은 최빈값으로 처리

In [12]:
#이외의  컬럼은 최빈값으로 처리
print("처리전 :")
print(df.apply(lambda x: x.isnull().sum()))
df = df.apply(lambda x: x.fillna(x.mode()[0]))
print("\n처리후 :")
print(df.apply(lambda x: x.isnull().sum()))

처리전 :
Pregnancies                 0
Glucose                     1
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

처리후 :
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


# ※Outcome에 대한 연관정도

In [13]:
df.apply(lambda series: df["Outcome"].corr(series))

Pregnancies                 0.254352
Glucose                     0.503948
BloodPressure               0.183728
SkinThickness               0.256301
Insulin                     0.238539
BMI                         0.301653
DiabetesPedigreeFunction    0.226167
Age                         0.316834
Outcome                     1.000000
dtype: float64

## 7.결과값으로 Outcome가 나올수 있도록 로지스틱 회귀분석

In [14]:
list_columns = list(df.columns)
list_columns.remove('Outcome')
X = df[list_columns]
y = df["Outcome"]
# model = LogisticRegression()
model = LogisticRegression(solver='liblinear')
X_train, X_test, y_train, y_test = train_test_split(X, y)
model.fit(X_train, y_train)
model.predict(X_test)
taget_val = model.predict(X)


In [15]:
taget = pd.Series(taget_val, index = df.index)
taget

1      1
2      0
3      0
4      1
5      0
6      1
7      1
8      1
9      0
10     1
11     0
12     0
13     0
14     1
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     0
23     0
24     0
25     1
26     1
27     0
28     1
29     1
30     0
      ..
505    0
506    0
507    0
508    0
509    1
510    0
511    0
512    0
513    0
514    0
515    1
516    0
517    0
518    1
519    0
520    0
521    0
522    1
523    0
524    0
525    0
526    1
527    0
528    0
529    0
530    1
531    1
532    0
533    0
534    0
Length: 534, dtype: int64

In [16]:
df.apply(lambda x : x.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [17]:
taget.corr(df.Outcome)

0.4725965960586082

In [20]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148.0,72.0,35.0,59.574287,33.6,0.627,50,1
2,1,85.0,66.0,29.0,59.470963,26.6,0.351,31,0
3,1,89.0,66.0,23.0,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.000000,43.1,2.288,33,1
5,3,78.0,50.0,32.0,88.000000,31.0,0.248,26,1
6,2,197.0,70.0,45.0,543.000000,30.5,0.158,53,1
7,1,189.0,60.0,23.0,846.000000,30.1,0.398,59,1
8,5,166.0,72.0,19.0,175.000000,25.8,0.587,51,1
9,0,118.0,84.0,47.0,230.000000,45.8,0.551,31,1
10,1,103.0,30.0,38.0,83.000000,43.3,0.183,33,0
