In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Q1:檔案讀入

In [None]:
# Load the health-check records; the path is resolved relative to the
# current working directory.
health_data = pd.read_csv(filepath_or_buffer="HealthCheck.csv")

In [None]:
# Show the loaded table as the cell's result so Jupyter renders it with its
# rich table representation instead of the plain-text form produced by print().
health_data

      Race  Height_cm  Weight_kg Over70yr
0    White      186.0       90.0      Yes
1  African      185.0       98.0       No
2    Asian      175.0       80.0       No
3  African      170.0       75.0      Yes
4    Asian      164.0        NaN       No
5    Asian      170.0       72.0      Yes
6    White      178.0       75.0       No
7    White        NaN       89.0      Yes
8  African      165.0       79.0      Yes


# Q2:「自變數 X」與「應變數 Y」切分


In [None]:
# Split the table into features and target.
# X: every column except the last one; Y: the last column.
# Both slices are expressed relative to the end of the frame so they stay
# consistent with each other, and .to_numpy() is used so the result is always
# a NumPy array regardless of the column dtype.
X = health_data.iloc[:, :-1].to_numpy()
Y = health_data.iloc[:, -1].to_numpy()
print("自變數X:\n",X)
print("應變數Y:\n",Y)

自變數X:
 [['White' 186.0 90.0]
 ['African' 185.0 98.0]
 ['Asian' 175.0 80.0]
 ['African' 170.0 75.0]
 ['Asian' 164.0 nan]
 ['Asian' 170.0 72.0]
 ['White' 178.0 75.0]
 ['White' nan 89.0]
 ['African' 165.0 79.0]]
應變數Y:
 ['Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes']


# Q3:填補缺失資料

In [None]:
# First check, column by column, whether any value is missing.
health_data.isnull().any(axis=0)

Unnamed: 0,0
Race,False
Height_cm,True
Weight_kg,True
Over70yr,False


In [None]:
# Fill the missing values in columns 1 and 2 of X with each column's mean.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed later in this notebook.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan).fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)

[['White' 186.0 90.0]
 ['African' 185.0 98.0]
 ['Asian' 175.0 80.0]
 ['African' 170.0 75.0]
 ['Asian' 164.0 82.25]
 ['Asian' 170.0 72.0]
 ['White' 178.0 75.0]
 ['White' 174.125 89.0]
 ['African' 165.0 79.0]]


# Q4:類別資料數位化

In [None]:
# One-hot encode the categorical feature and rebuild X as a float matrix.
#
# The category column is read from the first column of health_data and the
# numeric columns are taken as the LAST two columns of X. Those are the same
# values as X[:, 0] and X[:, 1:3] on the first run, but they remain correct
# once X has already been encoded, so re-running this cell produces the same
# X instead of silently encoding the dummy columns a second time.
# dtype is given explicitly so the dummy columns are floats rather than
# depending on the pandas default.
ary_dum = pd.get_dummies(health_data.iloc[:, 0], dtype="float64").to_numpy()
X = np.concatenate((ary_dum, X[:, -2:]), axis=1).astype("float64")
print(X)

[[  0.      0.      1.    186.     90.   ]
 [  1.      0.      0.    185.     98.   ]
 [  0.      1.      0.    175.     80.   ]
 [  1.      0.      0.    170.     75.   ]
 [  0.      1.      0.    164.     82.25 ]
 [  0.      1.      0.    170.     72.   ]
 [  0.      0.      1.    178.     75.   ]
 [  0.      0.      1.    174.125  89.   ]
 [  1.      0.      0.    165.     79.   ]]


In [None]:
# Encode the target labels as integer class codes, then store them as floats.
labelEncoder = LabelEncoder()
encoded_labels = labelEncoder.fit_transform(Y)
Y = encoded_labels.astype(np.float64)
print(Y)

[1. 0. 0. 1. 0. 1. 0. 1. 1.]


# Q5:切分測試集

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

# Q6:特徵縮放

In [None]:
# Standardize the features: the scaler is fitted on the training set only and
# the same fitted scaler is then applied to both the training and test sets.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

# Q7:印出結果

In [None]:
# Print each split under its heading: features/target for the training set,
# then features/target for the test set. Every array is followed by the same
# trailing "\n" argument so the sections are separated by a blank line.
for heading, values in (
    ("自變數訓練集:", X_train),
    ("應變數訓練集:", Y_train),
    ("自變數測試集:", X_test),
    ("應變數測試集:", Y_test),
):
    print(heading)
    print(values,"\n")

自變數訓練集:
[[ 1.15470054 -0.63245553 -0.63245553  1.30941777  1.89111602]
 [-0.8660254   1.58113883 -0.63245553 -1.19037979  0.07416141]
 [ 1.15470054 -0.63245553 -0.63245553 -1.07134181 -0.30076573]
 [-0.8660254  -0.63245553  1.58113883  0.47615192 -0.76221452]
 [ 1.15470054 -0.63245553 -0.63245553 -0.47615192 -0.76221452]
 [-0.8660254  -0.63245553  1.58113883  1.42845575  0.96821844]
 [-0.8660254   1.58113883 -0.63245553 -0.47615192 -1.10830111]] 

應變數訓練集:
[0. 0. 1. 0. 1. 1. 1.] 

自變數測試集:
[[-0.8660254  -0.63245553  1.58113883  0.01487975  0.85285625]
 [-0.8660254   1.58113883 -0.63245553  0.11903798 -0.18540353]] 

應變數測試集:
[1. 0.] 

