### 지도학습(Supervised Learning)
- 목표변수(= target, Y)가 있는 학습법

In [4]:
import pandas as pd

In [5]:
# Load the Breast Cancer Wisconsin dataset and preview its first two rows.
df_BCD = pd.read_csv('../../../datasets/BreastCancerWisconsinDataSet.csv')
df_BCD.head(2)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,


#### 목표변수 설정 - 연속형
- 목표변수 : radius_mean
- 설명변수 : drop columns(radius_mean, id, diagnosis, Unnamed: 32) — 단, 아래 실습에서는 남은 컬럼 중 texture_mean, perimeter_mean 두 개만 사용
    * radius_mean은 목표변수라서 drop
    * id는 단순 식별자라서 값의 분포에 의미가 없으므로 drop
    * diagnosis는 범주형이라 drop
    * Unnamed: 32 는 null이라 drop

In [6]:
# Print per-column non-null counts and dtypes to spot missing values and non-numeric columns.
df_BCD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [7]:
# List all column names so the target and feature columns can be picked by name.
df_BCD.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

#### PreProcessing

##### 목표변수와 설명변수 추출

In [8]:
# Keep only the target column (radius_mean) and the two feature columns used below.
selected_columns = ['radius_mean', 'texture_mean', 'perimeter_mean']
df_BCD_extract = df_BCD[selected_columns]
df_BCD_extract.head(2)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean
0,17.99,10.38,122.8
1,20.57,17.77,132.9


##### 결측치나 이상치 확인

In [9]:
# Count missing values in each selected column.
df_BCD_extract.isna().sum()

radius_mean       0
texture_mean      0
perimeter_mean    0
dtype: int64

##### Structured data : X, Y
- 목표변수(=**Target**, Y)
- 설명변수(=**feature**, X) — 이 노트북에서는 변수명으로 `labels`를 쓰지만, 일반적으로 **Label**은 목표변수(Y)를 가리키는 용어임

In [10]:
# Separate the target (y) from the explanatory variables (X).
# Note: in this notebook the feature matrix is stored under the name `labels`.
feature_columns = ['texture_mean', 'perimeter_mean']
target = df_BCD_extract['radius_mean']
labels = df_BCD_extract[feature_columns]
target.shape, labels.shape

((569,), (569, 2))

#### 정형화 (train / test 데이터 분할)

In [21]:
from sklearn.model_selection import train_test_split

In [27]:
# Split both the features (`labels`) and the target into train and test parts.
# test_size=0.3 is the proportion of rows held out for testing (30%), not a probability;
# random_state pins the shuffle so the split is reproducible.
labels_train, labels_test, target_train, target_test = train_test_split(
    labels,
    target,
    test_size=0.3,
    random_state=111,
)
labels_train.shape, target_train.shape, labels_test.shape, target_test.shape

((398, 2), (398,), (171, 2), (171,))

#### 모델(알고리즘) 학습

##### 모델 학습
- 모델 선택은 target의 datatype에 종속 (연속형 target → 회귀 모델)

In [11]:
from sklearn.linear_model import LinearRegression

In [28]:
# Fit an ordinary least-squares regression on the training split:
# features are passed as X, the target as y.
model = LinearRegression()
model.fit(X=labels_train, y=target_train)

#### 평가

In [31]:
# Evaluate on the training split: predict the target for every training row.
target_train_predict = model.predict(X=labels_train)
# Number of predictions; should equal the number of rows in target_train.
len(target_train_predict)

398

In [32]:
from sklearn.metrics import r2_score

# R^2 on the training split (true values first, predictions second).
r2_score(y_true=target_train, y_pred=target_train_predict)

0.9961659957070211

In [34]:
# Evaluate on the held-out test split: predict the target for every test row.
target_test_predict = model.predict(X=labels_test)
target_test_predict.shape

(171,)

In [36]:
# R^2 on the test split.
#   train score: 0.9961659957070211
#   test score : 0.9946612956705342
# The two scores are close, so the fit generalizes well to the held-out rows.
r2_score(y_true=target_test, y_pred=target_test_predict)

0.9946612956705342

In [41]:
# 추가 평가 도구 : MSE, RMSE, MAE (수치가 낮을수록 좋음)
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [42]:
# Mean Squared Error on the training split (lower is better).
mean_squared_error(y_true=target_train, y_pred=target_train_predict)

0.048215325513057454

In [45]:
# Root Mean Squared Error on the training split: square root of the MSE,
# which puts the error back in the same units as the target.
train_mse = mean_squared_error(y_true=target_train, y_pred=target_train_predict)
train_mse ** 0.5

0.2195798841266145

In [40]:
# Mean Absolute Error on the training split (lower is better).
mean_absolute_error(y_true=target_train, y_pred=target_train_predict)

0.16056038265140227

#### 미래예측(서비스 개시)

In [15]:
# Show rows at positions 50-54 to pick a known sample for a manual prediction.
df_BCD_extract.iloc[50:55]

Unnamed: 0,radius_mean,texture_mean,perimeter_mean
50,11.76,21.6,74.72
51,13.64,16.34,87.21
52,11.94,18.24,75.71
53,18.22,18.7,120.3
54,15.1,22.02,97.26


In [16]:
# Feature values supplied "from outside" for a single prediction: they are the
# texture_mean / perimeter_mean of the row with index 51 (16.34, 87.21).
# The model was fitted on a DataFrame, so the input is built as a one-row
# DataFrame with the same column names. This avoids scikit-learn's
# "X does not have valid feature names" warning and makes the column order explicit.
temp_label = pd.DataFrame(
    [[16.34, 87.21]],
    columns=['texture_mean', 'perimeter_mean'],
)

In [17]:
# Predict radius_mean for the single sample in temp_label.
# With the two feature values of the row at index 51, the model returns about 13.45,
# close to that row's actual radius_mean of 13.64.
model.predict(X=temp_label)



array([13.45096511])

#### 서비스 배포

In [18]:
import pickle   
# The fitted model is an in-memory class instance, so it disappears when the program exits.
# pickle serializes that in-memory instance to a binary file so it can be stored and reused later.

In [20]:
# open('','') : ''앞은 파일 이름, ''뒤는 목적(read/write), wb: write binary(이진 파일로 저장)
# 메모리에 있던 model을 regression_file에 넣어줌
# Persist the fitted model next to the dataset.
# open(path, mode): 'wb' opens the file for writing in binary mode, which pickle requires.
with open('../../../datasets/BreastCancerWisconsin_Regression.pkl', 'wb') as regression_file:
    # Serialize the in-memory `model` object into the opened file.
    pickle.dump(model, regression_file)