### 모델링 들어가기 전 하는 과정
- 전처리 -> 데이터세트 분리 -> 모델링


- 전처리 : 피처엔지니어링, 스케일링, 인코딩


- 데이터세트 분리 : train & test
- 데이터 분리하는 이유 : 예측한 모델을 단순하게 한 번 하고 끝내는 것이 아닌 '일반화'를 위해 진행
- '일반화'를 하기 위해서는 과적합을 피해야 함
- 수능(test), 모의고사(train)
- 과적합 : 과도한 데이터 학습 => 실제 train 데이터에서는 결과 good!  but, test 데이터에서는 결과 bad

In [3]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [4]:
import pandas as pd

In [5]:
iris = load_iris()

In [6]:
iris_data = iris.data

In [7]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [9]:
iris_label = iris.target
iris_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
iris_df = pd.DataFrame(data = iris_data, columns = iris.feature_names)
iris_df['y'] = iris.target

In [11]:
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),y
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


### 데이터 세트 분리
- train데이터로 학습된 모델의 성능 평가를 위해 test데이터 필요
    - sklearn에서 제공하는 train_test_split() API 제공
    - test(데이터분리 가능), size 조정 가능, 비중 조정 가능

In [12]:
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size = 0.3, random_state = 111)
# train : test = 0.7 : 0.3
# random_state ;  섞는 개념으로 이해할 것

In [13]:
X_train   # 4개의 컬럼들에 대해 들어감

array([[5.5, 2.6, 4.4, 1.2],
       [6.9, 3.1, 4.9, 1.5],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5.7, 2.9, 4.2, 1.3],
       [5.1, 2.5, 3. , 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [6. , 2.7, 5.1, 1.6],
       [5.7, 3.8, 1.7, 0.3],
       [6.1, 2.9, 4.7, 1.4],
       [4.5, 2.3, 1.3, 0.3],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.5, 4.5, 1.7],
       [4.9, 3.1, 1.5, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [6. , 3. , 4.8, 1.8],
       [5.5, 2.5, 4. , 1.3],
       [5.9, 3. , 4.2, 1.5],
       [5.9, 3. , 5.1, 1.8],
       [5.4, 3.9, 1.7, 0.4],
       [6.4, 3.2, 4.5, 1.5],
       [7.7, 3.8, 6.7, 2.2],
       [7.2, 3. , 5.8, 1.6],
       [5.2, 3.5, 1.5, 0.2],
       [6.2, 3.4, 5.4, 2.3],
       [5.1, 3.8, 1.9, 0.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.4, 2.7, 5.3, 1.9],
       [5.5, 2.4, 3.8, 1.1],
       [6.9, 3.2, 5.7, 2.3],
       [6.8, 3.2, 5.9, 2.3],
       [6.6, 3. , 4.4, 1.4],
       [6. , 2.2, 4. , 1. ],
       [6.4, 2.8, 5.6, 2.1],
       [6.1, 2

In [14]:
X_test

array([[5.1, 3.4, 1.5, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [7.2, 3.6, 6.1, 2.5],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3. , 5. , 1.7],
       [4.8, 3.4, 1.9, 0.2],
       [5.1, 3.5, 1.4, 0.2],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.3, 5.7, 2.5],
       [6.3, 3.3, 4.7, 1.6],
       [7.9, 3.8, 6.4, 2. ],
       [5.5, 3.5, 1.3, 0.2],
       [6. , 2.2, 5. , 1.5],
       [6.7, 3.3, 5.7, 2.1],
       [7.6, 3. , 6.6, 2.1],
       [5.7, 4.4, 1.5, 0.4],
       [6.2, 2.8, 4.8, 1.8],
       [4.9, 2.4, 3.3, 1. ],
       [5. , 3.2, 1.2, 0.2],
       [6.7, 3.1, 5.6, 2.4],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 3.1, 5.5, 1.8],
       [5.7, 2.8, 4.5, 1.3],
       [5.8, 2.7, 3.9, 1.2],
       [5.7, 2.5, 5. , 2. ],
       [5.1, 3.3, 1.7, 0.5],
       [5. , 3. , 1.6, 0.2],
       [5.9, 3.2, 4.8, 1.8],
       [4.7, 3.2, 1.3, 0.2],
       [6.3, 3.4, 5.6, 2.4],
       [4.6, 3.6, 1. , 0.2],
       [5.7, 3. , 4.2, 1.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.4, 3.9, 1.3, 0.4],
       [6.7, 3

In [15]:
y_train

array([1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 0, 0, 2, 1, 1, 2, 0, 1, 2,
       2, 0, 2, 0, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 0, 2, 2, 0, 0, 1, 1, 2,
       2, 0, 2, 2, 0, 1, 1, 1, 1, 2, 2, 0, 1, 0, 1, 2, 2, 2, 2, 0, 0, 2,
       1, 0, 0, 2, 2, 1, 0, 1, 2, 0, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 0, 1,
       2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 2, 1, 0, 1, 2, 1])

In [16]:
y_test

array([0, 0, 2, 2, 1, 0, 0, 2, 2, 1, 2, 0, 2, 2, 2, 0, 2, 1, 0, 2, 1, 2,
       1, 1, 2, 0, 0, 1, 0, 2, 0, 1, 0, 0, 1, 2, 2, 1, 0, 1, 1, 1, 1, 1,
       2])

In [19]:
df_clf = DecisionTreeClassifier(random_state = 111)

In [20]:
df_clf.fit(X_train, y_train)   # train데이터로 모델링

In [22]:
# 예측
pred = df_clf.predict(X_test)

# 예측값
pred

array([0, 0, 2, 2, 2, 0, 0, 2, 2, 1, 2, 0, 1, 2, 2, 0, 2, 1, 0, 2, 1, 2,
       1, 1, 2, 0, 0, 2, 0, 2, 0, 1, 0, 0, 1, 2, 2, 1, 0, 1, 1, 1, 1, 1,
       2])

In [26]:
# 실제값
y_test

array([0, 0, 2, 2, 1, 0, 0, 2, 2, 1, 2, 0, 2, 2, 2, 0, 2, 1, 0, 2, 1, 2,
       1, 1, 2, 0, 0, 1, 0, 2, 0, 1, 0, 0, 1, 2, 2, 1, 0, 1, 1, 1, 1, 1,
       2])

In [27]:
# 실제값 y_test와 비교

# 정확도 확인
from sklearn.metrics import accuracy_score
print('DT 정확도 :', accuracy_score(y_test, pred))

DT 정확도 : 0.9333333333333333


---------------------------------------

In [28]:
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [29]:
# 예측하고자 하는 값 (y) -> 생존여부 survived열

In [31]:
tt_df = titanic[['survived', 'pclass', 'sibsp', 'parch', 'fare']]
tt_df

Unnamed: 0,survived,pclass,sibsp,parch,fare
0,0,3,1,0,7.2500
1,1,1,1,0,71.2833
2,1,3,0,0,7.9250
3,1,1,1,0,53.1000
4,0,3,0,0,8.0500
...,...,...,...,...,...
886,0,2,0,0,13.0000
887,1,1,0,0,30.0000
888,0,3,1,2,23.4500
889,1,1,0,0,30.0000


In [32]:
tt_df.columns

Index(['survived', 'pclass', 'sibsp', 'parch', 'fare'], dtype='object')

In [33]:
tt_df_x = tt_df[['pclass', 'sibsp', 'parch', 'fare']]

In [35]:
len(tt_df_x)

891

In [34]:
tt_df_y = tt_df[['survived']]

In [36]:
len(tt_df_y)

891

In [37]:
X_train, X_test, y_train, y_test = train_test_split(tt_df_x, tt_df_y, test_size = 0.3, random_state = 111)

In [38]:
X_train

Unnamed: 0,pclass,sibsp,parch,fare
195,1,0,0,146.5208
150,2,0,0,12.5250
572,1,0,0,26.3875
134,2,0,0,13.0000
96,1,0,0,34.6542
...,...,...,...,...
275,1,1,0,77.9583
86,3,1,3,34.3750
724,1,1,0,53.1000
876,3,0,0,9.8458


In [39]:
y_train

Unnamed: 0,survived
195,1
150,0
572,1
134,0
96,0
...,...
275,1
86,0
724,1
876,0


In [40]:
X_test

Unnamed: 0,pclass,sibsp,parch,fare
374,3,3,1,21.0750
211,2,0,0,21.0000
258,1,0,0,512.3292
584,3,0,0,8.7125
461,3,0,0,8.0500
...,...,...,...,...
383,1,1,0,52.0000
719,3,0,0,7.7750
291,1,1,0,91.0792
432,2,1,0,26.0000


In [41]:
y_test

Unnamed: 0,survived
374,0
211,1
258,1
584,0
461,0
...,...
383,1
719,0
291,1
432,1


In [43]:
# 예측
pred = df_clf.predict(X_test)

# 예측값
pred



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [44]:
# 실제값 y_test와 비교

# 정확도 확인
from sklearn.metrics import accuracy_score
print('DT 정확도 :', accuracy_score(y_test, pred))

DT 정확도 : 0.6156716417910447


## 다음시간
- 교차검증 진행
- 과적합을 피하기 위해 진행하는 것

----------------------------------

## 필수과제 
### 01
- 교차검증에 대한 개념 정리하여 
- jupyter파일 or 워드파일에 자유롭게 '본인의 언어로' 정리!!
- 노션 권한 부여된 후 2일 안에 업로드할 것

### 02 
- 공유해준 데이터, 데이터에 대한 설명
- 본인이 로직 세워서 전처리하고, 바로 모델에 학습시킬 수 있도록 진행해올 것
- 전처리 ; 모든 문자열 데이터가 수치형 데이터로 정리되어있어야 함
- 수업 전날(토요일)에 본인이 정리한 csv파일과 전처리 로직에 대해 공유할 것
- 카톡으로 요청받은 사람은 반드시 공유 필수