# 변수 선택 추가 실습

## 1. 필요 라이브러리 import

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## 2. 데이터셋 준비

In [2]:
# Sample dataset: 13 people, one list per column. The dict is filled key by key
# so each column's values sit next to its name; insertion order fixes the
# column order of the resulting DataFrame.
data = {}
data['이름'] = [
    '홍길동', '성춘향', '이몽룡', '김철수', '김영희', '김만수', '오지랖',
    '만순이', '문정억', '백만원', '천만원', '수십억', '현재미',
]
# Weight; the BMI formula used later in this notebook treats it as kilograms.
data['몸무게'] = [95, 85, 75, 70, 65, 55, 120, 100, 71, 65, 75, 100, 77]
# Height in centimetres (divided by 100 before being squared for BMI).
data['키'] = [170, 170, 170, 170, 170, 175, 171, 165, 164, 177, 163, 192, 182]
data['거주지'] = [
    '서울', '대구', '대전', '서울', '경북', '서울', '경북', '서울',
    '서울', '전남', '전북', '경북', '서울',
]
# Partially masked registration numbers; later cells read only the first two
# characters (birth year) and the character at index 7 (gender digit).
data['주민번호'] = [
    '81XXXX-1XXXXXX', '81XXXX-1XXXXXX', '79XXXX-2XXXXXX', '71XXXX-1XXXXXX',
    '65XXXX-2XXXXXX', '81XXXX-1XXXXXX', '63XXXX-1XXXXXX', '81XXXX-3XXXXXX',
    '81XXXX-1234562', '55XXXX-1234562', '71XXXX-2234562', '92XXXX-1234562',
    '85XXXX-2234562',
]
data['흡연여부'] = [
    '흡연', '미흡연', '미흡연', '흡연', '흡연', '미흡연', '흡연', '흡연', '흡연',
    '미흡연', '미흡연', '미흡연', '미흡연',
]

df = pd.DataFrame(data)

In [3]:
# Preview the first five rows of the raw DataFrame.
df.head()

Unnamed: 0,이름,몸무게,키,거주지,주민번호,흡연여부
0,홍길동,95,170,서울,81XXXX-1XXXXXX,흡연
1,성춘향,85,170,대구,81XXXX-1XXXXXX,미흡연
2,이몽룡,75,170,대전,79XXXX-2XXXXXX,미흡연
3,김철수,70,170,서울,71XXXX-1XXXXXX,흡연
4,김영희,65,170,경북,65XXXX-2XXXXXX,흡연


In [4]:
# Work on a copy so the derived columns added below do not modify `df`.
df_mod = df.copy()

## 3. 파생변수 생성

### 1) 이름, 흡연 여부(레이블 인코딩 변수 생성)

In [5]:
# Fit a label encoder on the name column. Only the fit is done here; the
# encoder is inspected in the following cells rather than applied to df_mod.
encoding = LabelEncoder()
encoding.fit(df_mod['이름'])

LabelEncoder()

In [8]:
# Show the labels learned by the encoder; a label's position in this list is
# its integer code.
print(list(encoding.classes_))

['김만수', '김영희', '김철수', '만순이', '문정억', '백만원', '성춘향', '수십억', '오지랖', '이몽룡', '천만원', '현재미', '홍길동']


In [9]:
# Map integer codes back to the original labels (codes 1 and 0 here).
print(list(encoding.inverse_transform([1,0])))

['김영희', '김만수']


In [10]:
# Label-encode the smoking flag: fit() returns the fitted encoder, so it can be
# created and fitted in one expression and then used to add the integer column.
encoding2 = LabelEncoder().fit(df_mod['흡연여부'])
df_mod['흡연여부_인코딩'] = encoding2.transform(df_mod['흡연여부'])

In [11]:
# Preview the frame to check the new 흡연여부_인코딩 column.
df_mod.head()

Unnamed: 0,이름,몸무게,키,거주지,주민번호,흡연여부,흡연여부_인코딩
0,홍길동,95,170,서울,81XXXX-1XXXXXX,흡연,1
1,성춘향,85,170,대구,81XXXX-1XXXXXX,미흡연,0
2,이몽룡,75,170,대전,79XXXX-2XXXXXX,미흡연,0
3,김철수,70,170,서울,71XXXX-1XXXXXX,흡연,1
4,김영희,65,170,경북,65XXXX-2XXXXXX,흡연,1


### 2) 나이 변수 생성

In [12]:
# Reference year for the age calculation.
CurYear = 2022


def stdInfo(rrn, cur_year=None):
    """Return the age in calendar years encoded in a resident registration number.

    Parameters
    ----------
    rrn : str
        Number formatted like 'YYMMDD-GXXXXXX'. Only the two-digit birth year
        (characters 0-1) and the gender/century digit (character 7) are read,
        so the remaining characters may be masked.
    cur_year : int, optional
        Year to compute the age against. Defaults to the module-level CurYear.

    Returns
    -------
    int
        ``cur_year`` minus the reconstructed four-digit birth year.

    Raises
    ------
    ValueError
        If the birth-year characters (or the gender digit, when it is
        consulted) are not numeric.
    """
    if cur_year is None:
        cur_year = CurYear
    yy = int(rrn[:2])
    # Digits 3/4 (citizens) and 7/8 (foreign residents) mark births in the
    # 2000s. The year bound is derived from the reference year instead of a
    # fixed cutoff, so people born in the most recent years are not pushed
    # back a century, while an implausible pair such as year 81 with digit 3
    # still resolves to 1981. The gender digit is only read when the year
    # bound holds, as before.
    if yy <= cur_year - 2000 and int(rrn[7]) in (3, 4, 7, 8):
        birth_year = 2000 + yy
    else:
        birth_year = 1900 + yy
    return cur_year - birth_year

In [13]:
# Derive the age column by running stdInfo over every registration number.
df_mod['나이'] = df_mod['주민번호'].map(stdInfo)

In [14]:
# Preview the frame to check the new 나이 (age) column.
df_mod.head()

Unnamed: 0,이름,몸무게,키,거주지,주민번호,흡연여부,흡연여부_인코딩,나이
0,홍길동,95,170,서울,81XXXX-1XXXXXX,흡연,1,41
1,성춘향,85,170,대구,81XXXX-1XXXXXX,미흡연,0,41
2,이몽룡,75,170,대전,79XXXX-2XXXXXX,미흡연,0,43
3,김철수,70,170,서울,71XXXX-1XXXXXX,흡연,1,51
4,김영희,65,170,경북,65XXXX-2XXXXXX,흡연,1,57


### 3) 성별 변수 생성

In [15]:
def sex(rrn):
    """Return '남' (male) or '여' (female) from a resident registration number.

    Parameters
    ----------
    rrn : str
        Number formatted like 'YYMMDD-GXXXXXX'; only the gender digit at
        index 7 is read.

    Returns
    -------
    str
        '남' when the gender digit is odd, otherwise '여'.

    Raises
    ------
    ValueError
        If the character at index 7 is not a digit.
    """
    # Every odd gender digit denotes a male holder (1/3 for citizens, 5/7 for
    # foreign residents, 9 for pre-1900 births) and every even digit a female
    # holder, so test parity instead of listing only 1 and 3.
    return '남' if int(rrn[7]) % 2 == 1 else '여'

In [17]:
# Derive the gender column by running sex() over every registration number.
df_mod['성별'] = df_mod['주민번호'].map(sex)

In [18]:
# Dense (non-sparse) one-hot encoder. The `sparse` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the current
# name first and fall back to the old one on older installations.
try:
    one_encoding = OneHotEncoder(sparse_output=False)
except TypeError:
    one_encoding = OneHotEncoder(sparse=False)

In [20]:
# One-hot encode the gender column and append the indicator columns to df_mod.
gender_onehot = one_encoding.fit_transform(df_mod[['성별']])
df_mod_one = pd.DataFrame(
    gender_onehot,
    # Name the columns from the categories the encoder actually learned, so
    # the names always match the encoded column order and count (for this
    # data: 성별_남, 성별_여).
    columns=['성별_' + str(category) for category in one_encoding.categories_[0]],
    # pd.concat(axis=1) aligns rows by index label; reuse df_mod's index so
    # the rows stay paired even if df_mod does not have a default RangeIndex.
    index=df_mod.index,
)
df_mod = pd.concat([df_mod, df_mod_one], axis=1)

In [21]:
# Preview the frame to check the 성별 column and its one-hot indicator columns.
df_mod.head()

Unnamed: 0,이름,몸무게,키,거주지,주민번호,흡연여부,흡연여부_인코딩,나이,성별,성별_남,성별_여
0,홍길동,95,170,서울,81XXXX-1XXXXXX,흡연,1,41,남,1.0,0.0
1,성춘향,85,170,대구,81XXXX-1XXXXXX,미흡연,0,41,남,1.0,0.0
2,이몽룡,75,170,대전,79XXXX-2XXXXXX,미흡연,0,43,여,0.0,1.0
3,김철수,70,170,서울,71XXXX-1XXXXXX,흡연,1,51,남,1.0,0.0
4,김영희,65,170,경북,65XXXX-2XXXXXX,흡연,1,57,여,0.0,1.0


### 4) 나이 변수의 범주화 변수 생성(연속형 변수의 범주형 변수 변환)

In [22]:
# Bucket ages into decades labelled by their lower bound. pd.cut closes bins on
# the right by default, which would place an age of exactly 30 in (20, 30] and
# label it 20; right=False makes the bins [0, 10), [10, 20), ... so a boundary
# age lands in its own decade. Ages outside [0, 70) match no bin and become NaN.
df_mod['나이_범주'] = pd.cut(
    df_mod.나이,
    bins=[0, 10, 20, 30, 40, 50, 60, 70],
    labels=[0, 10, 20, 30, 40, 50, 60],
    right=False,
)

In [23]:
# Preview the frame to check the new 나이_범주 (age bucket) column.
df_mod.head()

Unnamed: 0,이름,몸무게,키,거주지,주민번호,흡연여부,흡연여부_인코딩,나이,성별,성별_남,성별_여,나이_범주
0,홍길동,95,170,서울,81XXXX-1XXXXXX,흡연,1,41,남,1.0,0.0,40
1,성춘향,85,170,대구,81XXXX-1XXXXXX,미흡연,0,41,남,1.0,0.0,40
2,이몽룡,75,170,대전,79XXXX-2XXXXXX,미흡연,0,43,여,0.0,1.0,40
3,김철수,70,170,서울,71XXXX-1XXXXXX,흡연,1,51,남,1.0,0.0,50
4,김영희,65,170,경북,65XXXX-2XXXXXX,흡연,1,57,여,0.0,1.0,50


### 5) BMI 변수 생성

In [24]:
# BMI = weight / height^2, with height converted from centimetres to metres.
# Written as explicit column arithmetic; the height term is multiplied by
# itself (rather than raised to a power) to keep the same operation order.
df_mod['BMI'] = df_mod['몸무게'] / ((df_mod['키'] / 100) * (df_mod['키'] / 100))

In [25]:
# Preview the frame to check the new BMI column.
df_mod.head()

Unnamed: 0,이름,몸무게,키,거주지,주민번호,흡연여부,흡연여부_인코딩,나이,성별,성별_남,성별_여,나이_범주,BMI
0,홍길동,95,170,서울,81XXXX-1XXXXXX,흡연,1,41,남,1.0,0.0,40,32.871972
1,성춘향,85,170,대구,81XXXX-1XXXXXX,미흡연,0,41,남,1.0,0.0,40,29.411765
2,이몽룡,75,170,대전,79XXXX-2XXXXXX,미흡연,0,43,여,0.0,1.0,40,25.951557
3,김철수,70,170,서울,71XXXX-1XXXXXX,흡연,1,51,남,1.0,0.0,50,24.221453
4,김영희,65,170,경북,65XXXX-2XXXXXX,흡연,1,57,여,0.0,1.0,50,22.491349


In [26]:
# Classify BMI into five ordered classes using right-closed bins
# (0, 18], (18, 23], (23, 25], (25, 30], (30, 90]; values outside that range
# match no bin and become NaN.
df_mod['BMI분류'] = pd.cut(
    df_mod['BMI'],
    bins=[0, 18, 23, 25, 30, 90],
    labels=['저체중', '정상', '과체중', '비만', '고도비만'],
)
# Numeric code per class: the label's position in the ordered list (0..4).
df_mod['BMI분류코드'] = df_mod['BMI분류'].map(
    dict(zip(['저체중', '정상', '과체중', '비만', '고도비만'], range(5)))
)

In [27]:
# Preview the frame to check the BMI분류 and BMI분류코드 columns.
df_mod.head()

Unnamed: 0,이름,몸무게,키,거주지,주민번호,흡연여부,흡연여부_인코딩,나이,성별,성별_남,성별_여,나이_범주,BMI,BMI분류,BMI분류코드
0,홍길동,95,170,서울,81XXXX-1XXXXXX,흡연,1,41,남,1.0,0.0,40,32.871972,고도비만,4
1,성춘향,85,170,대구,81XXXX-1XXXXXX,미흡연,0,41,남,1.0,0.0,40,29.411765,비만,3
2,이몽룡,75,170,대전,79XXXX-2XXXXXX,미흡연,0,43,여,0.0,1.0,40,25.951557,비만,3
3,김철수,70,170,서울,71XXXX-1XXXXXX,흡연,1,51,남,1.0,0.0,50,24.221453,과체중,2
4,김영희,65,170,경북,65XXXX-2XXXXXX,흡연,1,57,여,0.0,1.0,50,22.491349,정상,1


In [28]:
# Keep only the numeric / encoded columns: age bucket, gender indicators,
# smoking code, height, weight and BMI class code. The raw text columns
# (name, residence, registration number, ...) are left out.
data_pre = df_mod[['나이_범주', '성별_남', '성별_여', '흡연여부_인코딩', '키', '몸무게', 'BMI분류코드']]
# Preview the selected columns.
data_pre.head()

Unnamed: 0,나이_범주,성별_남,성별_여,흡연여부_인코딩,키,몸무게,BMI분류코드
0,40,1.0,0.0,1,170,95,4
1,40,1.0,0.0,0,170,85,3
2,40,0.0,1.0,0,170,75,3
3,50,1.0,0.0,1,170,70,2
4,50,0.0,1.0,1,170,65,1
