## 성인 인구조사 소득 예측
- age: 나이
- workclass: 고용 형태
- fnlwgt: 사람의 대표성을 나타내는 가중치(final weight)
- education: 교육 수준
- education.num: 교육 수준 수치
- marital.status: 결혼 상태
- occupation: 직업
- relationship: 가족 관계
- race: 인종
- sex: 성별
- capital.gain: 양도 소득
- capital.loss: 양도 손실
- hours.per.week: 주당 근무 시간
- native.country: 국적
- income: 소득 (예측해야 하는 값)

In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split *df* into exam-style train/test feature and label frames.

    Args:
        df: Source DataFrame holding both the features and the target column.
            It is never modified.
        target: Name of the column to predict.
        id_name: Name of an existing identifier column. When empty, the
            frame's index is turned into a new ``id`` column.
        null_name: Placeholder that marks missing data (e.g. ``'?'``). When
            non-empty, every cell equal to it is replaced with ``NaN``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames contain
        every column except *target*; the ``y_*`` frames contain the id
        column and *target*. The split is 80/20 with ``random_state=2021``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'

    if null_name != "":
        # Work on a copy: the masked assignment below writes in place and
        # would otherwise alter the caller's frame when id_name is supplied.
        df = df.copy()
        df[df == null_name] = np.nan

    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    # Labels keep the id column so predictions can be matched back to rows.
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Read the adult.csv dataset and build the exam-style train/test split;
# cells equal to '?' are converted to NaN inside exam_data_load.
df = pd.read_csv("../../dataset/adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

# Display the shapes of the four resulting frames.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

##### 라이브러리 불러오기

In [2]:
import pandas as pd
import numpy as np

##### 데이터 불러오기(생략)

In [3]:
# 시험환경에서는 아래와 같이 제공된다고 함
# import pandas as pd
# X_test = pd.read_csv("data/X_test.csv")
# X_train = pd.read_csv("data/X_train.csv")
# y_train = pd.read_csv("data/y_train.csv")

##### EDA

In [4]:
# Check the size of each frame
X_train.shape, X_test.shape, y_train.shape

((26048, 15), (6513, 15), (26048, 2))

In [5]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
32345,32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [6]:
# Count how many rows fall into each target class
y_train['income'].value_counts()

<=50K    19756
>50K      6292
Name: income, dtype: int64

In [7]:
# Check column dtypes and non-null counts
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 21851 to 25716
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26048 non-null  int64 
 1   age             26048 non-null  int64 
 2   workclass       24592 non-null  object
 3   fnlwgt          26048 non-null  int64 
 4   education       26048 non-null  object
 5   education.num   26048 non-null  int64 
 6   marital.status  26048 non-null  object
 7   occupation      24585 non-null  object
 8   relationship    26048 non-null  object
 9   race            26048 non-null  object
 10  sex             26048 non-null  object
 11  capital.gain    26048 non-null  int64 
 12  capital.loss    26048 non-null  int64 
 13  hours.per.week  26048 non-null  int64 
 14  native.country  25587 non-null  object
dtypes: int64(7), object(8)
memory usage: 3.2+ MB


In [8]:
# Split the feature columns by dtype
# Numeric features: every column whose dtype is not object
numeric_features = X_train.select_dtypes(exclude='object').columns
# Categorical features: every object-dtype column
cat_features = X_train.select_dtypes(include='object').columns
numeric_features, cat_features

(Index(['id', 'age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
        'hours.per.week'],
       dtype='object'),
 Index(['workclass', 'education', 'marital.status', 'occupation',
        'relationship', 'race', 'sex', 'native.country'],
       dtype='object'))

In [9]:
# Exclude the row identifier from the numeric feature list. Dropping it by
# name (instead of slicing off position 0) does not depend on column order
# and is safe to re-run: a second execution no longer removes 'age'.
numeric_features = numeric_features.drop('id', errors='ignore')
numeric_features

Index(['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
       'hours.per.week'],
      dtype='object')

In [10]:
# Summary statistics of the numeric features
X_train[numeric_features].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,26048.0,26048.0,26048.0,26048.0,26048.0,26048.0
mean,38.610335,189574.1,10.082118,1081.193796,88.477695,40.420224
std,13.628346,104384.8,2.574608,7404.962675,404.689981,12.354707
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,118247.2,9.0,0.0,0.0,40.0
50%,37.0,178575.5,10.0,0.0,0.0,40.0
75%,48.0,236596.8,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [11]:
# Summary (count / unique / top / freq) of the categorical features
X_train[cat_features].describe()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country
count,24592,26048,26048,24585,26048,26048,26048,25587
unique,8,16,7,14,6,5,2,41
top,Private,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
freq,18160,8408,11987,3323,10558,22270,17400,23381


##### 결측치 처리

In [12]:
# Number of missing values per column in the training features
X_train.isnull().sum()

id                   0
age                  0
workclass         1456
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1463
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     461
dtype: int64

In [13]:
# Number of missing values per column in the test features
X_test.isnull().sum()

id                  0
age                 0
workclass         380
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation        380
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    122
dtype: int64

In [14]:
# Frequency of each workclass value (missing values are not counted)
X_train['workclass'].value_counts()

Private             18160
Self-emp-not-inc     2049
Local-gov            1648
State-gov            1037
Self-emp-inc          909
Federal-gov           770
Without-pay            12
Never-worked            7
Name: workclass, dtype: int64

In [15]:
# Frequency of each occupation value (missing values are not counted)
X_train['occupation'].value_counts()

Exec-managerial      3323
Prof-specialty       3306
Craft-repair         3296
Adm-clerical         3037
Sales                2898
Other-service        2624
Machine-op-inspct    1584
Transport-moving     1257
Handlers-cleaners    1080
Farming-fishing       786
Tech-support          746
Protective-serv       521
Priv-house-serv       119
Armed-Forces            8
Name: occupation, dtype: int64

In [16]:
# Frequency of each native.country value (missing values are not counted)
X_train['native.country'].value_counts()

United-States                 23381
Mexico                          516
Philippines                     158
Germany                         108
Canada                           88
Puerto-Rico                      87
El-Salvador                      76
India                            73
Cuba                             73
England                          69
Italy                            63
South                            62
Jamaica                          59
Vietnam                          57
China                            57
Guatemala                        54
Dominican-Republic               51
Japan                            49
Poland                           47
Columbia                         44
Taiwan                           37
Haiti                            37
Iran                             34
Portugal                         32
Peru                             29
Nicaragua                        27
Ecuador                          25
Greece                      

- 결측치는 최빈값의 빈도가 다른 값들보다 압도적으로 크면 최빈값으로, 값들의 빈도가 서로 비슷하면 별도의 값("null")으로 대체함

In [17]:
# Most frequent workclass value in the training features
X_train['workclass'].mode()

0    Private
Name: workclass, dtype: object

In [18]:
def data_fillna(df, reference=None):
    """Return a copy of *df* with its missing categorical values filled.

    ``workclass`` and ``native.country`` are filled with their most frequent
    value; ``occupation`` is filled with the literal category ``"null"``.

    Args:
        df: Frame containing the ``workclass``, ``occupation`` and
            ``native.country`` columns.
        reference: Optional frame from which the most frequent values are
            taken, so that several frames can be filled with the same
            values. Defaults to *df* itself.

    Returns:
        A new DataFrame; *df* itself is left untouched.
    """
    source = df if reference is None else reference
    filled = df.copy()

    for column in ('workclass', 'native.country'):
        modes = source[column].mode()
        # mode() is empty when the column holds no non-null value; in that
        # case there is nothing sensible to fill with, so leave it as is.
        if not modes.empty:
            filled[column] = filled[column].fillna(modes.iloc[0])

    filled['occupation'] = filled['occupation'].fillna("null")
    return filled

# Fill the missing values in both frames.
# NOTE(review): each call computes the most frequent value from the frame it
# is given, so X_test is filled from its own distribution rather than from
# the training data — confirm this is intended.
X_train = data_fillna(X_train)
X_test = data_fillna(X_test)

# Verify that no missing values remain in the training features.
X_train.isnull().sum()

id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

##### 피처엔지니어링

In [19]:
from sklearn.preprocessing import LabelEncoder

# Tag each frame with its origin and stack them so that every categorical
# column is encoded with one mapping shared by train and test.
tagged_frames = [X_train.assign(ind='train'), X_test.assign(ind='test')]
all_df = pd.concat(tagged_frames)

# apply() calls fit_transform once per column, so the single encoder is
# refit for each categorical feature in turn.
le = LabelEncoder()
all_df[cat_features] = all_df[cat_features].apply(le.fit_transform)

# Split the stacked frame back apart and discard the helper column.
from_train = all_df['ind'] == 'train'
from_test = all_df['ind'] == 'test'
X_train = all_df[from_train].drop('ind', axis=1)
X_test = all_df[from_test].drop('ind', axis=1)

In [20]:
# Preview the training features after label encoding
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,3,241998,9,13,2,2,0,4,1,0,0,50,38
7632,7632,53,3,103950,12,14,0,9,1,4,0,0,0,40,38
27878,27878,19,3,203061,15,10,4,12,1,4,0,0,0,25,38
14121,14121,20,3,102607,11,9,4,5,3,4,1,0,0,30,38
32345,32345,54,6,138852,11,9,2,9,0,4,1,0,0,40,38


In [21]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

# Learn each numeric column's min/max from the training frame only, then
# apply that same mapping to both frames.
scaler = MinMaxScaler().fit(X_train[numeric_features])

X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,0.260274,3,0.156011,9,0.8,2,2,0,4,1,0.0,0.0,0.5,38
7632,7632,0.493151,3,0.062255,12,0.866667,0,9,1,4,0,0.0,0.0,0.397959,38
27878,27878,0.027397,3,0.129566,15,0.6,4,12,1,4,0,0.0,0.0,0.244898,38
14121,14121,0.041096,3,0.061343,11,0.533333,4,5,3,4,1,0.0,0.0,0.295918,38
32345,32345,0.506849,6,0.085958,11,0.533333,2,9,0,4,1,0.0,0.0,0.397959,38


In [22]:
# Converting the target values: first inspect the label frame's dtypes
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 21851 to 25716
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      26048 non-null  int64 
 1   income  26048 non-null  object
dtypes: int64(1), object(1)
memory usage: 610.5+ KB


In [23]:
# Preview the raw target labels
y_train.head()

Unnamed: 0,id,income
21851,21851,>50K
7632,7632,<=50K
27878,27878,<=50K
14121,14121,<=50K
32345,32345,<=50K


In [24]:
# Binary target: 0 for '<=50K', 1 for every other label.
income_labels = y_train['income']
y = income_labels.ne('<=50K').astype(int)
y.head()

21851    1
7632     0
27878    0
14121    0
32345    0
Name: income, dtype: int64

##### 검증용 데이터 분리

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows as a validation set (fixed seed).
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.15, random_state=2022)
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((22140, 15), (3908, 15), (22140,), (3908,))

In [26]:
# Preview the training split
X_tr.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
1956,1956,0.342466,3,0.163155,14,0.933333,2,3,0,4,1,0.150242,0.0,0.397959,38
21954,21954,0.410959,3,0.133744,6,0.266667,2,6,0,2,1,0.0,0.0,0.561224,38
7029,7029,0.232877,3,0.030289,15,0.6,4,7,4,4,0,0.0,0.0,0.397959,38
17468,17468,0.136986,3,0.132386,11,0.533333,4,5,3,4,1,0.0,0.0,0.397959,38
19800,19800,0.69863,3,0.04176,11,0.533333,6,11,1,4,0,0.0,0.0,0.234694,38


In [27]:
# Remove the id column from both splits so it is not used as a feature
X_tr, X_val = (part.drop(columns='id') for part in (X_tr, X_val))
# Confirm that id is gone
X_tr.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
1956,0.342466,3,0.163155,14,0.933333,2,3,0,4,1,0.150242,0.0,0.397959,38


##### 모델 & 평가

In [28]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit on the training split, then score on the held-out validation split.
model = DecisionTreeClassifier(random_state=2022).fit(X_tr, y_tr)
pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, pred)
print('accuracy score:', val_accuracy)

accuracy score: 0.8039918116683725


In [29]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# Fit on the training split, then score on the held-out validation split.
model = RandomForestClassifier(random_state=2022).fit(X_tr, y_tr)
pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, pred)
print('accuracy score:', val_accuracy)

accuracy score: 0.8508188331627431


In [30]:
# Predict on the test data (pop returns the column and removes it from the frame)
# `model` is whichever estimator was fitted most recently in this session.
X_test_id = X_test.pop('id')
pred = model.predict(X_test)

In [31]:
# Create the submission csv (id plus the predicted 0/1 income class)
output = pd.DataFrame({'id': X_test_id, 'income':pred})
# NOTE(review): "audlt" in the file name looks like a typo for "adult" — confirm before renaming.
output.to_csv("audlt-census-income.csv", index=False)
output.head()

Unnamed: 0,id,income
20901,20901,1
14170,14170,0
1776,1776,1
30428,30428,0
8602,8602,0


## 채점 (수험자는 확인 불가)

In [32]:
from sklearn.metrics import accuracy_score

# Binarise the held-out labels (1 for anything other than '<=50K'). The
# conversion is guarded so that re-running this cell does not fail once
# y_test has already been replaced by the binary Series.
if isinstance(y_test, pd.DataFrame):
    y_test = (y_test['income'] != '<=50K').astype(int)
print('accuracy score:', (accuracy_score(y_test, pred)))

accuracy score: 0.8575157377552587
