# 성인 인구조사 소득 예측

- age: 나이
- workclass: 고용 형태
- fnlwgt: 사람의 대표성을 나타내는 가중치(final weight)
- education: 교육 수준
- education.num: 교육 수준 수치
- marital.status: 결혼 상태
- occupation: 업종
- relationship: 가족 관계
- race: 인종
- sex: 성별
- capital.gain: 양도 소득
- capital.loss: 양도 손실
- hours.per.week: 주당 근무 시간
- native.country: 국적
- income: 수익 (예측해야 하는 값)

In [114]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("../input/adult-census-income/adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 사용자 코딩

## 라이브러리 불러오기

In [115]:
import pandas as pd
import numpy as np

x = X_train.copy()
y = y_train.copy()
test = X_test.copy()

print(x.head())
print(y.head())

In [116]:
# EDA
# 데이터 형태, 요약, 이상치

print(x.info())
print(x.describe())
print(x.describe(include = 'object'))
print(x.isnull().sum())

In [117]:
# 결측값 채우고, 삭제

x['workclass'] = x['workclass'].fillna('Private')
test['workclass'] = test['workclass'].fillna('Private')
x['occupation'] = x['occupation'].fillna('Exec-managerial')
test['occupation'] = test['occupation'].fillna('Exec-managerial')

In [118]:
x = x.drop('native.country',1)
test = test.drop('native.country',1)

In [119]:
print(x.isnull().sum())
print(test.isnull().sum())

In [120]:
x.describe(include ='object').columns

In [121]:
x.describe()

In [122]:
numeric_features = [
                    'age',
                    'fnlwgt', 
                    'education.num',
                    'capital.gain', 
                    'capital.loss', 
                    'hours.per.week',                     
                   ]

In [123]:
# 큰 값들 스케일 조정
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x[numeric_features] = scaler.fit_transform(x[numeric_features])
test[numeric_features] = scaler.fit_transform(test[numeric_features])

In [124]:
# 라벨인코딩

from sklearn.preprocessing import LabelEncoder

cols = ['workclass', 'education', 'marital.status', 'occupation','relationship', 'race', 'sex']
for col in cols:
    le = LabelEncoder()
    x[col] = le.fit_transform(x[col])
    test[col] = le.fit_transform(test[col])

In [125]:
y.info()

In [126]:
id = test['id']

In [127]:
y['income'] = (y['income'] != '<=50K').astype(int)

In [128]:
# 머신러닝
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x,y['income'])
pred = model.predict(test)

In [129]:
# csv생성
output = pd.DataFrame({'id': id, 'income':pred})
output.to_csv("000000.csv", index=False)
output.head()

In [130]:
y_test = (y_test['income'] != '<=50K').astype(int)
from sklearn.metrics import accuracy_score
print('accuracy score:', (accuracy_score(y_test, pred)))

In [None]:
accuracy score: 0.8642714570858283
accuracy score: 0.8598188238906802

## 데이터 불러오기(생략)