In [1]:
# Naive Bayes로 iris 데이터를 학습한다.
# feature들이 모두 실숫값이므로 gaussian model을 사용한다.
# ------------------------------------------------------
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# iris data set을 읽어온다
iris = load_iris()

# Train 데이터 세트와 Test 데이터 세트를 구성한다
x_train, x_test, y_train, y_test = \
    train_test_split(iris.data, iris.target, test_size = 0.2)

# Gaussian model로 Train 데이터 세트를 학습한다.
model = GaussianNB()
model.fit(x_train, y_train)

print("\n* Gaussian model :")
print("* 학습용 데이터로 측정한 정확도 = %.2f" % model.score(x_train, y_train))
print("* 시험용 데이터로 측정한 정확도 = %.2f" % model.score(x_test, y_test))



* Gaussian model :
* 학습용 데이터로 측정한 정확도 = 0.97
* 시험용 데이터로 측정한 정확도 = 0.90


In [4]:
# Naive Bayes 분류기로 income 데이터 세트를 학습한다.
# categorical과 gaussian feature가 섞여 있는 경우, 각 feature를 분리하여 MultinomialNB와
# GaussianNB로 나눠서 학습하고 추정 확률을 곱한 값으로 시험 데이터의 label을 추정한다.
# ------------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# income 데이터를 읽어온다
# https://www.kaggle.com/wenruliu/adult-income-dataset?select=adult.csv
DATA_PATH = 'C:\\Users\\배진우\\Documents\\multiCampus_TA\\python_data\\'
income = pd.read_csv(DATA_PATH + 'adult.csv')
income.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [7]:
print(income.columns)
# 1, 3,5,6,7,8,9,13
print(len(income.columns))

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')
15


In [10]:
# categorical feature를 숫자로 변환한다.
cat_features = ["workclass","education", "marital-status", "occupation", "relationship", 
                "race","gender","native-country", "income"]

for c in cat_features:
    income[c] = pd.Categorical(income[c]).codes

# Train 데이터 세트와 Test 데이터 세트를 구성한다
x = np.array(income)[:, :-1]
y = np.array(income)[:, -1]

In [11]:
income

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
48838,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
48839,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0
48840,22,4,201490,11,9,4,1,3,4,1,0,0,20,39,0


In [42]:
pd.isnull(income)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48838,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48839,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48840,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# categorical feature를 multinomial naive bayes로 학습한다.
# --------------------------------------------------------
cat_n = [1, 3, 5, 6, 7, 8, 9, 13]
catx_train = x_train[:, cat_n]
catx_test = x_test[:, cat_n]

# Multinomial model로 categorical Train 데이터 세트를 학습한다.
model_m = MultinomialNB(alpha=1.0)  # alpha = 1.0 : Laplace smoothing (default)
model_m.fit(catx_train, y_train)

MultinomialNB()

In [14]:
# gaussian feature를 gaussian naive bayes로 학습한다.
# --------------------------------------------------
gau_n = [0, 2, 4, 10, 11, 12]
gaux_train = x_train[:, gau_n]
gaux_test = x_test[:, gau_n]

# Gaussian model로 gaussian Train 데이터 세트를 학습한다.
model_g = GaussianNB()
model_g.fit(gaux_train, y_train)

GaussianNB()

In [15]:
# 시험 데이터를 이용하여 정확도를 측정한다. 시험데이터도 categorical과 gaussian으로
# 분리돼 있다. 각각의 모형에 따라 확률을 추정한다.
cat_prob = model_m.predict_proba(catx_test)
gau_prob = model_g.predict_proba(gaux_test)

# 두 확률을 곱한다.
mix_prob = np.multiply(cat_prob, gau_prob)

# 두 확률의 곱으로 정확도를 측정한다.
mix_label = np.argmax(mix_prob, 1)
accuracy = (y_test == mix_label).mean()

print("\n* 시험용 데이터로 측정한 정확도 = %.2f" % accuracy)


* 시험용 데이터로 측정한 정확도 = 0.80


In [44]:
cat_train_prob = model_m.predict_proba(catx_train)
gau_train_prob = model_g.predict_proba(gaux_train)

cat_test_prob = model_m.predict_proba(catx_test)
gau_test_prob = model_g.predict_proba(gaux_test)


print(cat_prob[:,[0]])
print(cat_prob[:,[0]].shape)

[[0.66535054]
 [0.90381109]
 [0.84308479]
 ...
 [0.66761381]
 [0.59205989]
 [0.35563127]]
(9769, 1)


In [45]:
print(gau_prob[:,[0]])
print(gau_prob[:,[0]].shape)

[[0.98327584]
 [0.9989211 ]
 [0.99535435]
 ...
 [0.9875966 ]
 [0.98264692]
 [0.99543396]]
(9769, 1)


In [57]:
train_dt = np.concatenate((cat_train_prob[:,[0]],gau_train_prob[:,[0]]), axis = 1 )
print("train_dt : \n", train_dt)

test_dt = np.concatenate((cat_test_prob[:,[0]],gau_test_prob[:,[0]]), axis = 1 )
print("test_dt : \n", test_dt)

train_dt : 
 [[5.65879273e-01 9.88958264e-01]
 [9.58066599e-01 9.98067895e-01]
 [4.27372407e-01 1.10051372e-11]
 ...
 [5.07426512e-01 9.89209182e-01]
 [7.82854855e-01 9.93990399e-01]
 [5.65879273e-01 3.32172939e-03]]
test_dt : 
 [[0.66535054 0.98327584]
 [0.90381109 0.9989211 ]
 [0.84308479 0.99535435]
 ...
 [0.66761381 0.9875966 ]
 [0.59205989 0.98264692]
 [0.35563127 0.99543396]]


In [50]:
train_dt = pd.DataFrame(train_dt, columns=['cat_prob','gau_prob'])
train_dt

Unnamed: 0,cat_prob,gau_prob
0,0.565879,9.889583e-01
1,0.958067,9.980679e-01
2,0.427372,1.100514e-11
3,0.860021,9.982731e-01
4,0.427372,9.799839e-01
...,...,...
39068,0.979338,9.830484e-01
39069,0.485696,2.587475e-01
39070,0.507427,9.892092e-01
39071,0.782855,9.939904e-01


In [60]:
model_g2 = GaussianNB()
model_g2.fit(train_dt, y_train)

model_g2.score(test_dt, y_test)

0.7934281912171154

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression

pipe = Pipeline(steps=[('model', SVR())])  # 임의 model로 pip를 초기화한다.
grid_params = [{'model':[SVR()],
                'model__kernel':['rbf'],
                'model__C':[0.1, 1.0, 10.0],
                'model__epsilon':[0.1, 1.0, 2.0]},
               
               {'model':[DecisionTreeRegressor()],
                'model__max_depth':[5, 10]},
               
                {'model':[LogisticRegression()],
                'model__penalty' : ['l2'],
                 'model__max_iter' : [100, 200]
               },
              ]

grid = GridSearchCV(estimator=pipe, param_grid=grid_params, cv=5, refit=True)
grid.fit(train_dt, y_train)

best_model = grid.best_estimator_

print("Best parameter = ", grid.best_params_)
print("Best test score = ", best_model.score(test_dt, y_test))

Best parameter =  {'model': LogisticRegression(), 'model__max_iter': 100, 'model__penalty': 'l2'}
Best test score =  0.794451837444979
