In [1]:
import requests, zipfile
import io
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import sklearn

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
res = requests.get(url).content

mushroom = pd.read_csv(io.StringIO(res.decode('utf-8')), header = None)

mushroom.columns = ['classes', 'cap_shape','cap_surface','cap_color','odor','bruises','gill_attachment','gill_spacing','gill_size','gill_color','stalk_shape','stalk_root',
                                     'stalk_surface_above_ring','stalk_surface_below_ring',
                                 'stalck_color_above_ring','stalk_color_below_ring','veil_type','veil_color','ring_number','ring_type','spore_print_color','population','habitat']

mushroom

Unnamed: 0,classes,cap_shape,cap_surface,cap_color,odor,bruises,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalck_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

mushroom_dummy = pd.get_dummies(mushroom[['gill_color','gill_attachment','odor','cap_color']])
mushroom_dummy['flg'] = mushroom['classes'].map(lambda x : 1 if x =='p' else 0)
X = mushroom_dummy.drop('flg', axis=1)
Y = mushroom_dummy['flg']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.5, random_state=12)

model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=12)
model.fit(X_train, Y_train)

print('정확도(train) : {:.3f}'.format(model.score(X_train,Y_train)))
print('정확도(test) : {:.3f}'.format(model.score(X_test, Y_test)))

정확도(train) : 0.914
정확도(test) : 0.907


In [3]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

X = cancer['data']
Y = cancer['target']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.5, random_state=12)

model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=12)
model.fit(X_train,Y_train)

print('정확도(train) : {:.3f}'.format(model.score(X_train,Y_train)))
print('정확도(test) : {:.3f}'.format(model.score(X_test,Y_test)))

정확도(train) : 1.000
정확도(test) : 0.937


In [5]:
from sklearn.datasets import load_iris

iris = load_iris()

X_train,X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, stratify=iris.target, random_state=12)

model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=12)
model.fit(X_train, Y_train)

print('정확도(train) : {:.3f}'.format(model.score(X_train,Y_train)))
print('정확도(test) : {:.3f}'.format(model.score(X_test,Y_test)))

정확도(train) : 1.000
정확도(test) : 0.947
