Podziel zbiór data_breast_cancer na uczący i testujący w proporcjach 80:20

In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

data_breast_cancer = load_breast_cancer(as_frame=True)

Dzielimy na zbiór uczący i testowy (pod uwagę bierzemy cechy 'mean texture', 'mean symmetry' )

In [64]:
X_train, X_test, y_train, y_test = train_test_split(data_breast_cancer['data'][['mean texture', 'mean symmetry']],
                                                    data_breast_cancer['target'],
                                                    test_size=.2)

In [65]:
from sklearn.tree import DecisionTreeClassifier

dec_tree_clf = DecisionTreeClassifier()
dec_tree_clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [66]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [67]:
from sklearn.neighbors import KNeighborsClassifier

knn_reg = KNeighborsClassifier()
knn_reg.fit(X_train, y_train)

KNeighborsClassifier()

In [68]:
from sklearn.ensemble import VotingClassifier

voting_hard_clf = VotingClassifier(
    estimators=[
        ('dec_tree', dec_tree_clf),
        ('log_reg', log_reg),
        ('knn_clf', knn_reg)],
    voting='hard'
)

voting_hard_clf.fit(X_train, y_train)

voting_soft_clf = VotingClassifier(
    estimators=[
        ('dec_tree', dec_tree_clf),
        ('log_reg', log_reg),
        ('knn_clf', knn_reg)],
    voting='soft'
)
voting_soft_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('dec_tree', DecisionTreeClassifier()),
                             ('log_reg', LogisticRegression()),
                             ('knn_clf', KNeighborsClassifier())],
                 voting='soft')

In [69]:
acc_vote = [(dec_tree_clf.score(X_train, y_train), dec_tree_clf.score(X_test, y_test)),
            (log_reg.score(X_train, y_train), log_reg.score(X_test, y_test)),
            (knn_reg.score(X_train, y_train), knn_reg.score(X_test, y_test)),
            (voting_hard_clf.score(X_train, y_train), voting_hard_clf.score(X_test, y_test)),
            (voting_soft_clf.score(X_train, y_train), voting_soft_clf.score(X_test, y_test))]

piklowanie acc_vote

In [70]:
import pickle

with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(acc_vote, f)

In [71]:
with open('acc_vote.pkl', 'rb') as f:
    print(pickle.load(f))

[(1.0, 0.6842105263157895), (0.7186813186813187, 0.7280701754385965), (0.7692307692307693, 0.6754385964912281), (0.832967032967033, 0.7105263157894737), (0.9648351648351648, 0.6929824561403509)]


piklowanie klasyfikatorów

In [72]:
vote = [dec_tree_clf,
        log_reg,
        knn_reg,
        voting_hard_clf,
        voting_soft_clf]

with open('vote.pkl', 'wb') as f:
    pickle.dump(vote, f)

In [73]:
with open('vote.pkl', 'rb') as f:
    print(pickle.load(f))

[DecisionTreeClassifier(), LogisticRegression(), KNeighborsClassifier(), VotingClassifier(estimators=[('dec_tree', DecisionTreeClassifier()),
                             ('log_reg', LogisticRegression()),
                             ('knn_clf', KNeighborsClassifier())]), VotingClassifier(estimators=[('dec_tree', DecisionTreeClassifier()),
                             ('log_reg', LogisticRegression()),
                             ('knn_clf', KNeighborsClassifier())],
                 voting='soft')]


Wykonaj na zbiorze uczącym wykorzystując 30 drzew decyzyjnych

In [74]:
from sklearn.ensemble import BaggingClassifier

bagging_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=30)
bagging_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30)

In [75]:
bagging_50_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                   n_estimators=30,
                                   max_samples=0.5)
bagging_50_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30)

In [76]:
pasting_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=30,
                                bootstrap=False)
pasting_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30)

In [77]:
pasting_50_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                   n_estimators=30,
                                   max_samples=0.5,
                                   bootstrap=False)
pasting_50_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30)

In [78]:
from sklearn.ensemble import RandomForestClassifier

ran_for_clf = RandomForestClassifier(n_estimators=30)
ran_for_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=30)

In [79]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(n_estimators=30)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=30)

In [80]:
from sklearn.ensemble import GradientBoostingClassifier

grad_clf = GradientBoostingClassifier(n_estimators=30)
grad_clf.fit(X_train, y_train)

GradientBoostingClassifier(n_estimators=30)

In [81]:
acc_bag = [(bagging_clf.score(X_train, y_train), bagging_clf.score(X_test, y_test)),
           (bagging_50_clf.score(X_train, y_train), bagging_50_clf.score(X_test, y_test)),
           (pasting_clf.score(X_train, y_train), pasting_clf.score(X_test, y_test)),
           (pasting_50_clf.score(X_train, y_train), pasting_50_clf.score(X_test, y_test)),
           (ran_for_clf.score(X_train, y_train), ran_for_clf.score(X_test, y_test)),
           (ada_clf.score(X_train, y_train), ada_clf.score(X_test, y_test)),
           (grad_clf.score(X_train, y_train), grad_clf.score(X_test, y_test))]

acc_bag

[(0.9978021978021978, 0.7192982456140351),
 (0.9362637362637363, 0.7456140350877193),
 (1.0, 0.6754385964912281),
 (0.9560439560439561, 0.7456140350877193),
 (0.9978021978021978, 0.7543859649122807),
 (0.7978021978021979, 0.7456140350877193),
 (0.8307692307692308, 0.7719298245614035)]

In [82]:
with open("acc_bag.pkl", 'wb') as f:
    pickle.dump(acc_bag,f)
with open("acc_bag.pkl", 'rb') as f:
    print(pickle.load(f))

[(0.9978021978021978, 0.7192982456140351), (0.9362637362637363, 0.7456140350877193), (1.0, 0.6754385964912281), (0.9560439560439561, 0.7456140350877193), (0.9978021978021978, 0.7543859649122807), (0.7978021978021979, 0.7456140350877193), (0.8307692307692308, 0.7719298245614035)]


In [83]:
bag = [bagging_clf, bagging_50_clf, pasting_clf, pasting_50_clf, ran_for_clf, ada_clf, grad_clf]
bag

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                   n_estimators=30),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                   n_estimators=30),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                   max_samples=0.5, n_estimators=30),
 RandomForestClassifier(n_estimators=30),
 AdaBoostClassifier(n_estimators=30),
 GradientBoostingClassifier(n_estimators=30)]

In [84]:
with open('bag.pkl', 'wb') as f:
    pickle.dump(bag, f)

with open('bag.pkl', 'rb') as f:
    print(pickle.load(f))

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


Przeprowadź sampling 2 cech z wszystkich dostepnych bez powtórzeń z wykorzystaniem 30 drzew decyzyjnych, wybierz połowę instancji dla każdego z drzew z powtórzeniami.

In [85]:
X_train, X_test, y_train, y_test = train_test_split(data_breast_cancer['data'],
                                                    data_breast_cancer['target'], test_size=.2, random_state=5)

In [86]:
bag_2_features = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30, max_features=2,
                                   bootstrap_features=False,
                                   bootstrap=True, max_samples=.5,random_state=25)
bag_2_features.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=2,
                  max_samples=0.5, n_estimators=30, random_state=25)

Zapisz dokładności ww estymatora listę : dokładność_dla_zb_uczącego, dokładność_dla_zb_testującego w pliku Pickle acc_fea.pkl.


In [87]:
acc_fea = [bag_2_features.score(X_train, y_train), bag_2_features.score(X_test, y_test)]
acc_fea

[0.9956043956043956, 0.9473684210526315]

In [88]:
with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(acc_fea, f)

with open('acc_fea.pkl', 'rb') as f:
    print(pickle.load(f))

[0.9956043956043956, 0.9473684210526315]


Zapisz klasyfikator jako jednoelementową listę w pliku Pickle o nazwie fea.pkl

In [89]:
with open('fea.pkl', 'wb') as f:
    pickle.dump(bag_2_features, f)

with open('fea.pkl', 'rb') as f:
    print(pickle.load(f))

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=2,
                  max_samples=0.5, n_estimators=30, random_state=25)


Sprawdź, które cechy dają najwięszą dokładność. Dostęp do poszczególnych estymatorów,
aby obliczyć dokładność, możesz uzyskać za pmocą: BaggingClasifier.estimators_,
cechy wybrane przez sampling dla każdego z estymatorów znajdziesz w:
BaggingClassifier.estimators_features_. Zbuduj ranking estymatorów jako DataFrame,
który będzie mieć w kolejnych kolumnach: dokładność dla zb. uczącego, dokładnośc dla zb.
testującego, lista nazw cech. Każdy wiersz to informacje o jednym estymatorze. DataFrame
posortuj malejąco po wartościach dokładności dla zbioru testującego i uczącego oraz zapisz
w pliku Pickle o nazwie acc_fea_rank.pkl

In [90]:
features_array = []
for features in bag_2_features.estimators_features_:
    features_array.append([data_breast_cancer["feature_names"][features[0]],
                           data_breast_cancer["feature_names"][features[1]]])

In [91]:
acc_fea_rank = []
for estimator, features in zip(bag_2_features.estimators_, features_array):
    acc_fea_rank.append(
        [estimator.score(X_train[features], y_train), estimator.score(X_test[features], y_test), features])



In [92]:
df_rank = pd.DataFrame(acc_fea_rank)
df_rank.sort_values(inplace=True, by=[0, 1], ascending=False)
df_rank

Unnamed: 0,0,1,2
6,0.96044,0.95614,"[worst area, worst smoothness]"
19,0.956044,0.894737,"[worst concave points, mean perimeter]"
17,0.947253,0.912281,"[mean perimeter, worst smoothness]"
14,0.931868,0.912281,"[worst perimeter, mean perimeter]"
8,0.931868,0.859649,"[perimeter error, mean concave points]"
28,0.92967,0.912281,"[area error, worst perimeter]"
25,0.925275,0.885965,"[area error, worst compactness]"
4,0.923077,0.912281,"[mean concave points, worst symmetry]"
10,0.923077,0.894737,"[worst area, mean smoothness]"
9,0.920879,0.824561,"[worst smoothness, mean concave points]"


In [93]:
df_rank.to_pickle("acc_fea_rank.pkl")