In [22]:
from sklearn import datasets

# Load scikit-learn's bundled breast-cancer dataset; later cells read its
# `.data`, `.target` and `.feature_names` attributes.
data_breast_cancer = datasets.load_breast_cancer()

In [23]:
from sklearn.model_selection import train_test_split

# Restrict the design matrix to feature columns 1 and 8; the targets are used as-is.
feature_columns = [1, 8]
X = data_breast_cancer.data[:, feature_columns]
y = data_breast_cancer.target

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [24]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Base classifiers shared by the hard- and soft-voting ensembles.
# The tree is seeded like every other stochastic estimator in this notebook so
# that the reported and pickled accuracies are reproducible across runs.
tree_clf = DecisionTreeClassifier(random_state=42)
log_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()

# Hard voting: the ensemble combines the three base classifiers' predicted labels.
voting_clf_hard = VotingClassifier(
    estimators=[('tr', tree_clf), ('lr', log_clf), ('knn', knn_clf)],
    voting='hard',
)

voting_clf_hard.fit(X_train, y_train)
y_pred = voting_clf_hard.predict(X_test)
accuracy_hard = accuracy_score(y_test, y_pred)
print(accuracy_hard)

0.6929824561403509


In [25]:
# Soft-voting ensemble built from the same three base classifiers.
base_estimators = [('tr', tree_clf), ('lr', log_clf), ('knn', knn_clf)]
voting_clf_soft = VotingClassifier(estimators=base_estimators, voting='soft')
voting_clf_soft.fit(X_train, y_train)

# Accuracy of the soft-voting ensemble on the held-out split.
y_pred = voting_clf_soft.predict(X_test)
accuracy_soft = accuracy_score(y_test, y_pred)
print(accuracy_soft)

0.6754385964912281


In [26]:
# Refit each classifier and collect its (train accuracy, test accuracy) pair,
# in the order: tree, logistic regression, k-NN, hard voting, soft voting.
result = []
for clf in (tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft):
    clf.fit(X_train, y_train)
    train_acc, test_acc = clf.score(X_train, y_train), clf.score(X_test, y_test)
    result.append((train_acc, test_acc))

print(result)

[(1.0, 0.6140350877192983), (0.7230769230769231, 0.7017543859649122), (0.7714285714285715, 0.6403508771929824), (0.8351648351648352, 0.6929824561403509), (0.9648351648351648, 0.6666666666666666)]


In [27]:
import pickle

# Persist the list of (train accuracy, test accuracy) pairs from the voting experiment.
with open("acc_vote.pkl", "wb") as acc_file:
    pickle.dump(result, acc_file)

In [28]:
# Fitted classifiers, listed in the same order used when their accuracies were collected.
vote = [
    tree_clf,
    log_clf,
    knn_clf,
    voting_clf_hard,
    voting_clf_soft,
]

with open("vote.pkl", "wb") as models_file:
    pickle.dump(vote, models_file)

In [29]:
# Accumulator for the (train accuracy, test accuracy) pairs appended by the
# ensemble cells below; re-run this cell first when re-running them, otherwise
# duplicate entries pile up.
results2 = list()

In [30]:
data = datasets.load_breast_cancer()

# The train/test split created earlier in the notebook is reused here;
# no new split is made for the bagging experiments.

# Bagging: 30 decision trees, default sampling settings.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

train_acc, test_acc = bag_clf.score(X_train, y_train), bag_clf.score(X_test, y_test)
results2.append((train_acc, test_acc))

In [31]:
# Bagging: 30 decision trees, each trained on a sample half the size of the training set.
bag_clf_50 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=0.5,
    random_state=42,
)
bag_clf_50.fit(X_train, y_train)

train_acc, test_acc = bag_clf_50.score(X_train, y_train), bag_clf_50.score(X_test, y_test)
results2.append((train_acc, test_acc))

In [32]:
# Same ensemble as above but with bootstrap sampling switched off.
pas_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    bootstrap=False,
    random_state=42,
)
pas_clf.fit(X_train, y_train)

train_acc, test_acc = pas_clf.score(X_train, y_train), pas_clf.score(X_test, y_test)
results2.append((train_acc, test_acc))

In [33]:
# Bootstrap switched off, each tree trained on a sample half the size of the training set.
pas_clf_50 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    bootstrap=False,
    max_samples=0.5,
    random_state=42,
)
pas_clf_50.fit(X_train, y_train)

train_acc, test_acc = pas_clf_50.score(X_train, y_train), pas_clf_50.score(X_test, y_test)
results2.append((train_acc, test_acc))

In [34]:
# Random forest with 30 trees.
rnd_clf = RandomForestClassifier(n_estimators=30, random_state=42)
rnd_clf.fit(X_train, y_train)

train_acc, test_acc = rnd_clf.score(X_train, y_train), rnd_clf.score(X_test, y_test)
results2.append((train_acc, test_acc))

In [35]:
# AdaBoost with 30 boosting rounds.
ada_clf = AdaBoostClassifier(n_estimators=30, random_state=42)
ada_clf.fit(X_train, y_train)

train_acc, test_acc = ada_clf.score(X_train, y_train), ada_clf.score(X_test, y_test)
results2.append((train_acc, test_acc))



In [36]:
# Gradient boosting with 30 boosting stages.
gbrt = GradientBoostingClassifier(n_estimators=30, random_state=42)
gbrt.fit(X_train, y_train)

train_acc, test_acc = gbrt.score(X_train, y_train), gbrt.score(X_test, y_test)
results2.append((train_acc, test_acc))

In [37]:
# Persist the accumulated (train accuracy, test accuracy) pairs of the ensemble experiments.
with open("acc_bag.pkl", "wb") as acc_file:
    pickle.dump(results2, acc_file)

In [38]:
# Fitted ensembles, in the order in which their accuracies were appended to `results2`.
classifiers = [
    bag_clf,
    bag_clf_50,
    pas_clf,
    pas_clf_50,
    rnd_clf,
    ada_clf,
    gbrt,
]
with open("bag.pkl", "wb") as models_file:
    pickle.dump(classifiers, models_file)

In [39]:
import numpy as np

# From here on the notebook works with the full feature matrix; note that this
# rebinds X / y and the train/test split used by the cells above.
X = data_breast_cancer.data
y = data_breast_cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pick two distinct feature columns from a seeded generator so that the chosen
# columns, and every result derived from them, are identical on each run.
rng = np.random.default_rng(42)
selected_features = rng.choice(X_train.shape[1], size=2, replace=False)
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# NOTE(review): all 30 estimators share the same two columns chosen above. If the
# intent was a different feature pair per estimator, fitting on X_train with
# `max_features=2` would do that instead -- confirm against the task statement.
bagging_clf = BaggingClassifier(n_estimators=30, max_samples=0.5, random_state=42)
bagging_clf.fit(X_train_selected, y_train)
train_acc = bagging_clf.score(X_train_selected, y_train)
test_acc = bagging_clf.score(X_test_selected, y_test)

# [train accuracy, test accuracy] of the ensemble on the two selected columns.
results3 = [train_acc, test_acc]
print(results3)

[0.9472527472527472, 0.8421052631578947]


In [40]:
# Persist the [train accuracy, test accuracy] list of the two-feature bagging model.
with open("acc_fea.pkl", "wb") as acc_file:
    pickle.dump(results3, acc_file)

In [41]:
# Persist the fitted two-feature bagging model, wrapped in a one-element list.
fitted_models = [bagging_clf]
with open("fea.pkl", "wb") as model_file:
    pickle.dump(fitted_models, model_file)

In [42]:
import pandas as pd

# Rank the individual estimators of `bagging_clf` by accuracy.
#
# The estimators are scored on the SAME two columns the ensemble was fitted on
# (`X_train_selected` / `X_test_selected`); drawing a fresh random feature pair
# here would evaluate them on data they were never trained on.
results = []
for estimator, feature_idx in zip(bagging_clf.estimators_, bagging_clf.estimators_features_):
    # `feature_idx` holds column positions within the matrix passed to
    # `bagging_clf.fit`; index with it so each estimator sees its columns in
    # the order it was trained on.
    train_accuracy = estimator.score(X_train_selected[:, feature_idx], y_train)
    test_accuracy = estimator.score(X_test_selected[:, feature_idx], y_test)

    # Map positions in the 2-column matrix back to original dataset columns
    # before looking up the feature names.
    feature_names = [data_breast_cancer.feature_names[selected_features[f]] for f in feature_idx]

    results.append((train_accuracy, test_accuracy, feature_names))

df_results = pd.DataFrame(results, columns=['Dokładność dla zb. uczącego', 'Dokładność dla zb. testującego', 'Wybrane cechy'])

# Best test accuracy first; ties are broken by train accuracy.
df_results_sorted = df_results.sort_values(by=['Dokładność dla zb. testującego', 'Dokładność dla zb. uczącego'], ascending=False)

with open('acc_fea_rank.pkl', 'wb') as f:
    pickle.dump(df_results_sorted, f)
