# Dataset

In [155]:
import pandas as pd
import scipy as sp
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn import svm
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
import matplotlib as plt

In [156]:
from sklearn.datasets import load_breast_cancer

#Preparing data
breast_cancer = load_breast_cancer()
data_play_tennis  = pd.read_csv('play_tennis.csv')

#convert breast cancer data to pandas dataframe
data_breast_cancer = pd.DataFrame(breast_cancer['data'], columns=breast_cancer['feature_names'])
data_breast_cancer['target'] = breast_cancer['target']

In [157]:
#Breast Cancer Data Head
data_breast_cancer.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [158]:
# Play Tennis Data Head
from sklearn.preprocessing import LabelEncoder
Le = LabelEncoder()

data_play_tennis = data_play_tennis.drop(columns=['day'])
data_play_tennis['outlook'] = Le.fit_transform(data_play_tennis['outlook'])
data_play_tennis['temp'] = Le.fit_transform(data_play_tennis['temp'])
data_play_tennis['humidity'] = Le.fit_transform(data_play_tennis['humidity'])
data_play_tennis['wind'] = Le.fit_transform(data_play_tennis['wind'])
data_play_tennis['play'] = Le.fit_transform(data_play_tennis['play'])
data_play_tennis.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1


# Split to Training and Test Data

In [159]:
#Split breast cancer to training and testing data
y1 = data_breast_cancer.target
x1 = data_breast_cancer.drop('target', axis = 1)

x1_train,x1_test,y1_train,y1_test=train_test_split(x1,y1,test_size=0.2)

print("shape of original dataset :", data_breast_cancer.shape)
print("shape of input - training set", x1_train.shape)
print("shape of output - training set", y1_train.shape)
print("shape of input - testing set", x1_test.shape)
print("shape of output - testing set", y1_test.shape)

shape of original dataset : (569, 31)
shape of input - training set (455, 30)
shape of output - training set (455,)
shape of input - testing set (114, 30)
shape of output - testing set (114,)


In [160]:
#Split play tennis to training and testing data
y2 = data_play_tennis.play
x2 = data_play_tennis.drop('play', axis = 1)

x2_train,x2_test,y2_train,y2_test=train_test_split(x2,y2,test_size=0.2)

print("shape of original dataset :", data_play_tennis.shape)
print("shape of input - training set", x2_train.shape)
print("shape of output - training set", y2_train.shape)
print("shape of input - testing set", x2_test.shape)
print("shape of output - testing set", y2_test.shape)

shape of original dataset : (14, 5)
shape of input - training set (11, 4)
shape of output - training set (11,)
shape of input - testing set (3, 4)
shape of output - testing set (3,)


# Decision Tree Classifier

## Breast Cancer

In [161]:
# Decision Tree Classifier - Breast Cancer
from sklearn import tree
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, export_text

clf = tree.DecisionTreeClassifier()
clf = clf.fit(x1_train, y1_train)
y_pred_1 = clf.predict(x1_test)

acc_dt_1 =metrics.accuracy_score(y1_test, y_pred_1)
f1_dt_1 = metrics.f1_score(y1_test, y_pred_1)

print("Accuracy:",acc_dt_1)
print("F1:",f1_dt_1)

#show graph
r = export_text(clf, feature_names=x1.columns.values.tolist())
print(r)

Accuracy: 0.9210526315789473
F1: 0.9323308270676692
|--- worst radius <= 16.81
|   |--- worst concave points <= 0.14
|   |   |--- radius error <= 0.63
|   |   |   |--- worst area <= 785.75
|   |   |   |   |--- class: 1
|   |   |   |--- worst area >  785.75
|   |   |   |   |--- worst perimeter <= 102.70
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- worst perimeter >  102.70
|   |   |   |   |   |--- concave points error <= 0.02
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- concave points error >  0.02
|   |   |   |   |   |   |--- class: 0
|   |   |--- radius error >  0.63
|   |   |   |--- symmetry error <= 0.03
|   |   |   |   |--- class: 0
|   |   |   |--- symmetry error >  0.03
|   |   |   |   |--- class: 1
|   |--- worst concave points >  0.14
|   |   |--- worst texture <= 25.67
|   |   |   |--- worst area <= 810.30
|   |   |   |   |--- mean concave points <= 0.08
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- mean concave points >  0.08
|   |   |   |

## Play Tennis

In [162]:
#Decision Tree Classifier - Play Tennis

clf = tree.DecisionTreeClassifier()
clf = clf.fit(x2_train, y2_train)
y_pred_2 = clf.predict(x2_test)

acc_dt_2 =metrics.accuracy_score(y2_test, y_pred_2)
f1_dt_2 = metrics.f1_score(y2_test, y_pred_2)

print("Accuracy:",acc_dt_2)
print("F1:",f1_dt_2)

#show graph
r = export_text(clf, feature_names=x2.columns.values.tolist())
print(r)

Accuracy: 0.6666666666666666
F1: 0.8
|--- humidity <= 0.50
|   |--- outlook <= 1.50
|   |   |--- wind <= 0.50
|   |   |   |--- outlook <= 0.50
|   |   |   |   |--- class: 1
|   |   |   |--- outlook >  0.50
|   |   |   |   |--- class: 0
|   |   |--- wind >  0.50
|   |   |   |--- class: 1
|   |--- outlook >  1.50
|   |   |--- class: 0
|--- humidity >  0.50
|   |--- class: 1



# ID3 Decision Tree

## Breast Cancer

In [163]:
# ID3 Decision tree  - Breast Cancer
from id3 import Id3Estimator, export_text

clf = Id3Estimator()
clf.fit(x1_train, y1_train, check_input=True)
y_pred_1 = clf.predict(x1_test)

acc_id3_1 =metrics.accuracy_score(y2_test, y_pred_2)
f1_id3_1 = metrics.f1_score(y2_test, y_pred_2)

print("Accuracy:",acc_id3_1)
print("F1:",f1_id3_1)

#show graph

print(export_text(clf.tree_, x1.columns.values.tolist()))

Accuracy: 0.6666666666666666
F1: 0.8

worst perimeter <=116.05
|   worst concave points <=0.11
|   |   mean radius <=14.98
|   |   |   radius error <=0.63: 1 (245) 
|   |   |   radius error >0.63
|   |   |   |   mean texture <=18.93: 0 (1) 
|   |   |   |   mean texture >18.93: 1 (1) 
|   |   mean radius >14.98
|   |   |   mean texture <=16.30: 1 (2) 
|   |   |   mean texture >16.30: 0 (3) 
|   worst concave points >0.11
|   |   worst texture <=25.74
|   |   |   worst radius <=15.45: 1 (18) 
|   |   |   worst radius >15.45
|   |   |   |   mean radius <=14.20: 0 (3) 
|   |   |   |   mean radius >14.20: 1 (8) 
|   |   worst texture >25.74
|   |   |   mean concave points <=0.05
|   |   |   |   compactness error <=0.02
|   |   |   |   |   mean area <=562.00: 1 (1) 
|   |   |   |   |   mean area >562.00: 0 (4) 
|   |   |   |   compactness error >0.02: 1 (10) 
|   |   |   mean concave points >0.05
|   |   |   |   perimeter error <=1.62
|   |   |   |   |   mean radius <=13.34: 0 (1) 
|   |   |

## Play Tennis

In [164]:
# ID3 Decision tree Estimator - Play Tennis
from id3 import Id3Estimator

clf = Id3Estimator()
clf.fit(x2_train, y2_train, check_input=True)
y_pred_2 = clf.predict(x2_test)

acc_id3_2 =metrics.accuracy_score(y2_test, y_pred_2)
f1_id3_2 = metrics.f1_score(y2_test, y_pred_2)

print("Accuracy:",acc_id3_2)
print("F1:",f1_id3_2)

#show graph

print(export_text(clf.tree_, x2.columns.values.tolist()))

Accuracy: 0.6666666666666666
F1: 0.8

humidity <=0.50
|   outlook <=1.50
|   |   wind <=0.50: 0 (1/1) 
|   |   wind >0.50: 1 (2) 
|   outlook >1.50: 0 (3) 
humidity >0.50: 1 (4) 



# Network Algorithm

## Breast Cancer

In [165]:
#Network Algorithm - Breast Cancer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


clf = MLPClassifier(random_state=1,max_iter=1000).fit(x1_train,y1_train)
clf.predict_proba(x1_train)
npredict = clf.predict(x1_test)

acc_na_1 =metrics.accuracy_score(y1_test, npredict)
f1_na_1 = metrics.f1_score(y1_test, npredict)

print("Accuracy:",acc_na_1)
print("F1:",f1_na_1)



Accuracy: 0.9385964912280702
F1: 0.9496402877697843


# Play Tennis

In [166]:
#Network Algorithm - Play Tennis

clf = MLPClassifier(random_state=1,max_iter=1000).fit(x2_train,y2_train)
clf.predict_proba(x2_train)
npredict = clf.predict(x2_test)

acc_na_2 =metrics.accuracy_score(y2_test, npredict)
f1_na_2 = metrics.f1_score(y2_test, npredict)

print("Accuracy:",acc_na_2)
print("F1:",f1_na_2)


Accuracy: 0.6666666666666666
F1: 0.8


# Logistixc Regression

## Breast Cancer

In [167]:
# Logistixc Regression Breast Cancer
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=10000).fit(x1_train, y1_train)
y_pred_1 = clf.predict(x1_test)

acc_lr_1 =metrics.accuracy_score(y1_test, y_pred_1)
f1_lr_1 = metrics.f1_score(y1_test, y_pred_1)

print("Accuracy:",acc_lr_1)
print("F1:",f1_lr_1)

Accuracy: 0.956140350877193
F1: 0.9645390070921985


## Play Tennis

In [168]:
# Logistixc Regression Play Tennis

clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=3000).fit(x2_train, y2_train)
y_pred_2 = clf.predict(x2_test)

acc_lr_2 =metrics.accuracy_score(y2_test, y_pred_2)
f1_lr_2 = metrics.f1_score(y2_test, y_pred_2)

print("Accuracy:",acc_lr_2)
print("F1:",f1_lr_2)


Accuracy: 0.6666666666666666
F1: 0.8


# K Means

## Breast Cancer

In [175]:
#K Means - Breast Cancer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

kmeans = KMeans(2)
kmeans.fit_predict(x1_train,y1_train)
kpredict = kmeans.predict(x1_test)

acc_km_1 = metrics.accuracy_score(y1_test, kpredict)
f1_km_1 = metrics.f1_score(y1_test, kpredict)

print("Accuracy:",acc_km_1)
print("F1:",f1_km_1)




Accuracy: 0.868421052631579
F1: 0.9019607843137255


## Play Tennis

In [176]:
#K Means - Play Tennis

kmeans = KMeans(2)
kmeans.fit_predict(x2_train,y2_train)
kpredict = kmeans.predict(x2_test)

acc_km_2 =metrics.accuracy_score(y2_test, kpredict)
f1_km_2 = metrics.f1_score(y2_test, kpredict)

print("Accuracy:",acc_km_2)
print("F1:",f1_km_2)



Accuracy: 0.3333333333333333
F1: 0.5


# SVC

## Breast Cancer

In [177]:
#SVC - Breast Cancer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(x1_train,y1_train)
spredict = clf.predict(x1_test)

acc_svc_1 =metrics.accuracy_score(y1_test, spredict)
f1_svc_1 = metrics.f1_score(y1_test, spredict)

print("Accuracy:",acc_svc_1)
print("F1:",f1_svc_1)


Accuracy: 0.9736842105263158
F1: 0.9781021897810219


## Play Tennis

In [178]:
#SVC - Play Tennis
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(x2_train,y2_train)
spredict = clf.predict(x2_test)

acc_svc_2 =metrics.accuracy_score(y2_test, spredict)
f1_svc_2 = metrics.f1_score(y2_test, spredict)

print("Accuracy:",acc_svc_2)
print("F1:",f1_svc_2)


Accuracy: 0.6666666666666666
F1: 0.8


# Analysis & Comparison

## Breast Cancer 

In [179]:
algo = ['DecisionTreeClassifier', 'Id3Estimator', 'K Means', 'Logistic Regression', 'Neural_network', 'SVM']
acc = [acc_dt_1, acc_id3_1, acc_km_1, acc_lr_1, acc_na_1, acc_svc_1]
f1 = [f1_dt_1, f1_id3_1, f1_km_1, f1_lr_1, f1_na_1, f1_svc_1]
comparison = {'Accuracy': acc, 'F1': f1}

comparison_tab = pd.DataFrame(data=comparison, index=algo)
comparison_tab

Unnamed: 0,Accuracy,F1
DecisionTreeClassifier,0.921053,0.932331
Id3Estimator,0.666667,0.8
K Means,0.868421,0.901961
Logistic Regression,0.95614,0.964539
Neural_network,0.938596,0.94964
SVM,0.973684,0.978102


## PlayTennis

In [180]:
algo = ['DecisionTreeClassifier', 'Id3Estimator', 'K Means', 'Logistic Regression', 'Neural_network', 'SVM']
acc = [acc_dt_2, acc_id3_2, acc_km_2, acc_lr_2, acc_na_2, acc_svc_2]
f1 = [f1_dt_2, f1_id3_2, f1_km_2, f1_lr_2, f1_na_2, f1_svc_2]
comparison = {'Accuracy': acc, 'F1': f1}

comparison_tab = pd.DataFrame(data=comparison, index=algo)
comparison_tab

Unnamed: 0,Accuracy,F1
DecisionTreeClassifier,0.666667,0.8
Id3Estimator,0.666667,0.8
K Means,0.333333,0.5
Logistic Regression,0.666667,0.8
Neural_network,0.666667,0.8
SVM,0.666667,0.8
