In [1]:
import pandas as pd
from IPython.display import display_html

def discretization(data):
    """Demonstrate equal-width and equal-frequency binning of 'Clump Thickness'.

    Prints the raw value counts of the column, then the number of instances
    per bin after splitting the column into 4 bins with ``pd.cut`` (equal
    interval width) and with ``pd.qcut`` (roughly equal instance count).

    Args:
        data: DataFrame containing a 'Clump Thickness' column.

    Returns:
        None. All results are written to standard output.
    """
    n_bins = 4

    print("Discretizing 'Clump Thickness' attribute of the breast cancer dataset\nVisualizing distribution of attribute value")
    thickness = data['Clump Thickness']
    # sort=False: do not reorder the counts by frequency.
    print(thickness.value_counts(sort=False))

    # Equal width: every bin spans the same range of values.
    print("For the equal width method, we can apply the cut() function to discretize the attribute\ninto 4 bins of similar interval widths.")
    print("The value_counts() function can be used to determine the number of instances in each bin.")
    equal_width = pd.cut(thickness, bins=n_bins)
    print(equal_width.value_counts(sort=False))

    # Equal frequency: bin edges are quantiles, so bin sizes are similar.
    print("For the equal frequency method, the qcut() function can be used to partition the values into 4 bins such that each bin has nearly the same number of instances.")
    equal_frequency = pd.qcut(thickness, q=n_bins)
    print(equal_frequency.value_counts(sort=False))

def sampling(data):
    """Demonstrate random sampling of rows from ``data``.

    Displays, in order: the first five rows, a 3-row sample, a 1% sample
    drawn without replacement, and a 1% sample drawn with replacement.
    Each table is preceded by a printed description.

    Args:
        data: DataFrame to sample from.

    Returns:
        None. Output goes to stdout and to ``display_html``.
    """
    one_percent = 0.01
    seed = 1

    print("Displaying the first five records of the table Without Sampling.")
    display_html(data.head())

    # No random_state here, so this draw differs from run to run.
    print("A sample of size 3 is randomly selected (without replacement) from the original data.")
    display_html(data.sample(n=3))

    # The two fractional draws below are seeded and therefore repeatable.
    print("Randomly select 1% of the data (without replacement) and display the selected samples.")
    display_html(data.sample(frac=one_percent, random_state=seed))

    print("A sampling with replacement to create a sample whose size is equal to 1% of the entire data.")
    display_html(data.sample(frac=one_percent, replace=True, random_state=seed))

def main():
    """Load the breast cancer data and run the interactive demo menu.

    Reads the headerless CSV from the UCI repository URL, assigns the eleven
    column names, optionally displays the first rows, then prompts for a menu
    option repeatedly until 0 is entered.

    A menu entry that is not an integer is answered with
    "Enter correct choice" and the menu is shown again, rather than ending
    the session with an uncaught ``ValueError``.
    """
    data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
    data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

    print("Do you want to view data?")
    response = input()
    # Strip surrounding whitespace so answers such as " yes" are accepted.
    if response.strip().lower() == 'yes':
        display_html(data.head())

    while True:
        print("Select your option:\n1. Sampling\n2. Discretization\n0. Exit")
        try:
            option = int(input())
        except ValueError:
            # Non-numeric input is treated like any other invalid choice.
            print("Enter correct choice")
            continue

        if option == 0:
            break
        if option == 1:
            sampling(data)
        elif option == 2:
            discretization(data)
        else:
            print("Enter correct choice")

# Start the interactive demo only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()


Do you want to view data?


 yes


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


Select your option:
1. Sampling
2. Discretization
0. Exit


 1


Displaying the first five records of the table Without Sampling.


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


A sample of size 3 is randomly selected (without replacement) from the original data.


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
379,685977,5,3,4,1,4,1,3,1,1,2
589,1272166,5,1,1,1,2,1,1,1,1,2
352,846832,3,4,5,3,7,3,4,6,1,2


Randomly select 1% of the data (without replacement) and display the selected samples.


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
584,1217717,5,1,1,6,3,1,1,1,1,2
417,1239967,1,1,1,1,2,1,2,1,1,2
606,353098,4,1,1,2,2,1,1,1,1,2
349,832567,4,2,3,5,3,8,7,6,1,4
134,1180831,3,1,1,1,3,1,2,1,1,2
502,1253917,4,1,1,2,2,1,2,1,1,2
117,1173509,4,5,5,10,4,10,7,5,8,4


A sampling with replacement to create a sample whose size is equal to 1% of the entire data.


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
37,1081791,6,2,1,1,1,1,7,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
72,1124651,1,3,3,2,2,1,7,2,1,2
645,1303489,3,1,1,1,2,1,2,1,1,2
144,1184241,2,1,1,1,2,1,2,1,1,2
129,1177512,1,1,1,1,10,1,1,1,1,2
583,1115762,3,1,1,1,2,1,1,1,1,2


Select your option:
1. Sampling
2. Discretization
0. Exit


 2


Discretizing 'Clump Thickness' attribute of the breast cancer dataset
Visualizing distribution of attribute value
Clump Thickness
5     130
3     108
6      34
4      80
8      46
1     145
2      50
7      23
10     69
9      14
Name: count, dtype: int64
For the equal width method, we can apply the cut() function to discretize the attribute
into 4 bins of similar interval widths.
The value_counts() function can be used to determine the number of instances in each bin.
Clump Thickness
(0.991, 3.25]    303
(3.25, 5.5]      210
(5.5, 7.75]       57
(7.75, 10.0]     129
Name: count, dtype: int64
For the equal frequency method, the qcut() function can be used to partition the values into 4 bins such that each bin has nearly the same number of instances.
Clump Thickness
(0.999, 2.0]    195
(2.0, 4.0]      188
(4.0, 6.0]      164
(6.0, 10.0]     152
Name: count, dtype: int64
Select your option:
1. Sampling
2. Discretization
0. Exit


 0
