In [1]:
import pandas as pd
import numpy as np
from IPython.display import display_html
%matplotlib inline

In [2]:

def discretization(data):
    print("Discretizing 'Clump Thickness' attributes of the breast cancer dataset visualizing distribution of attribute value")
    print(data['Clump Thickness'].value_counts(sort=False))
    print("For the equal width method, we can apply the cut() function to discretize the attribute into 4 bins of similar interview widths.")
    print("The value_counts() function can be used to determine the number of instances in each bin.")
    bins = pd.cut(data['Clump Thickness'], 4)
    print(bins.value_counts(sort=False))
    print("For the equal frequency method, the qcut() function can be used to partition the values into 4 bins such that each bin has nearly the same number of instances.")
    bins = pd.qcut(data['Clump Thickness'], 4)
    print(bins.value_counts(sort=False))

In [3]:
def sampling(data):
    print("Displaying the first five records of the table without sampling.")
    display_html(data.head())
    print("A sample of size 3 is randomly selected (without replacement) from the original data.")
    sample = data.sample(n=3)
    display_html(sample)
    print("Randomly select 1% of the data (without replacement) and display the selected samples.")
    sample = data.sample(frac=0.01, random_state=1)
    display_html(sample)
    print("A sampling with replacement to create a sample whose size is equal to 1% of the entire data.")
    sample = data.sample(frac=0.01, replace=True, random_state=1)
    display_html(sample)

In [4]:
def remove_duplicate(data):
    dups = data.duplicated()
    print(f"Number of duplicated rows = {dups.sum()}")
    print(f"Number of rows before discarding duplicates: {data.shape[0]}")
    
    # Displaying rows with index 11 and 28
    print("Rows with index 11 and 28:")
    print(data.loc[[11, 28]])
    
    data2 = data.drop_duplicates()
    print(f"Number of rows after discarding duplicates = {data2.shape[0]}")

In [5]:
def outlier(data):
    data2 = data.drop(['Class'], axis=1)
    data2["Bare Nuclei"] = pd.to_numeric(data2["Bare Nuclei"])
    Z = (data2 - data2.mean()) / data2.std()
    Z_slice = Z[20:25]  # Store the slice of Z for display
    print("Slice of Z from row 20 to 25:")
    print(Z_slice)
    
    print(f"Number of rows before discarding outliers = {Z.shape[0]}")
    
    # Filtering rows where all values are within the range [-3, 3]
    Z2 = Z[((Z > -3).sum(axis=1) == 9) & ((Z <= 3).sum(axis=1) == 9)]
    print(f"Number of rows after discarding outliers = {Z2.shape[0]}")


In [6]:
def remove_missing(data):
    print(f"Number of rows in original data: {data.shape[0]}")
    data = data.dropna()
    print(f"Number of rows after discarding missing values: {data.shape[0]}")

In [7]:
def replace_missing_value_by_median(data):
    data_copy = data.copy()  # Create a copy of the DataFrame
    data2 = data_copy['Bare Nuclei']
    print("Before replacing missing values:")
    print(data2[20:25])
    median_value = data2.median()  # Calculate the median
    data2 = data2.fillna(median_value)  # Replace missing values with median
    print("After replacing missing values by median:")
    print(data2[20:25])

In [8]:
def noise_handle(data):
    data = data.drop(['Sample code'], axis=1)
    
    data = data.replace('?', np.NaN)
    
    print(f"Number of instances = {data.shape[0]}")
    print(f"Number of attributes = {data.shape[1]}")
    print("Number of missing values: ")
    for col in data.columns:
        print('\t%s: %d' % (col, data[col].isna().sum()))
    
    print("To further preprocess, select an option:\n"
          "0. Exit\n"
          "1. Replace missing value by median\n"
          "2. Remove missing value\n"
          "3. Handle outlier\n"
          "4. Remove duplicate\n"
          "5. Sampling\n"
          "6. Discretization:")
    
    option = int(input())
    while option != 0:
        if option == 1:
            replace_missing_value_by_median(data) 
        elif option == 2:
            remove_missing(data)
        elif option == 4:
            remove_duplicate(data)
        elif option == 5:
            sampling(data)
        elif option == 6:
            discretization(data)
        else:
            print("Enter correct choice")
        
        print("Select your option again:")
        option = int(input())

In [9]:
def view(data):
    print("First five rows of the dataframe:")
    display(data.head())
    print(f"Number of instances = {data.shape[0]}")
    print(f"Number of attributes = {data.shape[1]}")

In [10]:
def main():
    data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",header=None)
    data.columns = ['Sample code','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
    print("Do you want to view data?")
    response = input().lower()
    if response == 'yes':
        view(data)
        
    print("Do you want to remove noise and further preprocess data?")
    response = input().lower()
    if response == 'yes':
        noise_handle(data)

In [None]:
main()

Do you want to view data?
yes
First five rows of the dataframe:


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


Number of instances = 699
Number of attributes = 11
Do you want to remove noise and further preprocess data?
yes
Number of instances = 699
Number of attributes = 10
Number of missing values: 
	Clump Thickness: 0
	Uniformity of Cell Size: 0
	Uniformity of Cell Shape: 0
	Marginal Adhesion: 0
	Single Epithelial Cell Size: 0
	Bare Nuclei: 16
	Bland Chromatin: 0
	Normal Nucleoli: 0
	Mitoses: 0
	Class: 0
To further preprocess, select an option:
0. Exit
1. Replace missing value by median
2. Remove missing value
3. Handle outlier
4. Remove duplicate
5. Sampling
6. Discretization:
1
Before replacing missing values:
20     10
21      7
22      1
23    NaN
24      1
Name: Bare Nuclei, dtype: object
After replacing missing values by median:
20     10
21      7
22      1
23    1.0
24      1
Name: Bare Nuclei, dtype: object
Select your option again:
