In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

### Data Cleaning functions

In [85]:
def read_data(file_path, column_names):
    return pd.read_csv(file_path, names = column_names)

def inital_data_info(data):
    '''
    Intial Data Check
    -> Identifies any missing values
    -> Helps understand the basic characteristics of raw data
    '''
    print("Initial data info:")
    print(data.info())
    print("\nMissing values:")
    print(data.isnull().sum())

def replace_question_mark_with_NaN(data):
    '''
    Replace all '?' with NaN
    -> Missing values are represented as '?' in the dataset
    -> Convert them to NaN to handle them in the next step
    -> Provides a clear output of actual missing values after conversion
    '''
    data = data.replace('?', np.nan)
    print("\nMissing values after replacing '?':")
    print(data.isnull().sum())

def handle_missing_values_using_mode(data):
    '''
    Handle missing values using mode
    -> Used for 'node-caps' and 'breast-quad' columns which had missing values
    -> Mode imputation is appropriate here as these are categorical variables
    '''
    data['node-caps'] = data['node-caps'].fillna(data['node-caps'].mode()[0])
    data['breast-quad'] = data['breast-quad'].fillna(data['breast-quad'].mode()[0])

def encode_categorical_columns(data):
    '''
    Define categorical columns and encode them
    -> These columns contain string values that need to be converted to numerical format for machine learning algorithms
    -> LabelEncoder is used for encoding categorical variables
    '''
    categorical_columns = ['Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 
                         'node-caps', 'breast', 'breast-quad', 'irradiat']
    
    label_encoders = {}
    for column in categorical_columns:
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])
    return label_encoders #, data

def convert_deg_malig_to_int(data):
    '''
    Convert deg-malig to int
    -> Ensure that the 'deg-malig' column is in integer format as required by many machine learning algorithms
    '''
    data['deg-malig'] = data['deg-malig'].astype(int)

# The functions at this point are used to print out my data whenever i run them

def print_cleaned_data_info(data):
    '''
    Print cleaned data info
    -> Provides a summary of the cleaned data, including data types and missing values
    -> Helps verify that the data has been processed correctly
    '''
    print("\nCleaned data info:")
    print(data.info())
    print("\nFirst few rows of cleaned data:")
    print(data.head())

def print_summary_statistics(data):
    '''
    Print statistical summary
    -> Provides a statistical summary of the cleaned data
    -> Includes count, mean, std, min, max, and quartiles for numerical columns
    -> Helps understand the distribution and central tendencies of the data
    '''
    print("\nStatistical summary:")
    print(data.describe())

def create_encoding_mappings_dictionary(data, label_encoders):
    '''
    Create encoding mappings dictionary
    -> Stores the encoding mappings for each categorical column
    -> Helps in understanding the encoding process and can be used for future predictions
    '''
    categorical_columns = ['Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 
                         'node-caps', 'breast', 'breast-quad', 'irradiat']
    encoding_mappings = {}
    for column in categorical_columns:
        encoding_mappings[column] = dict(zip(label_encoders[column].classes_, 
                                           label_encoders[column].transform(label_encoders[column].classes_)))
    
    print("\nEncoding mappings for categorical variables:")
    for column, mapping in encoding_mappings.items():
        print(f"\n{column}:")
        for original, encoded in mapping.items():
            print(f"{original} -> {encoded}")
    
    return data, encoding_mappings

### Description of what numbers in dataset correlate to 

In [86]:
"""
   1. Class: no-recurrence-events, recurrence-events
   2. age: 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99.
   3. menopause: lt40, ge40, premeno.
   4. tumor-size: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44,
                  45-49, 50-54, 55-59.
   5. inv-nodes: 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26,
                 27-29, 30-32, 33-35, 36-39.
   6. node-caps: yes, no.
   7. deg-malig: 1, 2, 3.
   8. breast: left, right.
   9. breast-quad: left-up, left-low, right-up,	right-low, central.
   10. irradiat:	yes, no.
"""
column_names = ['Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']
unclean_data = read_data('breast-cancer.data', column_names)

In [99]:
"""
Here, I am creating a copy of the original dataframe so that I do not mess with the original dataframe. I only performed data cleaning and
preprocessing on the copy of the unlean data
"""
data = unclean_data.copy()

### Data cleaning functions being applied to dataset

In [100]:
data.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


#### Calling all the functions

I am calling all my functions that I defined for data cleaning and preprocessing.
The functions take the data as the argument and spit out the processed data.

In [101]:
inital_data_info(data)

Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-caps    286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB
None

Missing values:
Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64


In [102]:
replace_question_mark_with_NaN(data)


Missing values after replacing '?':
Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
dtype: int64


In [91]:
handle_missing_values_using_mode(data)

In [92]:
label_encoding = encode_categorical_columns(data)

In [93]:
convert_deg_malig_to_int(data)

In [94]:
print_cleaned_data_info(data)


Cleaned data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Class        286 non-null    int64
 1   age          286 non-null    int64
 2   menopause    286 non-null    int64
 3   tumor-size   286 non-null    int64
 4   inv-nodes    286 non-null    int64
 5   node-caps    286 non-null    int64
 6   deg-malig    286 non-null    int64
 7   breast       286 non-null    int64
 8   breast-quad  286 non-null    int64
 9   irradiat     286 non-null    int64
dtypes: int64(10)
memory usage: 22.5 KB
None

First few rows of cleaned data:
   Class  age  menopause  tumor-size  inv-nodes  node-caps  deg-malig  breast  \
0      0    1          2           5          0          1          3       0   
1      0    2          2           3          0          1          2       1   
2      0    2          2           3          0          1          2     

In [95]:
print_summary_statistics(data)


Statistical summary:
            Class         age   menopause  tumor-size   inv-nodes   node-caps  \
count  286.000000  286.000000  286.000000  286.000000  286.000000  286.000000   
mean     0.297203    2.664336    1.073427    4.062937    1.073427    1.167832   
std      0.457828    1.011818    0.986680    2.151187    1.935321    0.443052   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.000000    2.000000    0.000000    3.000000    0.000000    1.000000   
50%      0.000000    3.000000    2.000000    4.000000    0.000000    1.000000   
75%      1.000000    3.000000    2.000000    5.000000    1.000000    1.000000   
max      1.000000    5.000000    2.000000   10.000000    6.000000    2.000000   

        deg-malig      breast  breast-quad    irradiat  
count  286.000000  286.000000   286.000000  286.000000  
mean     2.048951    0.468531     2.772727    0.237762  
std      0.738217    0.499883     1.099006    0.426459  
min      1.000000   

In [96]:
data, encoding_mappings = create_encoding_mappings_dictionary(data, label_encoding)


Encoding mappings for categorical variables:

Class:
no-recurrence-events -> 0
recurrence-events -> 1

age:
20-29 -> 0
30-39 -> 1
40-49 -> 2
50-59 -> 3
60-69 -> 4
70-79 -> 5

menopause:
ge40 -> 0
lt40 -> 1
premeno -> 2

tumor-size:
0-4 -> 0
10-14 -> 1
15-19 -> 2
20-24 -> 3
25-29 -> 4
30-34 -> 5
35-39 -> 6
40-44 -> 7
45-49 -> 8
5-9 -> 9
50-54 -> 10

inv-nodes:
0-2 -> 0
12-14 -> 1
15-17 -> 2
24-26 -> 3
3-5 -> 4
6-8 -> 5
9-11 -> 6

node-caps:
? -> 0
no -> 1
yes -> 2

breast:
left -> 0
right -> 1

breast-quad:
? -> 0
central -> 1
left_low -> 2
left_up -> 3
right_low -> 4
right_up -> 5

irradiat:
no -> 0
yes -> 1


In [97]:
data.head(20)

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,1,2,5,0,1,3,0,2,0
1,0,2,2,3,0,1,2,1,5,0
2,0,2,2,3,0,1,2,0,2,0
3,0,4,0,2,0,1,2,1,3,0
4,0,2,2,0,0,1,2,1,4,0
5,0,4,0,2,0,1,2,0,2,0
6,0,3,2,4,0,1,2,0,2,0
7,0,4,0,3,0,1,1,0,2,0
8,0,2,2,10,0,1,2,0,2,0
9,0,2,2,3,0,1,2,1,3,0


#### To ensure I can use my cleaned dataset and encoding mappings for my visuals jupyter notebook..

I used the to_csv function of pandas to save MY cleaned dataset as a csv file.
And I am using pickle to dump my encoding mappings into a new file so I can load my csv file and the encoding mappings in the next jupyter notebook file. You asked for two seperate notebooks, and so I used pickle to import my cleaned data to my visuals notebook so that I could use that data to create my graphs/machine learning models. Our class didnt really discuss this form of data importation but it was a trick taught to me by the CS tutoring to make it easier.

In [66]:
data.to_csv('processed_data.csv', index = False)

In [67]:
import pickle

with open("encoding_mappings.pkl", "wb") as f: 
    pickle.dump(encoding_mappings, f)

In [82]:
print(encoding_mappings)

{'Class': {'no-recurrence-events': np.int64(0), 'recurrence-events': np.int64(1)}, 'age': {'20-29': np.int64(0), '30-39': np.int64(1), '40-49': np.int64(2), '50-59': np.int64(3), '60-69': np.int64(4), '70-79': np.int64(5)}, 'menopause': {'ge40': np.int64(0), 'lt40': np.int64(1), 'premeno': np.int64(2)}, 'tumor-size': {'0-4': np.int64(0), '10-14': np.int64(1), '15-19': np.int64(2), '20-24': np.int64(3), '25-29': np.int64(4), '30-34': np.int64(5), '35-39': np.int64(6), '40-44': np.int64(7), '45-49': np.int64(8), '5-9': np.int64(9), '50-54': np.int64(10)}, 'inv-nodes': {'0-2': np.int64(0), '12-14': np.int64(1), '15-17': np.int64(2), '24-26': np.int64(3), '3-5': np.int64(4), '6-8': np.int64(5), '9-11': np.int64(6)}, 'node-caps': {'?': np.int64(0), 'no': np.int64(1), 'yes': np.int64(2)}, 'breast': {'left': np.int64(0), 'right': np.int64(1)}, 'breast-quad': {'?': np.int64(0), 'central': np.int64(1), 'left_low': np.int64(2), 'left_up': np.int64(3), 'right_low': np.int64(4), 'right_up': np.int