<a href="https://colab.research.google.com/github/blessjal/blessjal/blob/main/breast_cancer_prediction_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [227]:
import pandas as pd  
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

%matplotlib inline
plt.style.use('ggplot')

In [228]:
df = pd.read_csv('/content/sample_data/breast-cancer.data')

df.head()

Unnamed: 0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no.1
0,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no


In [229]:
 
columns = [ 'Class', 'age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat']
df.columns = columns
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no


In [230]:
df['age'] = df['age'].map({'20-29':25.5, '30-39':35.5, '40-49':45.5, '50-59':55.5, '60-69':65.5, '70-79':75.5})

In [231]:
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,45.5,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,45.5,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,65.5,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,45.5,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,65.5,ge40,15-19,0-2,no,2,left,left_low,no


In [232]:
df['inv-nodes'].value_counts()

0-2      212
3-5       36
6-8       17
9-11      10
15-17      6
12-14      3
24-26      1
Name: inv-nodes, dtype: int64

In [233]:
df['tumor-size'] = df['tumor-size'].map({'30-40':35, '25-29':27, '20-24':22, '15-19':17, '10-14':12, '40-44':42, '0-4':2, '50-54':52, '5-9':7, '45-49':47})

In [234]:
df['inv-nodes'] = df['inv-nodes'].map({'0-2':2, '3-5':4, '6-8':6, '9-11':6, '15-17':6, '12-14':6, '24-26':6})

In [235]:
df['node-caps'].replace('?', np.nan, inplace=True)


In [238]:
df['breast-quad'].replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

In [239]:
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,45.5,premeno,22.0,2,no,2,right,right_up,no
1,no-recurrence-events,45.5,premeno,22.0,2,no,2,left,left_low,no
2,no-recurrence-events,65.5,ge40,17.0,2,no,2,right,left_up,no
3,no-recurrence-events,45.5,premeno,2.0,2,no,2,right,right_low,no
4,no-recurrence-events,65.5,ge40,17.0,2,no,2,left,left_low,no


In [240]:
X = df.drop('Class', axis=1)
y = df['Class']

X.shape, y.shape

((201, 9), (201,))

In [241]:
df.isna().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

In [242]:
X.columns

Index(['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig',
       'breast', 'breast-quad', 'irradiat'],
      dtype='object')

In [243]:
rf = RandomForestClassifier()

In [244]:
from sklearn.model_selection import cross_val_score

cross_val_score(rf, X, y, cv=5, scoring='accuracy').mean()

ValueError: could not convert string to float: 'ge40'

ValueError: could not convert string to float: 'premeno'

ValueError: could not convert string to float: 'premeno'

ValueError: could not convert string to float: 'premeno'

ValueError: could not convert string to float: 'premeno'



nan

In [245]:
from sklearn.compose import make_column_transformer

In [246]:
column_trans = make_column_transformer(
    (OneHotEncoder(), ['menopause', 'node-caps', 'breast', 'breast-quad', 'irradiat', 'tumor-size', 'inv-nodes']),
    remainder='passthrough'
)

In [247]:
column_trans.fit_transform(X)

array([[ 0. ,  0. ,  1. , ...,  0. , 45.5,  2. ],
       [ 0. ,  0. ,  1. , ...,  0. , 45.5,  2. ],
       [ 1. ,  0. ,  0. , ...,  0. , 65.5,  2. ],
       ...,
       [ 1. ,  0. ,  0. , ...,  1. , 55.5,  3. ],
       [ 0. ,  0. ,  1. , ...,  0. , 35.5,  3. ],
       [ 1. ,  0. ,  0. , ...,  0. , 65.5,  1. ]])

In [248]:
from sklearn.pipeline import make_pipeline

In [249]:
pipe = make_pipeline(column_trans, rf)

In [250]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.731341463414634

In [251]:
X_new =  X.sample(5, random_state=99)
X_new

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
85,65.5,ge40,27.0,2,no,2,right,left_low,no
31,55.5,premeno,12.0,2,no,3,left,left_low,no
63,45.5,premeno,27.0,2,no,1,left,right_low,no
236,45.5,premeno,27.0,2,no,2,right,left_low,no
119,65.5,ge40,22.0,2,no,1,left,left_low,no


In [252]:
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['menopause', 'node-caps',
                                                   'breast', 'breast-quad',
                                                   'irradiat', 'tum...
                 RandomForestClassifier

In [253]:
pipe.predict(X_new)

array(['no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'recurrence-events',
       'no-recurrence-events'], dtype=object)

In [260]:
df.loc[63,]

Class          no-recurrence-events
age                            45.5
menopause                   premeno
tumor-size                       27
inv-nodes                         2
node-caps                        no
deg-malig                         1
breast                         left
breast-quad               right_low
irradiat                         no
Name: 63, dtype: object