<a href="https://colab.research.google.com/github/blessjal/blessjal/blob/main/breast_cancer_prediction_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd  
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

%matplotlib inline
plt.style.use('ggplot')

In [None]:
df = pd.read_csv('/content/sample_data/breast-cancer.data')

df.head()

Unnamed: 0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no.1
0,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no


In [None]:
 
columns = [ 'Class', 'age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat']
df.columns = columns
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no


In [None]:
df['age'] = df['age'].map({'20-29':25.5, '30-39':35.5, '40-49':45.5, '50-59':55.5, '60-69':65.5, '70-79':75.5})

In [None]:
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,45.5,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,45.5,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,65.5,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,45.5,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,65.5,ge40,15-19,0-2,no,2,left,left_low,no


In [None]:
df.to_csv('/content/sample_data/breast_cancer.csv')

In [10]:
df['inv-nodes'].value_counts()

0-2      212
3-5       36
6-8       17
9-11      10
15-17      6
12-14      3
24-26      1
Name: inv-nodes, dtype: int64

In [11]:
df['tumor-size'] = df['tumor-size'].map({'30-40':35, '25-29':27, '20-24':22, '15-19':17, '10-14':12, '40-44':42, '0-4':2, '50-54':52, '5-9':7, '45-49':47})

In [12]:
df['inv-nodes'] = df['inv-nodes'].map({'0-2':2, '3-5':4, '6-8':6, '9-11':6, '15-17':6, '12-14':6, '24-26':6})

In [13]:
df['node-caps'].replace('?', np.nan, inplace=True)


In [14]:
df['breast-quad'].replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

In [15]:
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,45.5,premeno,22.0,2,no,2,right,right_up,no
1,no-recurrence-events,45.5,premeno,22.0,2,no,2,left,left_low,no
2,no-recurrence-events,65.5,ge40,17.0,2,no,2,right,left_up,no
3,no-recurrence-events,45.5,premeno,2.0,2,no,2,right,right_low,no
4,no-recurrence-events,65.5,ge40,17.0,2,no,2,left,left_low,no


In [16]:
X = df.drop('Class', axis=1)
y = df['Class']

X.shape, y.shape

((201, 9), (201,))

In [17]:
df.isna().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

In [19]:
rf = RandomForestClassifier()

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder


In [24]:
from sklearn.compose import make_column_transformer

In [25]:
column_trans = make_column_transformer(
    (OneHotEncoder(), ['menopause', 'node-caps', 'breast', 'breast-quad', 'irradiat', 'tumor-size', 'inv-nodes']),
    remainder='passthrough'
)

In [26]:
column_trans.fit_transform(X)

array([[ 0. ,  0. ,  1. , ...,  0. , 45.5,  2. ],
       [ 0. ,  0. ,  1. , ...,  0. , 45.5,  2. ],
       [ 1. ,  0. ,  0. , ...,  0. , 65.5,  2. ],
       ...,
       [ 1. ,  0. ,  0. , ...,  1. , 55.5,  3. ],
       [ 0. ,  0. ,  1. , ...,  0. , 35.5,  3. ],
       [ 1. ,  0. ,  0. , ...,  0. , 65.5,  1. ]])

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160, 9), (41, 9), (160,), (41,))

In [28]:
from sklearn.pipeline import make_pipeline

In [29]:
pipe = make_pipeline(column_trans, rf)

In [30]:
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.78125

In [32]:
pipe.fit(X_train, y_train)
pipe.predict(X_test)

array(['no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurren

In [33]:
y_test

24     no-recurrence-events
228       recurrence-events
135    no-recurrence-events
120    no-recurrence-events
240       recurrence-events
249       recurrence-events
5      no-recurrence-events
195    no-recurrence-events
12     no-recurrence-events
201       recurrence-events
74     no-recurrence-events
165    no-recurrence-events
245       recurrence-events
204       recurrence-events
96     no-recurrence-events
7      no-recurrence-events
42     no-recurrence-events
172    no-recurrence-events
47     no-recurrence-events
88     no-recurrence-events
250       recurrence-events
193    no-recurrence-events
55     no-recurrence-events
213       recurrence-events
73     no-recurrence-events
161    no-recurrence-events
242       recurrence-events
252       recurrence-events
159    no-recurrence-events
54     no-recurrence-events
21     no-recurrence-events
68     no-recurrence-events
199    no-recurrence-events
140    no-recurrence-events
29     no-recurrence-events
261       recurrence