In [322]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
# One shared LabelEncoder that later cells re-fit for each categorical column;
# after every fit_transform call it only retains the classes of the column fitted last.
le = LabelEncoder()
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('breast-cancer.csv')

In [323]:
# Normalise the raw column names to snake_case identifiers in a single pass.
# The target column 'class' is renamed to 'recurrence_events'.
# A non-mutating rename (instead of six inplace=True calls) builds the renamed
# frame once and keeps the whole mapping visible in one place.
COLUMN_RENAMES = {
    'class': 'recurrence_events',
    'tumor-size': 'tumor_size',
    'inv-nodes': 'inv_nodes',
    'node-caps': 'node_caps',
    'deg-malig': 'deg_malig',
    'breast-quead': 'breast_quead',
}
df = df.rename(columns=COLUMN_RENAMES)


In [324]:
# Show column names, non-null counts and dtypes after the rename step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   recurrence_events  286 non-null    object
 1   age                286 non-null    object
 2   menopause          286 non-null    object
 3   tumor_size         286 non-null    object
 4   inv_nodes          286 non-null    object
 5   node_caps          286 non-null    object
 6   deg_malig          286 non-null    int64 
 7   breast             286 non-null    object
 8   breast_quead       286 non-null    object
 9   irradiat           286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [325]:
# Print the distinct values of every column, one array per column, to spot
# category spellings and placeholder markers before encoding.
columns_to_inspect = [
    'recurrence_events',
    'age',
    'menopause',
    'tumor_size',
    'inv_nodes',
    'node_caps',
    'deg_malig',
    'breast',
    'breast_quead',
    'irradiat',
]
for column_name in columns_to_inspect:
    print(df[column_name].unique())

['no-recurrence-events' 'recurrence-events']
['30-39' '40-49' '60-69' '50-59' '70-79' '20-29']
['premeno' 'ge40' 'lt40']
['30-34' '20-24' '15-19' '0-4' '25-29' '50-54' '10-14' '40-44' '35-39'
 '5-9' '45-49']
['0-2' '6-8' '9-11' '3-5' '15-17' '12-14' '24-26']
['no' 'yes' '?']
[3 2 1]
['left' 'right']
['left_low' 'right_up' 'left_up' 'right_low' 'central' '?']
['no' 'yes']


In [326]:
# Convert every column to integer codes.
#
# 1) Missing values: 'node_caps' and 'breast_quead' use '?' as a missing-value
#    marker. It must be imputed BEFORE label-encoding; otherwise the encoder turns
#    '?' into an ordinary integer class and any later replace('?', ...) on the
#    encoded column can no longer match anything.
MISSING_MARKER = "?"
for column in ("node_caps", "breast_quead"):
    # The mode is computed over the observed values only, so the marker itself
    # can never be chosen as the fill value.
    observed = df.loc[df[column] != MISSING_MARKER, column]
    df[column] = df[column].replace(MISSING_MARKER, observed.mode()[0])

# 2) Ordinal columns: explicit value -> rank tables so the integer order follows
#    the natural order of the bins (alphabetical label-encoding would not).
#    'irradiat' is a plain no/yes flag encoded as 0/1.
ORDINAL_CODES = {
    "age": {"20-29": 0, "30-39": 1, "40-49": 2, "50-59": 3, "60-69": 4, "70-79": 5},
    "tumor_size": {
        "0-4": 0, "5-9": 1, "10-14": 2, "15-19": 3, "20-24": 4, "25-29": 5,
        "30-34": 6, "35-39": 7, "40-44": 8, "45-49": 9, "50-54": 10,
    },
    "inv_nodes": {"0-2": 0, "3-5": 1, "6-8": 2, "9-11": 3, "12-14": 4, "15-17": 5, "24-26": 6},
    "deg_malig": {1: 0, 2: 1, 3: 2},
    "irradiat": {"no": 0, "yes": 1},
}
for column, codes in ORDINAL_CODES.items():
    # .map returns NaN for any value missing from the table (checked below),
    # instead of silently leaving an unmapped string in the column.
    df[column] = df[column].map(codes)

# 3) Nominal columns: arbitrary integer labels from the shared LabelEncoder.
for column in ("recurrence_events", "menopause", "node_caps", "breast", "breast_quead"):
    df[column] = le.fit_transform(df[column])

# Fail loudly if a category was not covered by the tables above
# (this also triggers if the cell is re-run on already-encoded data).
assert df.notna().all().all(), "unmapped category produced NaN during encoding"

In [327]:
# Impute the missing values ('?') in the "breast_quead" column with the mode.
# NOTE(review): by this point "breast_quead" has already been label-encoded to integers
# by the encoding cell above, so the column holds no '?' strings and this replace
# leaves the column unchanged.
mode = df['breast_quead'].mode()[0]
df['breast_quead'] = df['breast_quead'].replace('?', mode)

In [328]:
# Impute the missing values ('?') in the "node_caps" column with the mode.
# NOTE(review): by this point "node_caps" has already been label-encoded to integers
# by the encoding cell above, so the column holds no '?' strings and this replace
# leaves the column unchanged.
mode = df['node_caps'].mode()[0]
df['node_caps'] = df['node_caps'].replace('?', mode)

In [329]:
# Re-inspect non-null counts and dtypes after the encoding and imputation cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   recurrence_events  286 non-null    int32
 1   age                286 non-null    int64
 2   menopause          286 non-null    int32
 3   tumor_size         286 non-null    int64
 4   inv_nodes          286 non-null    int64
 5   node_caps          286 non-null    int32
 6   deg_malig          286 non-null    int64
 7   breast             286 non-null    int32
 8   breast_quead       286 non-null    int32
 9   irradiat           286 non-null    int64
dtypes: int32(5), int64(5)
memory usage: 16.9 KB


In [346]:
# Debug check of the values currently stored in "deg_malig".
# NOTE(review): this cell's execution count (346) is higher than that of the min-max
# scaling cell below (330), and its recorded output shows [1. 0.5 0.], so it appears to
# have been run after scaling. On a fresh top-to-bottom run it executes before scaling
# and would print the unscaled integer codes instead — confirm the intended position.
print(df["deg_malig"].unique())
print(df["deg_malig"] == 1.0)




[1.  0.5 0. ]
0       True
1      False
2      False
3      False
4      False
       ...  
281    False
282     True
283    False
284     True
285     True
Name: deg_malig, Length: 286, dtype: bool
0      False
1      False
2      False
3      False
4      False
       ...  
281    False
282    False
283     True
284    False
285    False
Name: deg_malig, Length: 286, dtype: bool


In [330]:
# Apply min-max normalisation to the "deg_malig" column.
scaler = MinMaxScaler()
# The scaler is given a 2-D array with a single column, hence the reshape.
deg_malig_2d = df['deg_malig'].to_numpy().reshape(-1, 1)
df['deg_malig'] = scaler.fit_transform(deg_malig_2d)


In [331]:
# Label-encode, on a copy of the frame, any column that still has object dtype.
# NOTE(review): the df.info() output recorded after the encoding cell lists only
# int32/int64 columns, so in that run no column matched and nothing was re-encoded.
df_encoded = df.copy()
object_columns = [
    name for name in df_encoded.columns
    if df_encoded[name].dtype == 'object'
]
for name in object_columns:
    df_encoded[name] = le.fit_transform(df_encoded[name])

In [332]:
# Separate the target column (y) from the remaining feature columns (X).
target_column = "recurrence_events"
y = df_encoded[target_column]
X = df_encoded.drop(columns=[target_column])

In [333]:
# Split the data into training (70%) and test (30%) sets.
# A fixed random_state makes the split, and therefore the reported accuracy,
# reproducible across kernel restarts; stratify=y keeps the class proportions
# of the target the same in both subsets.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)


In [334]:
# Create the k-NN classifier; k is a tunable hyper-parameter, adjust it as needed.
# NOTE(review): k-NN is distance-based and the features are integer codes on different
# ranges (only "deg_malig" is min-max scaled above) — consider scaling all features.
N_NEIGHBORS = 15
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

In [335]:
# Train the k-NN classifier on the training split.
knn.fit(X_train, y_train)

In [336]:
# Predict the class labels for the test split.
# NOTE(review): y_pred is not read by the scoring cell below, which calls
# knn.score(X_test, y_test) directly.
y_pred = knn.predict(X_test)


In [337]:
# Evaluate the classifier: mean accuracy on the held-out test split.
accuracy = knn.score(X_test, y_test)
print(f"Exactitud (Accuracy): {accuracy}")

Exactitud (Accuracy): 0.7325581395348837
