In [None]:
# Display multiple items per cell: with ast_node_interactivity set to "all",
# every bare top-level expression in a cell is shown, not only the last one.
# The cells below list several bare expressions (e.g. data.columns, data.dtypes)
# and depend on this setting to show all of them.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# load and inspect dataset

import pandas as pd

# The first CSV column is used as the row index.
data = pd.read_csv("../data.csv", index_col=[0])

# Drop the "Unnamed: 32" column when it is present. errors="ignore" keeps the
# cell runnable against a CSV that does not contain this column.
data = data.drop(columns="Unnamed: 32", errors="ignore")

# A missing label would compare unequal to "B" and be silently encoded as 0
# (malignant), so fail loudly instead.
if data["diagnosis"].isna().any():
    raise ValueError("diagnosis column contains missing values")

# benign = 1, malignant = 0 (every label other than "B" is encoded as 0)
data["diagnosis"] = (data["diagnosis"] == "B").astype(int)

data.columns
data.dtypes
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Correlation of every column with the diagnosis column.
# Rule of thumb for the absolute value of the coefficient (either sign):
#    0.9 to 1.0  very strong correlation
#    0.7 to 0.9  strong correlation
#    0.5 to 0.7  moderate correlation
#    0.3 to 0.5  weak correlation
#    0.0 to 0.3  negligible correlation
# (https://medium.com/brdata/correlation-straight-to-the-point-e692ab601f4c)

# Full correlation matrix, rows ordered by descending |correlation with diagnosis|.
corrs = data.corr().sort_values(
    by="diagnosis",
    key=lambda column: -column.abs(),
)

# Keep only the rows whose correlation with diagnosis is at least "strong".
is_strong = corrs["diagnosis"].abs() >= 0.7
corrs.loc[is_strong, "diagnosis"]

In [None]:
# Keep only the target and the features listed as strongly correlated with it
# (|r| >= 0.7 in the correlation cell).

strong_columns = [
    "diagnosis",
    "concave points_worst",
    "perimeter_worst",
    "concave points_mean",
    "radius_worst",
    "perimeter_mean",
    "area_worst",
    "radius_mean",
    "area_mean",
]

# Select by label rather than DataFrame.filter(items=...): filter silently
# skips labels that do not exist, so a mistyped or renamed column would vanish
# from the feature set without any error. Label selection raises KeyError.
# .copy() makes the result an independent frame, as filter() returned.
correlated_data = data[strong_columns].copy()

# Preview only the first rows instead of rendering the whole frame.
correlated_data.head()

In [None]:
# Build the model inputs as NumPy arrays:
# X holds every column except the target, y holds the target column.

X = correlated_data.drop(columns="diagnosis").to_numpy()
y = correlated_data["diagnosis"].to_numpy()

X
y

In [None]:
# Split X and y into a training part and a testing part.

from sklearn.model_selection import train_test_split

# 20 % of the samples go to the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 5}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train
X_test
y_train
y_test