# Breast Cancer Dataset

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

### Read dataset

In [None]:
# Fetch the bundled dataset object and report how many feature names it exposes.
breast_cancer = load_breast_cancer()
n_features = len(breast_cancer.feature_names)
print(n_features)

# One column per feature, with the class label appended last as 'target'.
data = pd.DataFrame(
    breast_cancer.data,
    columns=breast_cancer.feature_names,
).assign(target=pd.Series(breast_cancer.target))
data.head()

In [None]:
# Print the frame's column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

### Data Analysis

In [None]:
# Keep a handle on the column labels (the feature columns plus the appended 'target') and show them.
col = data.columns       # .columns holds the column labels of the DataFrame
print(col)

For each base measurement, three summary statistics are reported:

- mean
- standard error
- worst (mean of the three largest values)

These statistics were computed for each image, resulting in 30 features (10 measurements × 3 statistics).

All feature values are recorded with four significant digits.

Missing attribute values: none

In [None]:
import seaborn as sns 

# Pass the series by keyword: recent seaborn releases treat the first positional
# argument as `data`, not `x`, so the positional form no longer plots class counts.
ax = sns.countplot(x=data['target'], label="Count")

# Look counts up by class code rather than unpacking value_counts() positionally:
# value_counts() is ordered by frequency, so positional unpacking only yields the
# right numbers while benign happens to be the majority class.
# scikit-learn encodes this dataset as 0 = malignant, 1 = benign.
class_counts = data['target'].value_counts()
B = int(class_counts.get(1, 0))
M = int(class_counts.get(0, 0))
print('Number of Benign: ',B)
print('Number of Malignant : ',M)

### Data Visualization

In [None]:
# Show the column labels again so the feature groups can be picked out below.
print(data.columns)

The dataset already contains a precomputed mean for each measurement: these are the first ten columns, which we select below for visualization.


In [None]:
# Names of the first ten columns, kept as a plain Python list for column selection.
featureMeans = data.columns[:10].tolist()
featureMeans

In [None]:
# Pairwise correlation matrix of the selected features (kept for later inspection).
correlationData = data[featureMeans].corr()

# Pair plot of the observations themselves. The original plotted the 10x10
# correlation matrix, which shows scatter plots of correlation coefficients
# rather than of the samples. `size` was renamed to `height` in seaborn and the
# old keyword is no longer accepted.
sns.pairplot(data[featureMeans], diag_kind='kde', height=2);

In [None]:
import matplotlib.pyplot as plt

# Annotated correlation heatmap of the selected features on a square 10x10-inch canvas.
mean_corr = data[featureMeans].corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(mean_corr, annot=True, square=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Per-feature class-conditional distributions, laid out on a 5x2 grid.
# scikit-learn encodes this dataset as 0 = malignant, 1 = benign; the original
# legend labels were swapped ('M' on target==1, 'B' on target==0).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to sns.histplot(..., kde=True, stat='density') once the minimum
# seaborn version is confirmed.
bins = 12
plt.figure(figsize=(15,15))
benign = data[data['target']==1]
malignant = data[data['target']==0]
for idx,atr in enumerate(featureMeans):
    plt.subplot(5, 2, idx+1)
    sns.distplot(benign[atr], bins=bins, color='green', label='B')
    sns.distplot(malignant[atr], bins=bins, color='red', label='M')
    plt.legend(loc='upper right')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

# Project the selected features onto their first two principal components.
pca = PCA(n_components=2)
X_r = pca.fit_transform(data.loc[:,featureMeans])

colors = ['navy', 'turquoise']
target = data.loc[:, 'target']
# Legend names come from the dataset's class names (index 0 and 1). The original
# zipped over the target column itself, which labelled both groups with the
# class codes of the first two rows instead of the class names.
for color, i, target_name in zip(colors, [0, 1], breast_cancer.target_names):
    mask = (target == i).to_numpy()
    plt.scatter(X_r[mask, 0], X_r[mask, 1], color=color, alpha=.8, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('PCA')
# plt.show() renders the figure; the original plt.plot() only added an empty line artist.
plt.show()

### Clean Dataset

In [None]:
# Feature matrix: the selected feature columns; label vector: the 'target' column.
X = data[featureMeans]
y = data['target']

### Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The original test_size of 0.95 left only
# 5% of the samples for training, which looks like an inverted fraction.
# NOTE(review): confirm the intended split ratio.
# stratify=y keeps the class proportions the same in both partitions;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)