# Analyzing Dataset

   This notebook analyzes the Breast Cancer dataset provided with the scikit-learn library.

### Step 1
   We import the dataset loader from scikit-learn, along with the other libraries required for the analysis.

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

### Step 2
   Load the dataset into a pandas DataFrame.
    <ul>
    <li>X - Features</li>
    <li>y - Target label: malignant (0) or benign (1)</li>
    </ul>

In [2]:
# Load the breast cancer dataset and flatten it into a single DataFrame:
# the feature matrix with the label appended as a final 'target' column.
cancer = load_breast_cancer()
column_labels = np.append(cancer['feature_names'], ['target'])
table = np.c_[cancer['data'], cancer['target']]
df = pd.DataFrame(table, columns=column_labels)

# Every column except the last is a feature; the last one is the label.
feature_columns = df.columns[:-1]
X = df[feature_columns]
y = df[['target']]

In [3]:
# Display the feature frame (every column of df except the trailing 'target').
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [4]:
# Display the label frame (the single 'target' column of df).
y

Unnamed: 0,target
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
564,0.0
565,0.0
566,0.0
567,0.0


### Step 3
   Get the dimensions of the dataset and analyze the percentage of benign and malignant cases.

In [5]:
# Report the dataset size: rows are counted on the label frame,
# features are counted as columns of the feature frame.
example_count = len(y)
feature_count = len(X.columns)

print('Number of example:', example_count)

print('Number of Features:', feature_count)

Number of example: 569
Number of Features: 30


In [7]:
# The mean of the 0/1 'target' column is the fraction of rows labelled 1, which this
# notebook reports as the benign class (target 0 is treated as malignant below).
#
# y.mean() is used instead of np.mean(y): on a DataFrame, np.mean returns a Series on
# pandas < 2.0 but a scalar on pandas >= 2.0, so indexing its result with [0] is not
# portable. y.mean() always yields a one-element Series indexed by 'target', and
# .iloc[0] reads it positionally without relying on deprecated integer-key fallback.
benign_fraction = y.mean()

percent_of_begnin = benign_fraction * 100
print('Percent of Benign Cases:', percent_of_begnin.iloc[0])

percent_of_malignant = (1 - benign_fraction) * 100
print('Percent of Malignant Cases:', percent_of_malignant.iloc[0])

Percent of Benign Cases: 62.741652021089635
Percent of Malignant Cases: 37.258347978910365


### Step 4
Analyzing features.

In [8]:
# Summary statistics for every feature across all cases.
#
# The rows are collected in a list and the DataFrame is built once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
summary_columns = ['Feature', 'Average', 'Standard Deviation', 'Max', 'Min']

summary_rows = [
    {
        'Feature': name,
        'Average': X[name].mean(),
        # ddof=0 keeps the population standard deviation that np.std computed.
        'Standard Deviation': X[name].std(ddof=0),
        'Max': X[name].max(),
        'Min': X[name].min(),
    }
    for name in X.columns
]
feature_analysis_both = pd.DataFrame(summary_rows, columns=summary_columns)

feature_analysis_both

Unnamed: 0,Feature,Average,Standard Deviation,Max,Min
0,mean radius,14.127292,3.520951,28.11,6.981
1,mean texture,19.289649,4.297255,39.28,9.71
2,mean perimeter,91.969033,24.277619,188.5,43.79
3,mean area,654.889104,351.604754,2501.0,143.5
4,mean smoothness,0.09636,0.014052,0.1634,0.05263
5,mean compactness,0.104341,0.052766,0.3454,0.01938
6,mean concavity,0.088799,0.07965,0.4268,0.0
7,mean concave points,0.048919,0.038769,0.2012,0.0
8,mean symmetry,0.181162,0.02739,0.304,0.106
9,mean fractal dimension,0.062798,0.007054,0.09744,0.04996


In [9]:
# Attach the label as the last column of the feature frame. assign() returns a new
# frame instead of writing into X, which is a column slice of df; the in-place
# X['target'] = y form raises pandas' SettingWithCopyWarning. Re-running this cell
# simply overwrites the existing 'target' column with the same values.
X = X.assign(target=y['target'])

# Split by class: target 0.0 rows (malignant) and target 1.0 rows (benign).
X_0 = X[X['target'] == 0.0]
X_1 = X[X['target'] == 1.0]

In [10]:
print('Analysis of Features (Malignant Cases):')

# Summary statistics per feature for the target == 0.0 subset (X_0).
# The last column of X_0 is the 'target' label, so it is skipped.
# Rows are gathered in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0.
malignant_rows = [
    {
        'Feature': name,
        'Average': X_0[name].mean(),
        # ddof=0 keeps the population standard deviation that np.std computed.
        'Standard Deviation': X_0[name].std(ddof=0),
        'Max': X_0[name].max(),
        'Min': X_0[name].min(),
    }
    for name in X_0.columns[:-1]
]
feature_analysis_malignant = pd.DataFrame(
    malignant_rows,
    columns=['Feature', 'Average', 'Standard Deviation', 'Max', 'Min'],
)

feature_analysis_malignant

Analysis of Features (Malignant Cases):


Unnamed: 0,Feature,Average,Standard Deviation,Max,Min
0,mean radius,17.46283,3.196406,28.11,10.95
1,mean texture,21.604906,3.770546,39.28,10.38
2,mean perimeter,115.365377,21.803048,188.5,71.9
3,mean area,978.376415,367.069174,2501.0,361.6
4,mean smoothness,0.102898,0.012578,0.1447,0.07371
5,mean compactness,0.145188,0.05386,0.3454,0.04605
6,mean concavity,0.160775,0.074842,0.4268,0.02398
7,mean concave points,0.08799,0.034293,0.2012,0.02031
8,mean symmetry,0.192909,0.027573,0.304,0.1308
9,mean fractal dimension,0.06268,0.007555,0.09744,0.04996


In [11]:
print('Analysis of Features (Benign Cases):')

# Summary statistics per feature for the target == 1.0 subset (X_1).
# The last column of X_1 is the 'target' label, so it is skipped.
# Rows are gathered in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0.
benign_rows = [
    {
        'Feature': name,
        'Average': X_1[name].mean(),
        # ddof=0 keeps the population standard deviation that np.std computed.
        'Standard Deviation': X_1[name].std(ddof=0),
        'Max': X_1[name].max(),
        'Min': X_1[name].min(),
    }
    for name in X_1.columns[:-1]
]
feature_analysis_benign = pd.DataFrame(
    benign_rows,
    columns=['Feature', 'Average', 'Standard Deviation', 'Max', 'Min'],
)

feature_analysis_benign

Analysis of Features (Benign Cases):


Unnamed: 0,Feature,Average,Standard Deviation,Max,Min
0,mean radius,12.146524,1.778016,17.85,6.981
1,mean texture,17.914762,3.989525,33.81,9.71
2,mean perimeter,78.075406,11.790889,114.6,43.79
3,mean area,462.790196,134.098909,992.1,143.5
4,mean smoothness,0.092478,0.013427,0.1634,0.05263
5,mean compactness,0.080085,0.033703,0.2239,0.01938
6,mean concavity,0.046058,0.043381,0.4108,0.0
7,mean concave points,0.025717,0.015886,0.08534,0.0
8,mean symmetry,0.174186,0.024772,0.2743,0.106
9,mean fractal dimension,0.062867,0.006738,0.09575,0.05185
