In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the dataset from 'Wisconsin.csv' (path is relative to the working directory).
data = pd.read_csv('Wisconsin.csv')

In [3]:
# Dimensions of the raw frame as (rows, columns).
data.shape

(569, 32)

In [4]:
# Dropping the id column since it's irrelevant to the analysis.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError.
data = data.drop(columns='id', errors='ignore')

In [5]:
# Column labels of `data` at this point in the notebook.
data.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [6]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


assigning:
* M as Malignant Tumor to 1
* B as Benign Tumor to 0

In [None]:
# Work on a copy so the raw frame `data` keeps its original 'M'/'B' labels.
df = data.copy()
# Apply the encoding described above: 'M' -> 1, any other label -> 0.
# Reading the labels from `data` (not `df`) keeps this cell idempotent on re-run.
df['diagnosis'] = (data['diagnosis'] == 'M').astype(int)

In [None]:
def cat_to_binary(col):
    """Encode a single diagnosis label as an integer flag.

    Parameters
    ----------
    col : object
        One label value; it is compared against the string "M".

    Returns
    -------
    int
        1 when ``col == "M"``, otherwise 0 (any other value, e.g. "B").
    """
    return 1 if col == "M" else 0

For indexing purposes, the features are separated into 3 distinct categories:
* feature_mean
* feature_se (standard error)
* feature_worst 

In [None]:
# Every column after the first (`diagnosis`) is a feature. The three groups are
# consecutive runs of ten columns: positions 0-9, 10-19 and 20-29 of `allfeatures`.
allfeatures = df.columns[1:]
features_mean, features_se, features_worst = (
    allfeatures[start:start + 10] for start in (0, 10, 20)
)
print(features_mean, features_se, features_worst, sep='\n')

# Exploratory Data Analysis

1. Diagnosis Distribution

In [None]:
# Class balance of the target: one bar per diagnosis label.
ax = sns.countplot(x='diagnosis', data=df, palette='RdBu_r')
# Label the figure so it stands alone; the trailing ';' hides the repr of the return value.
ax.set(title='Diagnosis distribution', xlabel='Diagnosis', ylabel='Number of samples');

In [None]:
# Summary statistics of `df` (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Pairwise correlations among the `features_mean` columns.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df[features_mean].corr(), annot=True, ax=ax)
# A title distinguishes this figure from the 'se' and 'worst' heatmaps.
ax.set_title("Correlation between the 'mean' features");

In [None]:
# Pairwise correlations among the `features_worst` columns.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df[features_worst].corr(), annot=True, cmap='BrBG', ax=ax)
# A title distinguishes this figure from the 'mean' and 'se' heatmaps.
ax.set_title("Correlation between the 'worst' features");

In [None]:
# Pairwise correlations among the `features_se` (standard error) columns.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df[features_se].corr(), annot=True, cmap='RdBu', ax=ax)
# A title distinguishes this figure from the 'mean' and 'worst' heatmaps.
ax.set_title("Correlation between the 'se' features");

The heatmaps above show high correlation between certain features in all three cases, as follows:
* High correlation between 'radius', 'area', and 'perimeter'
* Relatively high correlation between 'compactness', 'concavity', and 'concave points'

I'll go ahead and remove features with more than .9 correlation, which are:
- perimeter_mean, perimeter_se, perimeter_worst
- area_mean, area_se, area_worst
- concave points_mean

Now let's see the correlation in features for each case 'mean', 'se', 'worst'

In [None]:
# Features removed for having > .9 correlation with another feature, per group.
# Each name is listed exactly once so the per-group lists and the combined list
# cannot drift apart. 'area_worst' is included to match the removal list stated
# in the text above (it was previously left in).
dropped_mean = ['perimeter_mean', 'area_mean', 'concave points_mean']
dropped_se = ['perimeter_se', 'area_se']
dropped_worst = ['perimeter_worst', 'area_worst']

features_mean_2 = features_mean.drop(dropped_mean)
features_se_2 = features_se.drop(dropped_se)
features_worst_2 = features_worst.drop(dropped_worst)
allfeatures_2 = allfeatures.drop(dropped_mean + dropped_se + dropped_worst)

In [None]:
# Group the columns so each feature can be compared across its three cases:
# entry i holds the columns at positions i, i+10 and i+20 of `allfeatures`,
# i.e. the 'mean', 'se' and 'worst' variants of the i-th measurement.
columns_list = np.array(allfeatures)
features_cat = [columns_list[[i, i + 10, i + 20]].tolist() for i in range(10)]

In [None]:
# let's try it out for the radius: features_cat[0] holds its 'mean', 'se' and 'worst' columns
# (index 3, used previously, selects the area columns instead)
df[features_cat[0]].describe()

In [None]:
# One correlation heatmap per measurement, comparing its 'mean', 'se' and 'worst'
# columns. The groups live in `features_cat`; the name `feature_list` used here
# before is not defined anywhere in the notebook and raised NameError.
fig, axes = plt.subplots(5, 2, figsize=(16, 24))
for ax, triple in zip(axes.flat, features_cat):
    sns.heatmap(df[triple].corr(), annot=True, ax=ax)
    # Title each panel with the shared base name, e.g. 'radius' from 'radius_mean'.
    ax.set_title(triple[0].rsplit('_', 1)[0])
fig.tight_layout()

The heatmaps above show correlation between the 'mean' case and the 'worst' case of the following features:
* radius

In [None]:
# Preview the radius columns ('mean', 'se', 'worst'). `features_cat` is the
# grouped column list; `feature_list` is not defined in this notebook.
df[features_cat[0]].head()

In [None]:
# Pairwise scatter plots of the 'mean' features, coloured by diagnosis.
# The labels are taken from `data` and joined to the feature columns of `df` by index.
# `features_mean` is the defined name (`feature_mean` raised NameError).
# No plt.figure() call: pairplot builds its own figure, so a preceding
# plt.figure() only leaves an empty extra figure behind.
sns.pairplot(
    pd.concat([data.diagnosis, df[features_mean]], axis=1),
    hue='diagnosis',
    palette='coolwarm',
);

In [None]:
# Some remarks:
# 1. The dataset is imbalanced: it contains more benign (zeros) than malignant (ones) tumors
# 2. 

# Train Test Split

In [None]:
# Preview `df` before building the feature matrix and target.
df.head()

In [None]:
# Column labels available for selecting the features and the target.
df.columns

In [None]:
# Feature matrix: every column except the target. Selecting by name (rather than
# by position) does not depend on `diagnosis` being the first column and mirrors
# how the target is selected (`df['diagnosis']`).
# NOTE(review): this keeps all features; the reduced `allfeatures_2` list built
# earlier is not used here — confirm that is intended.
X = df.drop(columns='diagnosis')

In [None]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

In [None]:
# Target: the `diagnosis` column of `df`.
y = df['diagnosis']

In [None]:
# Shape of the target `y`.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing.
# random_state pins the shuffle so the split is identical on every run;
# stratify=y keeps the class ratio of `y` the same in the train and test
# partitions, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Sanity check of the split: shapes of X_train, X_test, y_train, y_test on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))