In this workshop, we will do basic visualisation and analysis of the UCI breast cancer dataset. We will then apply various machine learning methods on the said dataset.

This dataset helps you classify a patient as having breast cancer or not.

In [None]:
#importing the libraries
import numpy as np
import pandas as pd

from urllib.request import urlopen # Used for opening link

import matplotlib.pyplot as plt # Install if not installed
import seaborn as sns # Install if not installed

import sklearn # Install if not installed

# Exploring the data

We will use the UCI breast cancer dataset, which helps us predict whether or not a person has cancer. 

In [None]:
# Load the Wisconsin Diagnostic Breast Cancer data straight from the UCI repository.

UCI_data_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

# Column labels passed to read_csv via names=, so the first line of the file is read as data
column_names = ['id_number', 'diagnosis', 'radius_mean', 
         'texture_mean', 'perimeter_mean', 'area_mean', 
         'smoothness_mean', 'compactness_mean', 'concavity_mean',
         'concave_points_mean', 'symmetry_mean', 
         'fractal_dimension_mean', 'radius_se', 'texture_se', 
         'perimeter_se', 'area_se', 'smoothness_se', 
         'compactness_se', 'concavity_se', 'concave_points_se', 
         'symmetry_se', 'fractal_dimension_se', 
         'radius_worst', 'texture_worst', 'perimeter_worst',
         'area_worst', 'smoothness_worst', 
         'compactness_worst', 'concavity_worst', 
         'concave_points_worst', 'symmetry_worst', 
         'fractal_dimension_worst'] 

# Open the URL in a context manager so the HTTP response is closed once the CSV is parsed
with urlopen(UCI_data_URL) as response:
    df_breast_cancer = pd.read_csv(response, names=column_names)


### YOUR TURN!

In [None]:
# View the first 5 rows of the dataset


In [None]:
# Drop the id_number column

In [None]:
# Display a description of the dataframe using describe()


In [None]:
# Use .shape to display the dimensions of our data frame
# Expected output: (569, 31)


In [None]:
# Use .dtypes to display the data types of our columns


In [None]:
# Display the number of NA values in each column using isnull().sum()


# Visualization

Let's visually explore our dataset. 

## Heatmap

As a first example, we will try and visualize the relationship between our mean features.

This heat map shows the pairwise correlation between features: values close to 1 indicate a strong positive correlation, and values close to 0 indicate little or no linear relationship.

In [None]:
# Correlation heatmap of the ten "*_mean" features (Matplotlib figure, Seaborn heatmap)
features_mean = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave_points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]
plt.figure(figsize=(15, 15))
# Annotate every cell with its correlation value; cap the colour scale at 1
heat = sns.heatmap(
    df_breast_cancer.loc[:, features_mean].corr(),
    vmax=1,
    square=True,
    annot=True,
)

## Histograms

Now let's plot the `radius_mean` feature as a histogram.

In [None]:
# Partition the rows by diagnosis label: 'M' rows go to dataMalignant, 'B' rows to dataBenign.
dataMalignant = df_breast_cancer.loc[df_breast_cancer['diagnosis'].eq('M')]
dataBenign = df_breast_cancer.loc[df_breast_cancer['diagnosis'].eq('B')]


In [None]:
# Stacked, normalised histogram of one feature, coloured by diagnosis
feature = 'radius_mean'
# The figure size has to be passed to subplots(); a bare `figsize = ...` assignment has no effect
fig, ax = plt.subplots(figsize=(15, 15))

# 250 equal-width bins spanning the full range of the feature (251 edges).
# linspace gives an exact edge count, unlike arange with a floating-point step.
bin_edges = np.linspace(df_breast_cancer[feature].min(), df_breast_cancer[feature].max(), 251)

ax.hist(x=(dataMalignant[feature], dataBenign[feature]),
        bins=bin_edges,
        alpha=0.5, stacked=True,
        density=True,  # replaces `normed`, which was removed from Matplotlib
        label=['M', 'B'], color=['r', 'g'])
ax.legend(loc='upper right')
ax.set_title(feature)
plt.show()

In [None]:
# YOUR TURN

# Now code a loop which goes through all the other "mean" features and plots their histograms
# (radius_mean is left out because it was plotted above).
# Every name must match a dataframe column exactly, so use 'concave_points_mean' with underscores.
features_mean = ['texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean',
                 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean']


## Strip plots

The **strip plot** is another way of visualizing data. It plots the distribution of variables for each category as individual datapoints. For vertical strip plots (the default), distributions of continuous values are laid out parallel to the y-axis and the distinct categories are spaced out along the x-axis.

Let's visualize "radius_mean" using strip plots.

In [None]:
# Strip plot of radius_mean for each diagnosis class.
# Create exactly one figure of the desired size and draw on its axes explicitly.
fig, ax = plt.subplots(figsize=(15, 15))
sns.stripplot(x='diagnosis', y='radius_mean', data=df_breast_cancer, jitter=True, palette='Set1', ax=ax)
ax.set_title('Diagnosis vs radius_mean');

In [None]:
# YOUR TURN

# Now code a loop which goes through all the other mean features and displays their strip plots against the diagnosis

# ML Algorithms

## Logistic Regression (LR)
Don’t get confused by its name! It is a classification algorithm, not a regression algorithm.
In simple words, it predicts the probability of occurrence of an event based on a set of independent variables (the features).

Since it predicts a probability, its output values lie between 0 and 1 (as expected).

**sklearn** provides us with tools to implement LR, so we'll use it.

In [None]:
# Importing sklearn stuff
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [None]:
# Separate the target (diagnosis) from the features.
# pop() removes the column from df_breast_cancer in place, and later cells rely on that.
# The guard keeps the cell re-runnable: without it a second run fails with a KeyError.
if 'diagnosis' in df_breast_cancer.columns:
    Y = df_breast_cancer.pop('diagnosis')
X = df_breast_cancer
X.head()

In [None]:
# Display the target labels: the 'diagnosis' column ('M' / 'B' codes) popped from the dataframe above
Y

In [None]:
# Let's drop the "id_number" column (if you have not already done so above)


In [None]:
# Baseline: 10-fold cross-validated accuracy of a default logistic regression
seed = 7
model = LogisticRegression()
scoring = 'accuracy'

# random_state only takes effect when shuffle=True; recent scikit-learn versions
# raise a ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

# Mean accuracy across the folds, with the standard deviation in parentheses
msg = "%s: %f (%f)" % ('Logistic Regression', cv_results.mean(), cv_results.std())
print(msg)

In [None]:
# YOUR TURN!

# Write a loop that evaluates each (name, estimator) pair below on our dataset and reports its accuracy

models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('ANN', MLPClassifier()),
]

In [None]:
# HOMEWORK: Display a boxplot showing the performance of each algorithm

# Feature selection and feature engineering

Feature selection is the process of automatically or manually selecting the features that contribute most to the prediction variable or output you are interested in. Having irrelevant features in your data can decrease the accuracy of your models, because the model may learn from those irrelevant features.

Example:
Correlated features in general don't improve models (although it depends on the specifics of the problem like the number of variables and the degree of correlation), but they MIGHT affect specific models negatively.

Let's see it for ourselves.
We can see on the heatmap that __radius_mean__ and __perimeter_mean__ are highly correlated. Let's drop one of them and re-run our LR model.

In [None]:
# Build a copy of the feature frame without perimeter_mean and preview its first two rows
new_df = df_breast_cancer.drop(columns=["perimeter_mean"])
new_df.head(2)

In [None]:
# Re-run the logistic regression on the dataset without perimeter_mean
X = new_df
model = LogisticRegression()
scoring = 'accuracy'

# random_state only takes effect when shuffle=True; recent scikit-learn versions
# raise a ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

# Mean accuracy across the folds, with the standard deviation in parentheses
msg = "%s: %f (%f)" % ('Logistic Regression', cv_results.mean(), cv_results.std())
print(msg)

In [None]:
# YOUR TURN 

# Drop one column for every pair of columns with a correlation greater than 0.90

# Run an algorithm of your choice on the new dataset. Did it help us or not?

Feature engineering is the process of using domain knowledge of the data to create features that make machine learning algorithms work. 

Let's try to create a feature which tells us whether "radius_worst" is greater than the average *radius_worst* or not.

In [None]:
# Mean of the radius_worst column, used as the threshold for the binary feature
mean_radius_worst = df_breast_cancer.loc[:, "radius_worst"].mean()
mean_radius_worst

In [None]:
# Replace radius_worst with a binary indicator column:
# 1 if "radius_worst" is strictly greater than the mean, 0 otherwise
new_df_2 = df_breast_cancer.drop("radius_worst", axis = 1)
new_df_2["radius_worst_mean"] = np.where(df_breast_cancer['radius_worst'] > mean_radius_worst, 1, 0)


In [None]:
# Show the last rows of the engineered frame; radius_worst_mean is the new last column
new_df_2.tail()

In [None]:
# Re-run the logistic regression on the dataset with the engineered binary feature
X = new_df_2
model = LogisticRegression()
scoring = 'accuracy'

# random_state only takes effect when shuffle=True; recent scikit-learn versions
# raise a ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

# Mean accuracy across the folds, with the standard deviation in parentheses
msg = "%s: %f (%f)" % ('Logistic Regression', cv_results.mean(), cv_results.std())
print(msg)

# Hyperparameter tuning

Wikipedia states that “hyperparameter tuning is choosing a set of optimal hyperparameters for a learning algorithm”. So what is a hyperparameter?

A hyperparameter is a parameter whose value is set before the learning process begins.
Examples of hyperparameters: penalty in logistic regression, depth in decision tree.

In [None]:
# Import
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search module
# was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to choose from: 50 values, 1, 11, 21, ..., 491
depths = np.arange(1, 500, 10)

# Create a parameter grid: map the parameter names to the values that should be searched
# Simply a python dictionary
# Key: parameter name
# Value: list of values that should be searched for that parameter
# Single key-value pair for param_grid
param_grid = [{'max_depth':depths}]

# Instantiate the grid: 10-fold cross-validation, scored by accuracy
model = DecisionTreeClassifier(random_state=seed)
grid = GridSearchCV(model, param_grid, cv=10, scoring='accuracy')

# Fit the grid with data (df_breast_cancer holds only features once 'diagnosis' has been popped)
X_train = df_breast_cancer
grid.fit(X_train, Y)

In [None]:
# Examine the best model found by the grid search

# Single best cross-validated score achieved across all searched parameter values (here: max_depth)
print(grid.best_score_)

# Dictionary containing the parameter values (here: max_depth) that produced that score
print(grid.best_params_)

# Actual model object fit with those best parameters
# Also shows the default parameters that we did not specify
print(grid.best_estimator_)

In [None]:
# Try tuning the "max_features" hyperparameter

In [None]:
# Try tuning the "max_features" AND "max_depth" hyperparameters at the same time

# Conclusion

<img src="concl.png">