In [137]:
import numpy 
import pandas as pd 
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer() 

Question 0: How many features does the breast cancer dataset have? This function should return an integer.

In [138]:
print(cancer.feature_names.size)

30


Question 1: Scikit-learn works with lists, numpy arrays, scipy-sparse matrices, and pandas DataFrames, so converting the dataset to a DataFrame is not necessary for training this model. Using a DataFrame does however help make many things easier such as munging data, so let's practice creating a classifier with a pandas DataFrame.

In [139]:
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target # we add correct outcomes/targets/labels to our data frame
print(df)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

Question 2: What is the class distribution? (i.e. how many instances of malignant (encoded 0) and how many benign (encoded 1)?) This function should return a Series named target of length 2 with integer values and index = ['malignant', 'benign']

In [140]:
label_map = {0: 'Malignant', 1: 'Benign'}
target_series = df['target'].map(label_map).value_counts()
print(target_series)

target
Benign       357
Malignant    212
Name: count, dtype: int64


Question 3: Split the DataFrame into X (the data) and y (the labels). This function should return a tuple of length 2: (X, y), where X, a pandas DataFrame, has shape (569, 30) y, a pandas Series, has shape (569,).


In [141]:
def features_labels(data):
    X = data.drop('target', axis=1) # axis 1 means we're dropping the column of 'target'
    y = data['target']
    return X, y

print(features_labels(df))

(     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  me

Question 4: Using train_test_split, split X and y into training and test sets (X_train, X_test, y_train, and y_test).

In [142]:
from sklearn.model_selection import train_test_split
X, y = features_labels(df)

# Randomly split the dataset into training and testing sets.
# 20% of data goes into testing state and 80% to training set.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=.20)

print("Training Features X Shape:", x_train.shape) # 455 samples/observations and 30 features per observation
print("Test Features X Shape:", x_test.shape, end='\n\n')
print("Training Labels Y Shape:", y_train.shape)
print("Test Labels Y Shape:", y_test.shape) # 114 samples/observations and 1 feature per observation (one dimensional)

Training Features X Shape: (455, 30)
Test Features X Shape: (114, 30)

Training Labels Y Shape: (455,)
Test Labels Y Shape: (114,)


Question 5: Using KNeighborsClassifier, fit a k-nearest neighbors (knn) classifier with X_train, y_train and using one nearest neighbor (n_neighbors = 1). This function should return a sklearn.neighbors.classification.KNeighborsClassifier.

In [143]:
from sklearn.neighbors import KNeighborsClassifier

def fit_knn(X_train, y_train, n):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    return knn # classifier model
    
print(fit_knn(x_train, y_train,1))

KNeighborsClassifier(n_neighbors=1)


Question 6: Using your knn classifier, predict the class label using the mean value for each feature. Hint: You can use cancerdf.mean()[:-1].values.reshape(1, -1) which gets the mean value for each feature, ignores the target column, and reshapes the data from 1 dimension to 2 (necessary for the precict method of KNeighborsClassifier). This function should return a numpy array either array([ 0.]) or array([ 1.])


In [144]:
# data frame must contain both features and labels
def predict_labels(df, trained_knn):    
    mean_values = df.mean()[:-1].values.reshape(1, -1)
    mean_df = pd.DataFrame(mean_values, columns=cancer.feature_names)
    prediction = trained_knn.predict(mean_df)
    return prediction

print(predict_labels(df, fit_knn(x_train, y_train,1)))

[1]
