# Blackstraw Data Science Technical Interview

### Context: We have Brest Cancer Prediction
* Number of Samples: 569  
* Number of Features: 30 numeric, predictive attributes  
* Number of Classes: 2 


## Setup

In [2]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import OrderedDict

from sklearn import datasets
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
DISPLAY_PRECISION = 4
pd.set_option("display.precision", DISPLAY_PRECISION)

## Get the breast cancer dataset from sklearn

In [3]:
dat = datasets.load_breast_cancer()
# print(dat.DESCR)  # detailed description

## Describe the feature statistics

In [5]:
print("The sklearn breast cancer dataset keys:")
print(dat.keys()) # dict_keys(['target_names', 'target', 'feature_names', 'data', 'DESCR'])
print("---")

# Note that we need to reverse the original '0' and '1' mapping in order to end up with this mapping:
# Benign = 0 (negative class)
# Malignant = 1 (positive class)

li_classes = [dat.target_names[1], dat.target_names[0]]
li_target = [1 if x==0 else 0 for x in list(dat.target)]
li_ftrs = list(dat.feature_names)

print("There are 2 target classes:")
print("li_classes", li_classes)
print("---")
print("Target class distribution from a total of %d target values:" % len(li_target))
print(pd.Series(li_target).value_counts())
print("---")

df_all = pd.DataFrame(dat.data[:,:], columns=li_ftrs)
print("Describe dataframe, first 6 columns:")
print(df_all.iloc[:,:6].describe().to_string())

The sklearn breast cancer dataset keys:
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
---
There are 2 target classes:
li_classes ['benign', 'malignant']
---
Target class distribution from a total of 569 target values:
0    357
1    212
dtype: int64
---
Describe dataframe, first 6 columns:
       mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness
count     569.0000      569.0000         569.000   569.0000         569.0000          569.0000
mean       14.1273       19.2896          91.969   654.8891           0.0964            0.1043
std         3.5240        4.3010          24.299   351.9141           0.0141            0.0528
min         6.9810        9.7100          43.790   143.5000           0.0526            0.0194
25%        11.7000       16.1700          75.170   420.3000           0.0864            0.0649
50%        13.3700       18.8400          86.240   551.1000           0.0959  

In [12]:
# Setup X and y
train_df = df_all.copy()
train_df['target'] = li_target

## Question: (2 min)
#### 1.) What is the most correlated variable to target?

In [29]:
train_df.corr()['target'].sort_values()

smoothness error          -0.0670
mean fractal dimension    -0.0128
texture error             -0.0083
symmetry error            -0.0065
fractal dimension error    0.0780
concavity error            0.2537
compactness error          0.2930
worst fractal dimension    0.3239
mean symmetry              0.3305
mean smoothness            0.3586
concave points error       0.4080
mean texture               0.4152
worst symmetry             0.4163
worst smoothness           0.4215
worst texture              0.4569
area error                 0.5482
perimeter error            0.5561
radius error               0.5671
worst compactness          0.5910
mean compactness           0.5965
worst concavity            0.6596
mean concavity             0.6964
mean area                  0.7090
mean radius                0.7300
worst area                 0.7338
mean perimeter             0.7426
worst radius               0.7765
mean concave points        0.7766
worst perimeter            0.7829
worst concave 

## Question: (8 min)

#### 2.) Of the top five most correlated variable what is the quatitative difference, with tolerance, between Benign(Negative) and Malignant(Positive)?


In [35]:
top_corr_columns = ['mean perimeter','worst radius','mean concave points','worst perimeter','worst concave points']

In [36]:
train_df.groupby(['target']).agg('mean')[top_corr_columns]

Unnamed: 0_level_0,mean perimeter,worst radius,mean concave points,worst perimeter,worst concave points
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,78.0754,13.3798,0.0257,87.0059,0.0744
1,115.3654,21.1348,0.088,141.3703,0.1822


In [38]:
train_df.groupby(['target']).agg('std')[top_corr_columns]

Unnamed: 0_level_0,mean perimeter,worst radius,mean concave points,worst perimeter,worst concave points
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,11.8074,1.9814,0.0159,13.5271,0.0358
1,21.8547,4.2836,0.0344,29.4571,0.0463


## Question:(10 min)
#### 3.) Create an Logistic regression model based approach to predict target based on optimal recall score.

In [40]:
TEST_SIZE_RATIO = 0.2  

# Setup X and y
X = df_all
y = pd.Series(li_target)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE_RATIO, random_state=0)
print("X_train_0.shape, y_train.shape", X_train.shape, y_train.shape)
print("X_test_0.shape, y_test.shape", X_test.shape, y_test.shape)

X_train_0.shape, y_train.shape (455, 30) (455,)
X_test_0.shape, y_test.shape (114, 30) (114,)


In [41]:
# Train data
print("X_train.shape, y_train.shape", X_train.shape, y_train.shape)

## Print the class distribution of the TARGET for both train and test sets
val_cnts = y_train.value_counts()
print("Class distribution of positive and negative samples in the train set:")
print(val_cnts)
print("Percentage of positive class samples: %s" % "%2f%%" % (100 * val_cnts[1] / len(y_train)))

print("---")
print("X_test.shape, y_test.shape", X_test.shape, y_test.shape)

val_cnts = y_test.value_counts()
print("Class distribution of positive and negative samples in the test set:")
print(val_cnts)
print("Percentage of positive class samples: %s" % "%2f%%" % (100 * val_cnts[1] / len(y_test)))

X_train.shape, y_train.shape (455, 30) (455,)
Class distribution of positive and negative samples in the train set:
0    290
1    165
dtype: int64
Percentage of positive class samples: 36.263736%
---
X_test.shape, y_test.shape (114, 30) (114,)
Class distribution of positive and negative samples in the test set:
0    67
1    47
dtype: int64
Percentage of positive class samples: 41.228070%


Note that in some cases it is easier to train a model if we maintain an exact balance of train and test target values (a stratified sample). Here the positive to negative class ratio happens to already be quite well balanced in the train and test sets.

In [42]:
clf_lr = LogisticRegression()

# Fit the model
clf_lr.fit(X_train, y_train)

# Print Logistic Regression specific attributes
print("intercept_:")
print(clf_lr.intercept_ )
print()
print("coef_:")
print(clf_lr.coef_)

intercept_:
[-0.37268061]

coef_:
[[-1.7095485  -0.07141071 -0.10394832  0.00582353  0.07591408  0.32593914
   0.44768829  0.19695951  0.14338856  0.02191844 -0.06833501 -0.74804799
  -0.31702196  0.12359283  0.00467451  0.05736809  0.08483994  0.02429963
   0.03065546  0.00333667 -1.72718691  0.28367766  0.30229209  0.02137787
   0.13095958  1.0351117   1.31489149  0.39514903  0.39740857  0.10446571]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [45]:
y_pred = clf_lr.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.91      0.95        67
           1       0.88      0.98      0.93        47

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.94       114
weighted avg       0.94      0.94      0.94       114



