In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer # sklearn comes loaded with dataset, no need for 'read_csv()''

# --- Data preparation ---
# load_breast_cancer() hands back the dataset as a dictionary-like object.
cancer_data = load_breast_cancer()

# Show the available keys, then the dataset's own description, one per line.
# Per that description: 10 base measurements, each reported as a mean, a
# standard error and a "worst" value, i.e. 10 x 3 = 30 features in total.
# Working out which additional features to calculate from raw data like this
# is called feature engineering.
print(cancer_data.keys(), cancer_data['DESCR'], sep='\n')


dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for ea

In [5]:
# Build the DataFrame in one expression: one column per feature (named after
# the dataset's feature names), plus a trailing 'target' column of labels.
df = pd.DataFrame(
    data=cancer_data['data'],
    columns=cancer_data['feature_names'],
).assign(target=cancer_data['target'])

# Sanity check: first five datapoints.
print(df.head())


   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [7]:
# Show which class each integer in 'target' stands for. The array is indexed
# by label, so 0 = malignant and 1 = benign.
print(cancer_data['target_names'])
print()

## Build the logistic regression model
from sklearn.linear_model import LogisticRegression

# Feature matrix: the feature columns, selected via the list of names stored
# under the dataset's 'feature_names' key. Target vector: the 0/1 labels.
X = df[cancer_data.feature_names].values
y = df['target'].values

# The solver is set to 'liblinear' explicitly rather than left at the default.
# NOTE(review): the original author reported that the default 'lbfgs' "doesn't
# work"; presumably it stops at its iteration limit on these unscaled features
# (ConvergenceWarning) -- TODO confirm. Scaling the features or raising
# max_iter would be the usual alternatives.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(solver='liblinear').fit(X, y)

## Predict
# predict() expects a 2-D array (one row per datapoint), so a single datapoint
# is passed as [X[0]] with double brackets. X[0] with single brackets would be
# a 1-D array and cause an error.
print("prediction for datapoint 0:", model.predict([X[0]]))
print()

# Predictions for the first 25 datapoints, followed on the next line by the
# first 25 actual target values, for a side-by-side comparison.
print(model.predict(X[:25]), y[:25], sep='\n')

# Accuracy score of the model.
# NOTE: this is computed on the same X, y the model was fitted on, so it is
# training accuracy rather than an estimate of performance on unseen data.
print(model.score(X, y))

# code and comments by github.com/alandavidgrunberg


['malignant' 'benign']

prediction for datapoint 0: [0]

[0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0]
0.9595782073813708
