In [1]:
import os 
import pandas as pd
import numpy as np

# Get dataframe of image filenames
files = []
file_names = []
for file_name in os.listdir('images'):
    # Remove 'S_' and file extensions
    num = file_name.strip('S_').strip('.gif').strip('.GIF')
    
    # Find and replace letters
    num = num.replace("A", "01")
    num = num.replace("B", "02")
    num = num.replace("L", "01")
    num = num.replace("R", "02")
    num = num.replace("M", "02")
    num = num.replace("X", "01")
    num = num.replace("Y", "02")
    num = num.replace("Z", "03")
    num = num.replace("I", "01")
    num = num.replace("O", "02")
    num = num.replace("U", "02")
    num = num.replace("S", "02")
    
    files.append(int(num))
    file_names.append(file_name)
    
df = pd.DataFrame()
df['id'] = files
df['file_name'] = file_names

# Get dataframe of Wisconsin csv
data = pd.read_csv('data.csv')

# Merge dataframes on ID
df_merged = df.merge(data, on='id')

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(data[data['id_join'].str.endswith(('01', '02', '03'))][['id', 'diagnosis']])
#     print(df_merged)


In [2]:
# Traditional machine learning benchmark
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier, plot_importance
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Store labels
y = df_merged["diagnosis"]

# Drop useless columns
df = df_merged.drop(columns=['id', 'diagnosis', 'file_name', 'Unnamed: 32'])

# model = LogisticRegression()
# model = GaussianNB()
model = XGBClassifier()

scores = cross_val_score(model, df, y, cv=5)
print(scores)
print(np.mean(scores), np.std(scores))
df

[0.94642857 0.96396396 0.98198198 0.98181818 0.96363636]
0.9675658125658126 0.013312796964389143


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,16.26,21.88,107.50,826.8,0.11650,0.12830,0.17990,0.07981,0.1869,0.06532,...,17.730,25.21,113.70,975.2,0.14260,0.21160,0.3344,0.1047,0.2736,0.07953
1,16.26,21.88,107.50,826.8,0.11650,0.12830,0.17990,0.07981,0.1869,0.06532,...,17.730,25.21,113.70,975.2,0.14260,0.21160,0.3344,0.1047,0.2736,0.07953
2,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
3,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
4,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
550,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
551,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
552,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [3]:
from PIL import Image

def file_to_array(file_name):
    """For each string `file_name`, open that image and return a numpy array."""
    img = Image.open('images/'+ file_name)
    img = img.convert("RGB")
    return np.array(img)

df_merged['image'] = df_merged['file_name'].apply(file_to_array)

In [6]:
# CNN time - split the data.
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils

df_cnn = df_merged[['diagnosis', 'image']]
# Cast labels as integers, 1 for Malignant, 0 for benign
df_cnn['diagnosis'] = (df_cnn['diagnosis'] == 'M').astype(int) 

def create_model():
    # Specify model
#     model = Sequential()
#     model.add(Conv2D(32, (5, 5), input_shape=(480, 640, 3), padding='same', activation='relu'))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Conv2D(32, (5, 5), activation='relu', padding='same'))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Conv2D(32, (5, 5), activation='relu', padding='same'))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Flatten())
#     model.add(Dense(120, activation='relu'))


    model = Sequential()
    model.add(Conv2D(32, (3, 3), padding='same', input_shape=(480, 640, 3)))
    model.add(Activation('relu'))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(64, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    return model

#     model = Sequential()
#     model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(480, 640, 3)))
#     model.add(Conv2D(64, (3, 3), activation='relu'))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Dropout(0.25))
#     model.add(Flatten())
#     model.add(Dense(128, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(1, activation='sigmoid'))
#     return model
    
    
#     model = Sequential()
#     model.add(Dense(100, input_dim=921600, activation='relu'))
#     model.add(Dense(100, activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))
    

for _ in range(1):
    # Split data
    df_train, df_test = train_test_split(df_cnn, test_size=0.2)
    df_train_X, df_train_y = df_train['image'], df_train['diagnosis']
    df_test_X, df_test_y = df_test['image'], df_test['diagnosis']

    train_X = np.stack(df_train_X.values)
    train_y = df_train_y.values
    test_X = np.stack(df_test_X.values)
    test_y = df_test_y.values
    
    train_X = train_X.astype("float32")
    test_X = test_X.astype("float32")
    train_X = train_X / 255
    test_X = test_X / 255
    
    # Create and compile model
    model = create_model()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
#     # Reshape if necessary
#     train_X = train_X.reshape(train_X.shape[0], train_X.shape[1]*train_X.shape[2]*train_X.shape[3])
#     test_X = test_X.reshape(test_X.shape[0], test_X.shape[1]*test_X.shape[2]*test_X.shape[3])

    # Train model and output cross-validated accuracy
    model.fit(train_X, train_y, validation_data=(test_X, test_y), epochs=12, batch_size=12, verbose=1)
    scores = model.evaluate(test_X, test_y, verbose=0)
    print(scores)
    print(test_y)
    print(model.predict_classes(test_X, verbose=1))

    #     print("Accuracy: %.2f%%" % (scores[1]*100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


ValueError: could not convert string to float: '[[[240 248 248]\n  [240 248 248]\n  [240 248 248]\n  ...\n  [232 248 248]\n  [232 248 248]\n  [232 248 248]]\n\n [[240 248 248]\n  [240 248 248]\n  [240 248 248]\n  ...\n  [232 248 248]\n  [232 248 248]\n  [232 248 248]]\n\n [[208 248 248]\n  [232 248 248]\n  [208 248 248]\n  ...\n  [208 248 248]\n  [208 248 248]\n  [208 248 248]]\n\n ...\n\n [[240 248 248]\n  [240 248 248]\n  [232 248 248]\n  ...\n  [232 248 248]\n  [232 248 248]\n  [232 248 248]]\n\n [[240 248 248]\n  [232 248 248]\n  [208 248 248]\n  ...\n  [232 248 248]\n  [208 248 248]\n  [208 248 248]]\n\n [[232 248 248]\n  [232 248 248]\n  [232 248 248]\n  ...\n  [208 248 248]\n  [208 248 248]\n  [208 248 248]]]'

In [None]:
sum(test_y == 0)/len(test_y)