In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Baseline model

- **Proportion of largest class represented**    
- **Logistic Regression**

In [2]:
df = pd.read_csv('../dataset/HAM10000_metadata.csv')
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
df = df.dropna()

In [4]:
df.dx.value_counts()

nv       6660
mel      1111
bkl      1089
bcc       514
akiec     327
vasc      142
df        115
Name: dx, dtype: int64

## Proportion of 'nv' label

In [5]:
df[df['dx'] == 'nv'].shape[0] / df.shape[0]

0.668808997790721

## Logistic Regression

In [6]:
scaler = StandardScaler()

df['age'] = scaler.fit_transform(df[['age']])

In [7]:
dummies = pd.get_dummies(df[['dx_type', 'sex', 'localization']])
dummies

Unnamed: 0,dx_type_confocal,dx_type_consensus,dx_type_follow_up,dx_type_histo,sex_female,sex_male,sex_unknown,localization_abdomen,localization_acral,localization_back,...,localization_face,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity
0,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10010,0,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10011,0,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10012,0,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10013,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [8]:
df['index'] = 1
dummies['index'] = 1

merged_df = pd.concat([df, dummies], axis=1)
merged_df.drop('index', axis=1, inplace=True)

merged_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dx_type_confocal,dx_type_consensus,dx_type_follow_up,...,localization_face,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity
0,HAM_0000118,ISIC_0027419,bkl,histo,1.658214,male,scalp,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,HAM_0000118,ISIC_0025030,bkl,histo,1.658214,male,scalp,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,HAM_0002730,ISIC_0026769,bkl,histo,1.658214,male,scalp,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,HAM_0002730,ISIC_0025661,bkl,histo,1.658214,male,scalp,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,HAM_0001466,ISIC_0031633,bkl,histo,1.363537,male,ear,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X = merged_df.drop(['lesion_id', 'image_id', 'dx', 'dx_type', 'sex', 'localization'], axis=1)
y = merged_df['dx']

In [10]:
log_model = LogisticRegression(random_state=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

log_model.fit(X_train, y_train)

log_model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7128514056224899

In [11]:
def clean_df():
    pass   # return df

def baseline_model(df, model='logistic regression', test_size=0.2):
    
    df = clean_df(df)
    
    X = df.drop(['lesion_id', 'image_id', 'dx', 'dx_type', 'sex', 'localization'], axis=1)
    y = df['dx']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    if model == 'logistic regression':
        log_model = LogisticRegression()
        log_model.fit(X_train, y_train)
        return log_model.score(X_test, y_test)
    if model == 'nearest neighbors':
        knn_model = KNeighborsClassifier()
        knn_model.fit(X_train, y_train)
        return knn_model.score(X_test, y_test)

# Model building

- **VGG16**   
- **Resnet**   
- **Inception**   
- **Densenet**

In [26]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121

from tensorflow.data import Dataset 

In [27]:
def build_model(selection='vgg16'):
    '''
    Use VGG16, Resnet or Densenet as base models depending on input
    Set VGG16 model as default if no parameter is added
    Freeze some of the layers of each model and experiment with the final 3-4 layers
    '''
    #Specify input shape of the data
    input_shape = (75, 100, 3)

    # Implement VGG16 model
    if selection == 'vgg16':
        model = VGG16(weights='imagenet',
                      input_shape=input_shape,
                      include_top=False)

    # Implement ResNet model
    if selection == 'resnet':
        model = ResNet50(weights='imagenet',
                         input_shape=input_shape,
                         include_top=False,
                         classes=7)

    # Implement DenseNet model
    if selection == 'densenet':
        model = DenseNet121()

    # Make pre-trained layers non iterable and add final layers
    for layer in model.layers:
        layer.iterable = False
    
    x = model.layers.output
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    output = Dense(6, activation='softmax')(x)
    
    model = Model(inputs=model.input, outputs=output)

    return model

In [28]:
def compile_model(model):
    '''
    Compile the model with adam optimizer, choose accuracy as the metrics
    '''
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [29]:
model = build_model()
model.summary()

AttributeError: 'list' object has no attribute 'output'

In [30]:
def fit_model():
    '''
    Fit the model using early stopping criteria
    '''
    es = EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1, restore_best_weights=True)

    history = model.fit(X_train, y_train,
                      validation_split=0.2,
                      epochs=100,
                      batch_size=16,
                      callbacks=[es])
    return history