##### Auteur: Antoine Cadiou

# Attention : pour fonctionner, ce notebook a besoin des données disponibles sur ces liens :
### https://www.kaggle.com/dataset/5f79f0a1c3a7a203f986c8f7a9328af957ffef60b84bef4ef7216a39f5ca941a
### https://www.kaggle.com/arroqc/siic-isic-224x224-images
### https://www.kaggle.com/dataset/40f0f72b4735520283e9bb32bd11f40241ebb9f7aa6d236ee7736d561d01baaa
#### Il faut ensuite adapter les chemins des données dans le code.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import cv2, os
import seaborn as sns
from skimage import measure

from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, f1_score, roc_curve, roc_auc_score, confusion_matrix
import xgboost as xgb

from keras.models import Sequential, Model
from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, UpSampling2D, Reshape, core, Dropout, Dense, Flatten
import tensorflow as tf

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score
import pickle

In [None]:
# Directory holding the ISIC-2017 project images and their segmentation masks.
path = "../input/tb3-ipr-melanoma-2020/data/PROJECT_Data/"
files = os.listdir(path)
labels = pd.read_csv("../input/tb3-ipr-melanoma-2020/ISIC-2017_Data_GroundTruth_Classification.csv")
#labels.head()

# NOTE(security): pickle can execute arbitrary code while loading; only load
# these files if they come from a trusted source.
# `with` guarantees the file handles are closed once the objects are loaded.
with open("../input/isic224segmented/labels3.pickle", "rb") as fh:
    dataBIG = pickle.load(fh)
with open("../input/isic224segmented/imgs3.pickle", "rb") as fh:
    imgBIG = pickle.load(fh)

# Keep only files whose name splits into exactly two '_'-separated parts
# (e.g. "<prefix>_<id>.<ext>"); files with an extra suffix such as
# "<prefix>_<id>_segmentation.png" split into three parts and are skipped.
names = []
for f in files:
    split = f.split('_')
    if len(split) == 2:
        names.append(split[0] + "_" + split[1].split('.')[0])
names = pd.DataFrame({'image_id': sorted(names)})

# Attach the ground truth by image id rather than by row position, so every
# image gets its own label even if the CSV is ordered differently or lists
# more images than are present in `path`.
data = names.merge(labels, on='image_id', how='left')
data['image_path'] = path + data['image_id'] + '.jpg'
data['segmentation_path'] = path + data['image_id'] + '_segmentation.png'
#data['superpixels_path']=data['image_id']+'_superpixels.png'
data['label'] = data['melanoma'].astype('int32')

data = data.drop(['image_id', 'seborrheic_keratosis', 'melanoma'], axis=1)
#data.head()

In [None]:
def foldHorizontal(img, cx):
    """Fold a 2-D image about the vertical line at column ``cx``.

    The part left of ``cx`` is mirrored onto the part right of it; the
    narrower part is zero-padded on its outer side so both have the same
    width.

    Parameters
    ----------
    img : 2-D numpy array
        Image to fold (the callers in this file pass a segmentation mask).
    cx : int or float
        Column of the fold line. Non-integer values (e.g. a region centroid)
        are rounded to the nearest integer, because array slice bounds must
        be integers.

    Returns
    -------
    Sum of the absolute differences between the right part and the mirrored
    left part (0 when the image is symmetric about the fold line).
    """
    cx = int(round(cx))
    gauche = img[:, :cx]
    droite = img[:, cx:]
    l, cg = gauche.shape
    cd = droite.shape[1]
    # Bring both halves to the same width by padding the narrower one with
    # zeros on the side that is farthest from the fold line.
    if cg > cd:
        droite = np.hstack((droite, np.zeros((l, cg - cd))))
    else:
        gauche = np.hstack((np.zeros((l, cd - cg)), gauche))
    # Mirror the left half onto the right one (reverse the column order).
    gauche_flip = gauche[:, ::-1]
    return np.sum(np.abs(droite - gauche_flip))

def foldVertical(img, cy):
    """Fold a 2-D image about the horizontal line at row ``cy``.

    The part above ``cy`` is mirrored onto the part below it; the shorter
    part is zero-padded on its outer side so both have the same height.

    Parameters
    ----------
    img : 2-D numpy array
        Image to fold (the callers in this file pass a segmentation mask).
    cy : int or float
        Row of the fold line. Non-integer values (e.g. a region centroid)
        are rounded to the nearest integer, because array slice bounds must
        be integers.

    Returns
    -------
    Sum of the absolute differences between the bottom part and the mirrored
    top part (0 when the image is symmetric about the fold line).
    """
    cy = int(round(cy))
    haut = img[:cy, :]
    bas = img[cy:, :]
    lh, c = haut.shape
    lb = bas.shape[0]
    # Bring both halves to the same height by padding the shorter one with
    # zeros on the side that is farthest from the fold line.
    if lh > lb:
        bas = np.vstack((bas, np.zeros((lh - lb, c))))
    else:
        haut = np.vstack((np.zeros((lb - lh, c)), haut))
    # Mirror the top half onto the bottom one (reverse the row order).
    haut_flip = haut[::-1, :]
    return np.sum(np.abs(haut_flip - bas))
    
def getAsymmetry(img, cx, cy, A):
    """Return two asymmetry scores of ``img``, as percentages of ``A``.

    ``img`` is folded about the vertical line at ``cx`` and about the
    horizontal line at ``cy``; each fold yields a mismatch value.

    Returns a tuple ``(A1, A2)`` where ``A1`` is the smaller of the two
    mismatches divided by ``A`` and ``A2`` is their sum divided by ``A``,
    both multiplied by 100.
    """
    fold_costs = (foldHorizontal(img, cx), foldVertical(img, cy))
    smallest = min(fold_costs)
    combined = fold_costs[0] + fold_costs[1]
    return (smallest / A) * 100, combined / A * 100

def getBorderIrregularity(P, SD, GD):
    """Return the border-irregularity index ``P * (1/SD - 1/GD)``.

    The callers in this file pass the region perimeter as ``P``, the minor
    axis length as ``SD`` and the major axis length as ``GD``.
    """
    inverse_sd = 1 / SD
    inverse_gd = 1 / GD
    return P * (inverse_sd - inverse_gd)

def getColorFeatures(imgcol, imgseg, *, all_features=False):
    """Compute colour features comparing the lesion with the surrounding skin.

    Pixels where ``imgseg == 1`` are treated as lesion and pixels where
    ``imgseg == 0`` as skin; any other mask value is ignored. The mean of
    each of the three channels of ``imgcol`` is taken over both pixel sets
    (channels are unpacked as B, G, R).

    Parameters
    ----------
    imgcol : array of shape (H, W, 3)
        Colour image.
    imgseg : array of shape (H, W)
        Segmentation mask with values 1 (lesion) and 0 (skin).
    all_features : bool, keyword-only, default False
        When False, return the 9 features
        ``[F4, F5, F6, F10, F11, F12, F13, F14, F15]``.
        When True, return all 15 features ``[F1, ..., F15]``.

    Returns
    -------
    list of floats
        F1-F3:   lesion channel means normalised by their sum (R, G, B).
        F4-F6:   lesion / skin ratio of the channel means (R, G, B).
        F7-F9:   F4-F6 normalised by their sum.
        F10-F12: lesion - skin difference of the channel means (R, G, B).
        F13-F15: F10-F12 normalised by their sum.
    """
    # Boolean masks select the pixels directly, giving a (k, 3) array each.
    Bl, Gl, Rl = np.mean(imgcol[imgseg == 1], axis=0)
    Bs, Gs, Rs = np.mean(imgcol[imgseg == 0], axis=0)

    F4 = Rl / Rs
    F5 = Gl / Gs
    F6 = Bl / Bs
    F10 = Rl - Rs
    F11 = Gl - Gs
    F12 = Bl - Bs
    diff_sum = F10 + F11 + F12
    F13 = F10 / diff_sum
    F14 = F11 / diff_sum
    F15 = F12 / diff_sum

    if not all_features:
        return [F4, F5, F6, F10, F11, F12, F13, F14, F15]

    # The remaining features are only computed when explicitly requested.
    lesion_sum = Rl + Gl + Bl
    F1 = Rl / lesion_sum
    F2 = Gl / lesion_sum
    F3 = Bl / lesion_sum
    ratio_sum = F4 + F5 + F6
    F7 = F4 / ratio_sum
    F8 = F5 / ratio_sum
    F9 = F6 / ratio_sum
    return [F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15]

In [None]:
# Build the hand-crafted feature matrix for every image listed in `data`.
X = []
N = len(data)

for psegment, pcolor in zip(data.segmentation_path, data.image_path):
    # Load the colour image and its segmentation mask.
    imgcol = cv2.imread(pcolor)
    imgseg = cv2.imread(psegment)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if imgcol is None or imgseg is None:
        raise FileNotFoundError("Cannot read image pair: %s / %s" % (pcolor, psegment))
    imgseg = cv2.cvtColor(imgseg, cv2.COLOR_BGR2GRAY)/255.
    label_imgseg = measure.label(imgseg)
    props = measure.regionprops_table(label_imgseg, properties=['area', 'extent', 'perimeter', 'solidity',
                                                                'major_axis_length', 'minor_axis_length', 'centroid'])

    # Only the first labelled region is used.
    area = props['area'][0]
    perimeter = props['perimeter'][0]
    major = props['major_axis_length'][0]
    minor = props['minor_axis_length'][0]

    # Region properties (shape ratios)
    x = np.array([props['extent'][0],
                  props['solidity'][0],
                  minor / major,
                  (4 * area) / (np.pi * major**2),
                  (np.pi * minor) / perimeter,
                  (4 * np.pi * area) / perimeter**2,
                  perimeter / (np.pi * major)])

    # Asymmetry: the centroid is a float, but the fold position is used as
    # an array slice bound, so round it to the nearest pixel.
    cx = int(round(props['centroid-1'][0]))
    cy = int(round(props['centroid-0'][0]))
    A1, A2 = getAsymmetry(imgseg, cx, cy, area)
    # Border irregularity
    B = getBorderIrregularity(perimeter, minor, major)
    # Colour features
    CD = getColorFeatures(imgcol, imgseg)

    # Collect rows in a list; stacking once after the loop avoids re-copying
    # the whole matrix at every iteration.
    X.append(np.hstack((x, A1, A2, B, CD)))

X = np.vstack(X)
df = pd.DataFrame(X)
# NOTE: the colour columns are numbered F1..Fn by position in the list
# returned by getColorFeatures, not by the F-numbers used inside it.
df.columns = ['extent', 'solidity', 'd/D', '4A/(pi*d^2)', 'pi*d/P', '4*pi*A/P^2', 'P/(pi*D)','A1', 'A2', 'B'] +\
             ['F'+str(i) for i in range(1,len(CD)+1)]
df.head()

In [None]:
# Build the hand-crafted feature matrix for the samples of `dataBIG`/`imgBIG`.
# Samples whose segmentation mask is empty are removed from both containers.
X = []
dropped = []  # positions of the samples with an empty segmentation
for i, filename in enumerate(dataBIG["filename"]):
    imgseg = imgBIG[i]
    imgseg = cv2.cvtColor(imgseg.astype('uint8'), cv2.COLOR_BGR2GRAY)/255.
    # Skip empty segmentations (checked before the image is read from disk).
    if np.all(imgseg == 0):
        dropped.append(i)
        continue

    img_path = "../input/siic-isic-224x224-images/train/" + filename
    imgcol = cv2.imread(img_path)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if imgcol is None:
        raise FileNotFoundError("Cannot read image: %s" % img_path)

    label_imgseg = measure.label(imgseg)
    props = measure.regionprops_table(label_imgseg, properties=['area', 'extent', 'perimeter', 'solidity',
                                                                'major_axis_length', 'minor_axis_length', 'centroid'])

    # Only the first labelled region is used.
    area = props['area'][0]
    perimeter = props['perimeter'][0]
    major = props['major_axis_length'][0]
    minor = props['minor_axis_length'][0]

    # Region properties (shape ratios)
    x = np.array([props['extent'][0],
                  props['solidity'][0],
                  minor / major,
                  (4 * area) / (np.pi * major**2),
                  (np.pi * minor) / perimeter,
                  (4 * np.pi * area) / perimeter**2,
                  perimeter / (np.pi * major)])

    # Asymmetry: the centroid is a float, but the fold position is used as
    # an array slice bound, so round it to the nearest pixel.
    cx = int(round(props['centroid-1'][0]))
    cy = int(round(props['centroid-0'][0]))
    A1, A2 = getAsymmetry(imgseg, cx, cy, area)
    # Border irregularity
    B = getBorderIrregularity(perimeter, minor, major)
    # Colour features
    CD = getColorFeatures(imgcol, imgseg)

    # Collect rows in a list; stacking once after the loop avoids re-copying
    # the whole matrix at every iteration.
    X.append(np.hstack((x, A1, A2, B, CD)))

# Remove the skipped samples in one pass, so that labels and images stay
# aligned with the rows of the feature matrix.
if dropped:
    keep = np.ones(len(dataBIG), dtype=bool)
    keep[dropped] = False
    imgBIG = np.delete(imgBIG, dropped, axis=0)
    dataBIG = dataBIG.iloc[keep].reset_index(drop=True)

X = np.vstack(X)
dfBIG = pd.DataFrame(X)
# NOTE: the colour columns are numbered F1..Fn by position in the list
# returned by getColorFeatures, not by the F-numbers used inside it.
dfBIG.columns = ['extent', 'solidity', 'd/D', '4A/(pi*d^2)', 'pi*d/P', '4*pi*A/P^2', 'P/(pi*D)','A1', 'A2', 'B'] +\
             ['F'+str(i) for i in range(1,len(CD)+1)]
dfBIG.head()

In [None]:
# Scale the features to [0, 1] using the min/max of the training set only.
scaler = MinMaxScaler()

# Training set: features from dfBIG, labels from dataBIG['target'].
dftrain = pd.DataFrame(scaler.fit_transform(dfBIG))
Xtrain = np.array(dftrain)
ytrain = np.array(dataBIG['target']=='malignant').astype('int64')
ytrain_ohe = pd.get_dummies(dataBIG['target'])

# Test set: features from df, labels from data['label'].
# Reuse the scaler fitted on the training set (transform, not fit_transform)
# so that both sets are mapped with the same min/max; test values may
# therefore fall slightly outside [0, 1].
dftest = pd.DataFrame(scaler.transform(df))
Xtest = np.array(dftest)
ytest = np.array(data['label'])
ytest_ohe = pd.get_dummies(data['label'])

Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

In [None]:
# Four candidate classifiers, each preceded by its own min-max scaling step
# so that scaling is re-fitted inside every cross-validation fold.
clf1 = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

clf2 = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', svm.SVC())
])

clf3 = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', RandomForestClassifier())
])

clf4 = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', GradientBoostingClassifier())
])

# 10-fold cross-validated ROC AUC on the training set.
# The built-in 'roc_auc' scorer ranks samples with the model's continuous
# scores (decision_function / predict_proba). make_scorer(roc_auc_score)
# would instead score the hard 0/1 output of predict(), which is not a
# ROC AUC.
cv_options = dict(cv=10, scoring='roc_auc')
scores1 = cross_val_score(clf1, Xtrain, ytrain, **cv_options)
scores2 = cross_val_score(clf2, Xtrain, ytrain, **cv_options)
scores3 = cross_val_score(clf3, Xtrain, ytrain, **cv_options)
scores4 = cross_val_score(clf4, Xtrain, ytrain, **cv_options)

In [None]:
# Bar chart of the mean cross-validated AUC of each classifier, with the
# standard deviation across folds drawn as error bars.
clfs = ['LogisticRegression', 'SVC', 'RandomForest', 'GradientBoosting']
cv_results = (scores1, scores2, scores3, scores4)
means = [fold_scores.mean() for fold_scores in cv_results]
stds = [fold_scores.std() for fold_scores in cv_results]
x_pos = np.arange(len(clfs))

# Draw the bars first, then label the axes.
fig, ax = plt.subplots()
ax.bar(x_pos, means, yerr=stds, align='center', alpha=0.5, ecolor='black', capsize=10)
ax.set_xticks(x_pos)
ax.set_xticklabels(clfs)
ax.set_ylabel('AUC')
ax.set_title("Performances des modèles")
ax.yaxis.grid(True)

### Partie où j'ai tenté des réseaux de neurones et un XGBoost

In [None]:
# def build_model(input_size=(19)):
#     inp = Input(input_size)
#     d1 = Dense(15, activation='relu')(inp)
#     d2 = Dense(10, activation='relu')(d1)
#     d3 = Dense(7, activation='relu')(d2)
#     d4 = Dense(5, activation='relu')(d3)
#     outp = Dense(2, activation='softmax')(d4)
#     model = Model(inp, outp)
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy", tf.keras.metrics.AUC()]) #, tf.keras.metrics.AUC()
#     return model

# model = build_model()

# h = model.fit(
#     x = dftrain,
#     y = ytrain_ohe, #ce sont les y 'one-hot-encoded'
#     validation_split=0.2,
#     batch_size=1,
#     epochs=5,
#     verbose=1
# )

# plt.plot(h.history['val_accuracy'], label='val_accuracy')
# plt.plot(h.history['val_loss'], label='val_loss')
# plt.plot(h.history[list(h.history.keys())[-1]], label='val_auc')
# plt.legend()
# plt.title("Visualisation des différents paramètres de performance de notre modèle")
# plt.show()

# # h.history['val_accuracy'][-1]

In [1]:
# model = xgb.XGBClassifier(objective='binary:logistic', \
#                           max_depth=6, learning_rate=2.5, eval_metric='auc')
# model.fit(Xtrain, ytrain)
# ypred = model.predict(Xtest)
# cm = confusion_matrix(ytest, (ypred>0.5).astype('int32'))
# print("\n", cm)
# print("\n", np.sum(np.diag(cm))/np.sum(cm))

In [None]:
# Xtrain_xgb = xgb.DMatrix(Xtrain, label=ytrain)
# Xtest_xgb = xgb.DMatrix(Xtest)

# param = {'max_depth': 4, 'eta': 1.1, 'objective': 'binary:logistic'}
# param['nthread'] = 4
# param['eval_metric'] = 'auc'

# evallist = [(Xtrain_xgb, 'train')]

# num_round = 10
# bst = xgb.train(param, Xtrain_xgb, num_round, evallist)
# bst.save_model('xgb.txt')

#bst = xgb.Booster({'nthread': 4})  # init model
#bst.load_model('./xgb.txt')  # load data

# ypred = bst.predict(Xtest_xgb)

# cm = confusion_matrix(ytest, (ypred>0.5).astype('int32'))
# print("\n", cm)
# print("\n", np.sum(np.diag(cm))/np.sum(cm))