In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from datetime import datetime

from minisom import MiniSom
import matplotlib.pyplot as plt

In [23]:
class Data:
    """Adult dataset, separated into one-hot encoded features and an encoded target.

    Attributes set by ``__init__``:
        X: DataFrame of features after ``clean`` and ``pd.get_dummies``.
        y: NumPy array holding the label-encoded ``income`` column.
        y_encoder: The ``LabelEncoder`` fitted on ``income`` (use it to map
            encoded labels back to the original strings).
        random_state: Seed given to the constructor, reused by
            ``train_test_split`` when no explicit seed is passed.
    """

    # NOTE(review): the default path is specific to one machine; it is kept
    # only for backward compatibility. Pass ``path`` explicitly elsewhere.
    def __init__(self, path='D:/昱安在政大/108學年度上學期課程/資安/data/adult.csv',
                 random_state=None):
        """Load the CSV at ``path``, shuffle it, clean it and encode it.

        Args:
            path: Location of the Adult CSV file.
            random_state: Optional seed for the row shuffle. ``None`` (the
                default) keeps the original non-deterministic behaviour.
        """
        self.random_state = random_state

        df = shuffle(pd.read_csv(path, engine='python'), random_state=random_state)
        df = self.clean(df)

        target = df.pop('income')

        # Label encode y
        self.y_encoder = LabelEncoder()
        self.y = self.y_encoder.fit_transform(target)

        # One-hot encode X. get_dummies converts every object/category column,
        # so no further dtype conversion is needed afterwards.
        self.X = pd.get_dummies(df)

    def clean(self, df):
        """Return ``df`` without rows containing '?' and without the 'fnlwgt' column."""
        return df.replace('?', np.nan).dropna().drop('fnlwgt', axis=1)

    def train_test_split(self, test_size=0.2, random_state=None):
        """Split ``X`` / ``y`` into train and test parts.

        Args:
            test_size: Fraction of rows placed in the test part (default 0.2).
            random_state: Optional seed; falls back to the seed given to the
                constructor when omitted.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``. The targets are
            returned as Series indexed like the matching feature frame.
        """
        if random_state is None:
            random_state = getattr(self, 'random_state', None)
        # Inside the method body this name resolves to the module-level
        # sklearn function, not to this method.
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state)
        y_train = pd.Series(y_train, index=X_train.index)
        y_test = pd.Series(y_test, index=X_test.index)
        return (X_train, X_test, y_train, y_test)

# Data + SOM

In [None]:
# Train a self-organizing map on the quasi-identifier columns and map every
# record to the grid coordinates of its best-matching unit.
som_start = datetime.now()

data = Data()

quasi_identifiers = ['age', 'educational-num',
                     'capital-gain', 'capital-loss', 'hours-per-week']
features = data.X[quasi_identifiers]
target = data.y

# Hyper-parameters to tune.
width = 150
height = 150
sigma = .9
lr = .2
epochs = 1e5  # use a small value (e.g. 10) for a quick smoke test

verbose = True
log = 1000  # print progress every `log` records

som = MiniSom(width, height, features.shape[1], sigma, lr)
som.train_random(features.values, int(epochs), verbose=verbose)

# Collect the three parallel lists directly. Packing the
# (winner, features, label) records into a single np.array would create a
# ragged array, which recent NumPy versions reject with a ValueError.
new_X = []  # SOM grid coordinates of each record's winning neuron
X = []      # original quasi-identifier values
y = []      # encoded target
for step, (row, label) in enumerate(zip(features.values, target)):
    new_X.append(np.asarray(som.winner(row)))
    X.append(np.asarray(row))
    y.append(np.asarray(label))
    if verbose and step % log == 0:
        print(f'*Creating SOM: [{step}/{features.shape[0]}]')
new_data = (new_X, X, y)

print("Time required for SOM: " + str(datetime.now()-som_start))

# Show the record count and a short preview instead of dumping every record.
len(new_data[0]), new_data[0][:5]

In [20]:
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from collections import Counter, defaultdict

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

In [21]:
class TrainingModel:
    """Small fully connected binary classifier built with Keras ``Sequential``.

    The network is Dense(64) -> Dropout(0.3) -> Dense(128) -> Dropout(0.3)
    -> Dense(128) -> Dense(1, sigmoid), compiled with Adam and binary
    cross-entropy.
    """

    def __init__(self, input_shape):
        """Build and compile the network.

        Args:
            input_shape: Shape tuple of one sample, e.g. ``(5,)``.
        """
        self.model = Sequential()
        self.model.add(Dense(64, activation='relu', input_shape=input_shape))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(128, activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(128, activation='relu'))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(optimizer='adam',
                           loss='binary_crossentropy',
                           metrics=['accuracy'])

    def fit(self, data, label, epochs=1, batch_size=128, verbose=0):
        """Train the network; the defaults reproduce the original settings."""
        self.model.fit(data, label, epochs=epochs, batch_size=batch_size,
                       verbose=verbose)

    def predict_proba(self, data):
        """Return the sigmoid outputs of the network for ``data``."""
        return self.model.predict(data, verbose=0)

    def predict(self, data):
        """Return 0/1 class labels (int32) for ``data``.

        ``Sequential.predict_classes`` is no longer available in current
        TensorFlow releases; thresholding the sigmoid output at 0.5 is the
        documented replacement for a single-unit sigmoid output layer.
        """
        return (self.predict_proba(data) > 0.5).astype('int32')

    def evaluate(self, X_test, y_test, print_report=True):
        """Score the model on a held-out set.

        Args:
            X_test: Test features.
            y_test: True 0/1 labels.
            print_report: When true, also print accuracy, confusion matrix,
                classification report and AUC.

        Returns:
            Dict with ``accuracy``, ``auc_score`` and the entries of the
            weighted-average row of the classification report.
        """
        # One forward pass serves both the probabilities and the labels.
        y_predicted_probs = self.predict_proba(X_test)
        y_predicted = (y_predicted_probs > 0.5).astype('int32')

        if print_report:
            self.print_report(y_test, y_predicted, y_predicted_probs)

        report = classification_report(y_test, y_predicted, output_dict=True)
        return {
            'accuracy': accuracy_score(y_test, y_predicted),
            'auc_score': roc_auc_score(y_test, y_predicted_probs),
            **report['weighted avg'],
        }

    def print_report(self, test, predicted, predicted_probs):
        """Print accuracy, confusion matrix, classification report and AUC."""
        accuracy = accuracy_score(test, predicted)
        report = classification_report(test, predicted)
        matrix = confusion_matrix(test, predicted)

        print('Accuracy score: {:.5f}'.format(accuracy))
        print('-' * 20)
        print('Confusion Matrix:')
        print(matrix)
        print('-' * 20)
        print(report)
        print('-' * 20)
        print('AUC score: {:.5f}'.format(roc_auc_score(test, predicted_probs)))

# K-means + Model

In [None]:
# Feed the SOM-projected data set into K-means.
# The number of clusters depends on the k of k-anonymity: start from
# K = n // k (n = number of records, the ideal case) and decrease K until
# every cluster holds at least k records.
# The run time of the perturbation stage is measured.

sizes = [5, 10, 15, 20, 30, 50]
feature_columns = ['age', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

som_coords = np.asarray(new_data[0])  # one row of SOM grid coordinates per record
n_records = len(som_coords)

for k in sizes:

    perturb_start = datetime.now()

    print("K-Anonymity: k=" + str(k))
    K = max(1, n_records // k)  # Set the initial number of clusters
    D = max(1, 500 // k)        # Set how many clusters to drop for every next loop
    while True:
        clf = KMeans(n_clusters=K)
        clf.fit(som_coords)
        c = Counter(clf.labels_)
        # A label that never occurs counts as a cluster of size 0.
        smallest = min(c[i] for i in range(K))
        print("K=" + str(K) + ", smallest cluster has " + str(smallest) + " data points")
        # Stop when every cluster is large enough; K == 1 cannot shrink further.
        if smallest >= k or K == 1:
            break
        K = max(1, K - D)
    print("The Resulting number of clusters: " + str(K))

    # Restore the original feature values next to the cluster labels,
    # perturb them according to the clustering, and prepare the result
    # for the neural network.
    clustered = pd.concat(
        [pd.DataFrame(np.asarray(new_data[1]), columns=feature_columns),
         pd.DataFrame({'target': np.asarray(new_data[2]).ravel()}),
         pd.DataFrame({'cluster': clf.labels_})],
        axis=1)

    # Replace every quasi-identifier by the mean of its cluster in one
    # vectorized pass; target and cluster columns are carried over unchanged.
    data_perturbed = clustered.copy()
    data_perturbed[feature_columns] = (
        clustered.groupby('cluster')[feature_columns].transform('mean'))

    data_sorted = data_perturbed.sort_values(by=['cluster'])
    data_ready = data_sorted.drop('cluster', axis=1)

    print("Time required for perturbation: " + str(datetime.now()-perturb_start))

    # Neural network

    X_train, X_test, y_train, y_test = train_test_split(
        data_ready[feature_columns].to_numpy(dtype='float32'),
        data_ready[['target']].to_numpy(dtype=int),
        test_size=0.2)
    model = TrainingModel((len(feature_columns),))
    model.fit(X_train, y_train)
    model.evaluate(X_test, y_test, print_report=True)