In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense
from keras.models import Model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns




In [2]:
# Fraction of missing ("?") entries at or above which DataSet.missing_value_process
# discards a column.
USELESS_DATA_BOUNDARY = 0.5

In [3]:
class DataSet(object):
    """Load a CSV file and turn it into a fully numeric, standardised table.

    Pipeline run by ``__init__`` (in this order):

    1. ``useless_value_process`` - drop a fixed list of columns.
    2. ``missing_value_process`` - drop columns that are mostly missing.
    3. ``tag_transfer``          - integer-encode non-numeric columns.
    4. ``imputer_process``       - fill missing cells with the column mode.
    5. ``normalize``             - standardise every column.

    Attributes:
        data: The raw frame exactly as returned by ``pd.read_csv``; never
            modified by the pipeline.
        processed_data: ``pandas.DataFrame`` holding the current pipeline
            output (column names and index are preserved by every stage).

    Caveat: category codes, imputation values and scaling statistics are all
    derived from this instance's own file, so two instances (e.g. a training
    and a test set) are not guaranteed to share the same codes or scale.
    """

    # Placeholder used in the CSV for an unknown value.
    MISSING_TOKEN = "?"
    # Columns removed unconditionally (presumably uninformative for this
    # dataset - TODO confirm with the data dictionary).
    USELESS_COLUMNS = ('examide', 'citoglipton', 'glimepiride-pioglitazone')

    def __init__(self, csv_path):
        """Read ``csv_path`` and run the whole preprocessing pipeline.

        Args:
            csv_path: Path (or any source accepted by ``pd.read_csv``) of the
                CSV file to load.
        """
        self.data = pd.read_csv(csv_path)
        # Work on a copy: the stages below must not alter the raw frame.
        self.processed_data = self.data.copy()

        self.useless_value_process()
        self.missing_value_process()
        self.tag_transfer()
        self.imputer_process()
        self.normalize()

    def useless_value_process(self):
        """Drop the columns listed in ``USELESS_COLUMNS``.

        Columns that are not present are skipped, so the step can be re-run
        safely.
        """
        self.processed_data = self.processed_data.drop(
            columns=list(self.USELESS_COLUMNS), errors="ignore"
        )

    def missing_value_process(self, boundary=None):
        """Drop every column whose share of missing cells reaches the boundary.

        A cell counts as missing when it equals ``MISSING_TOKEN`` or is NaN.

        Args:
            boundary: Fraction in [0, 1]; a column is dropped when its missing
                share is >= this value. Defaults to the notebook-level
                ``USELESS_DATA_BOUNDARY``.
        """
        threshold = USELESS_DATA_BOUNDARY if boundary is None else boundary
        frame = self.processed_data

        # Vectorised per-column missing ratio. With zero rows the mean is NaN,
        # which never satisfies ">=", so nothing is dropped (and nothing is
        # divided by zero).
        missing_share = (frame.eq(self.MISSING_TOKEN) | frame.isna()).mean()
        sparse_columns = missing_share[missing_share >= threshold].index

        self.processed_data = frame.drop(columns=sparse_columns)

    def imputer_process(self):
        """Replace every missing (NaN) cell with its column's most frequent value.

        When several values tie for most frequent, the smallest one is used.
        Columns that contain no observed value at all are left untouched.
        """
        modes = self.processed_data.mode(dropna=True)
        if modes.empty:
            # No rows, or no observed values anywhere: nothing to impute with.
            return
        # Row 0 of ``mode()`` holds the first (smallest) mode of each column.
        self.processed_data = self.processed_data.fillna(modes.iloc[0])

    def tag_transfer(self):
        """Integer-encode every non-numeric column, keeping missing cells as NaN.

        Codes follow the sorted order of the distinct observed values of each
        column (0, 1, 2, ...). Cells equal to ``MISSING_TOKEN`` are NOT given a
        code of their own; they become NaN so that ``imputer_process`` can fill
        them afterwards.
        """
        encoded = self.processed_data.copy()

        # Walk over every column of the frame.
        for column in encoded.columns:
            values = encoded[column]
            # Numeric columns are already usable as they are.
            if pd.api.types.is_numeric_dtype(values):
                continue

            # Turn the placeholder into a real missing value before encoding.
            observed = values.mask(values == self.MISSING_TOKEN)
            # Categorical codes are -1 for missing entries; map those to NaN.
            codes = observed.astype("category").cat.codes
            encoded[column] = codes.astype(float).where(codes >= 0)

        self.processed_data = encoded

    def normalize(self):
        """Standardise every column to zero mean and unit variance.

        The result is kept as a DataFrame (same columns and index) so that
        later steps such as ``to_csv`` keep working.
        """
        frame = self.processed_data
        scaler = StandardScaler()
        scaled = scaler.fit_transform(frame)
        self.processed_data = pd.DataFrame(
            scaled, columns=frame.columns, index=frame.index
        )

    def autoencoder(self, encoding_dim=20, epochs=50, batch_size=256):
        """Train a single-hidden-layer autoencoder on ``processed_data``.

        Args:
            encoding_dim: Width of the encoded (bottleneck) representation.
            epochs: Number of training epochs.
            batch_size: Mini-batch size used for training.

        Returns:
            Array of shape ``(n_rows, encoding_dim)`` with the encoded
            representation of every row of ``processed_data``.
        """
        # processed_data is already standardised by normalize(); train on it
        # directly rather than on the raw frame.
        features = np.asarray(self.processed_data, dtype="float32")
        n_features = features.shape[1]

        # Input placeholder
        input_data = Input(shape=(n_features,))

        # "encoded" is the compressed representation of the input
        encoded = Dense(encoding_dim, activation='relu')(input_data)

        # "decoded" is the lossy reconstruction of the input. The targets are
        # standardised (signed, unbounded), so the output must be linear; a
        # sigmoid could only ever produce values in (0, 1).
        decoded = Dense(n_features, activation='linear')(encoded)

        # Full model: input -> reconstruction
        model = Model(input_data, decoded)

        # Encoder half: input -> encoded representation
        encoder = Model(input_data, encoded)

        model.compile(optimizer='adam', loss='mean_squared_error')

        # Train the model to reproduce its own input
        model.fit(
            features,
            features,
            epochs=epochs,
            batch_size=batch_size,
            shuffle=True,
            validation_split=0.2,
        )

        # Project the data into the lower-dimensional space and hand it back
        return encoder.predict(features)

In [4]:
train_data = DataSet('diabetic_data_training.csv')
test_data = DataSet('diabetic_data_test.csv')

# Wrap in a DataFrame so that to_csv works whether processed_data is a NumPy
# array (the raw StandardScaler.fit_transform output) or already a DataFrame;
# calling .to_csv directly on an ndarray raises AttributeError.
train_processed = pd.DataFrame(train_data.processed_data)
test_processed = pd.DataFrame(test_data.processed_data)

train_processed.to_csv("test_csv.csv", sep=';', index=False)

# Last expression of the cell: show a compact summary instead of dumping the
# full tables.
train_processed.shape, test_processed.shape


In [None]:
# Load the raw training and test tables (training file first, then test).
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('diabetic_data_training.csv', 'diabetic_data_test.csv')
)

# Split the training table: every column but the last is a predictor,
# the last column is the response.
predictor = train_data.iloc[:, :-1].values
response = train_data.iloc[:, -1].values

predictor, response

(array([[149190, 55629189, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        [64410, 86047875, 'AfricanAmerican', ..., 'No', 'No', 'Yes'],
        [500364, 82442376, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        ...,
        [443854148, 41088789, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        [443857166, 31693671, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        [443867222, 175429310, 'Caucasian', ..., 'No', 'No', 'No']],
       dtype=object),
 array(['>30', 'NO', 'NO', ..., 'NO', 'NO', 'NO'], dtype=object))