In [1]:
import pandas as pd
import os
from os.path import dirname, abspath
import numpy as np
import sklearn
import warnings
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
from config import config_variables
print(config_variables)

{'PIPELINE_SWITCH': {'PIPELINE_TEST': False}, 'FEATURE_SET_VARS': {'IMPUTATION': 'Linear', 'MIN_THRESHOLD': 5}, 'PANDAS': {'WEIGHT': 80, 'HEIGHT': 180, 'CONTROL': False}, 'FOLDERS': {'DATA': 'data', 'RAW_FEATURESET': 'raw_featureset', 'TEMPORAL': 'temporal_data', 'POST_PROCESSED_FEATURESET': 'post_processed_featureset'}, 'FEATURE_SET': {'RAW_FEATURESET_EXCEL': 'Datos.xlsx', 'SPLIT_LABEL_NAME': 'HeartDisease', 'FEATURESET_EXCEL': 'Data_featureset.xlsx', 'LABELS_EXCEL': 'Data_labelset.xlsx', 'POSTPROCESSING_STEPS': {'DATA_IMPUTATION': True}}}


In [3]:
def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    
    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))
    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [12]:
class HeartDataFile(object):
    def __init__(self, config_variables):
        self.config = config_variables
        self.base_path = (abspath(os.getcwd()))

        self.data_folder = config_variables['FOLDERS']['DATA']
        self.raw_data_folder = config_variables['FOLDERS']['RAW_FEATURESET']
        self.raw_excel_file = config_variables['FEATURE_SET']['RAW_FEATURESET_EXCEL']
        self.path_input_excel = os.path.join(self.base_path, self.data_folder, self.raw_data_folder, self.raw_excel_file)
        self.raw_heart_df = pd.read_excel(self.path_input_excel)

        self.temporal_data_folder = config_variables['FOLDERS']['TEMPORAL']
        self.label_name = config_variables['FEATURE_SET']['SPLIT_LABEL_NAME']
        self.target_name = config_variables['FEATURE_SET']['SPLIT_LABEL_NAME']

        self.feature_file = config_variables['FEATURE_SET']['FEATURESET_EXCEL']
        self.output_feature_file = os.path.join(self.base_path, self.data_folder, self.temporal_data_folder, self.feature_file)
        self.features = pd.read_excel(self.output_feature_file)
    
  
        self.label_file = config_variables['FEATURE_SET']['LABELS_EXCEL']
        self.output_label_file = os.path.join(self.base_path, self.data_folder, self.temporal_data_folder, self.label_file)
        

        #self.post_data_folder = config_variables['FOLDERS']['POST_PROCESSED_FEATURESET']
        #self.post_processed_featureset_excel_file = config_variables['FEATURE_SET']['POSTPROCESSED_FEATURESET_EXCEL']
        #self.path_output_excel = os.path.join(self.base_path, self.data_folder, self.post_data_folder, self.post_processed_featureset_excel_file)
        #self.feature_heart_df = pd.read_excel(self.output_feature_file)
    
    def split_labels_target(self, config_variables):
         #Check if folder exists
        if os.path.exists(os.path.join(self.base_path, self.data_folder, self.temporal_data_folder)) == False:
            #Create folder
            os.mkdir(os.path.join(self.base_path, self.data_folder, self.temporal_data_folder))
        else:
             print('Folder already exists')
        
        #Check if featureset file exists 
        if os.path.exists(os.path.join(self.base_path, self.data_folder, self.temporal_data_folder, self.output_feature_file)) == True:
            print('Feature set file already exists')

        #Check if label file exists 
        if os.path.exists(os.path.join(self.base_path, self.data_folder, self.temporal_data_folder, self.output_label_file)) == True:
            print('Label set file already exists')

        #Split labels
        
        self.feature_set = self.raw_heart_df.loc[:, self.raw_heart_df.columns != self.label_name]
        self.feature_df = self.feature_set.to_excel(self.output_feature_file)
        
        #Split target 
        
        self.label_set = self.raw_heart_df[self.label_name]
        self.label_df = self.label_set.to_excel(self.output_label_file)


    def data_imputation(self, config_variables):
        #Read excel feature file
        self.feature_excel = os.path.join(self.base_path, self.data_folder, self.temporal_data_folder, self.output_label_file)
        self.feature_heart_df = pd.read_excel(self.feature_excel)

        #Load input featureset
        #inputdata_df = self.feature_heart_df

        #Separate numeric variables 
        self.integer_features = self.feature_heart_df.select_dtypes(exclude="object").columns
        #print(self.integer_features)

        #Separate categorical variables 
        self.categorical_features = self.feature_heart_df.select_dtypes(include="object").columns
        #print(self.categorical_features)

        #Delete outliers
        #self.feature_heart_df.boxplot(self.integer_features)

        #Define num_pipeline
        #Standarization 
        num_pipeline = Pipeline([
               ('std_scaler', StandardScaler()),
        ])

        #Normalization
        num_pipeline = Pipeline([
               ('scaler', MinMaxScaler()),
        ])

        
        #Apply One Hot Encoding
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, self.integer_features),
            ("cat", OneHotEncoder(), self.categorical_features),
        ])

        heart_prepared = full_pipeline.fit_transform(self.feature_heart_df)
        #print(heart_prepared)

        names = get_feature_names(full_pipeline)
        #print(names)

        #Delete first column 
        features = pd.DataFrame(heart_prepared,columns=names)
        features_final  = features.iloc[: , 1:]
        #print(features_final)
 
    def build_featureset_definitive(self):
        #Check if featureset exists
        if os.path.exists(self.path_output_excel) == False:
            #Check if folder exists
            if os.path.exists(os.path.join(self.base_path, self.data_folder, self.post_data_folder)) == False:
                #Create folder
                os.mkdir(os.path.join(self.base_path, self.data_folder, self.post_data_folder))
            
            #Load input featureset
            #inputdata_df = self.feature_heart_df
            
            #Preprocessing steps
            #if self.config['FEATURE_SET']['POSTPROCESSING_STEPS']['DATA_IMPUTATION']:
            #    inputdata_df = self.data_imputation(inputdata_df)

            #Save postprocessed featureset 
            #inputdata_df.to_excel(self.path_output_excel)

        else: 
            print('No need to build featureset, file already exists')

In [10]:
heartdata = HeartDataFile(config_variables)

In [11]:
heartdata.split_labels_target(config_variables)

Folder already exists


TypeError: join() argument must be str, bytes, or os.PathLike object, not 'DataFrame'

In [16]:
#I should delete the first column 
heartdata.data_imputation(config_variables)


Index(['Unnamed: 0', 'Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR',
       'Oldpeak'],
      dtype='object')




In [None]:
class Preprocessing
    def __init__(self, arg):
        self.arg = arg (attributes necessary for the loading of data)
        
    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(rank={self.rank!r}, suit={self.suit!r})')

    def __eq__(self, other):
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (self.rank, self.suit) == (other.rank, other.suit)

In [None]:
class Data:
      def __init__(self, arg):
          self.arg = arg (attributes necessary for the loading of data)
          
      def load_data(self):
          "load the data from somewhere and reshape it so that it can be used by method1"
          return(data)

      def method1(self, data):
          "do some transformation on the data"
          return(data1)

      def method2(self, data1):
          "do some transformation on the data1"
          return(data2)

      def method3(self, data2):
           "do some transformation on the data2"
          return(data3)

      def run(self):
           data = self.load()
           data1 = self.method1(data)
           data2 = self.method2(data1)
           data3 = self.method3(data2)
           return(data3)

In [None]:
class Standardization(keras.layers.Layer):
    def adapt(self, data_sample):
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)
def call(self, inputs):
    return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())

In [None]:
std_layer = Standardization()
std_layer.adapt(data_sample)

In [None]:
#Separate target variable
col = "HeartDisease"
heart = data.loc[:, data.columns != col]
heart.head()
heart

In [None]:
numeric_heart = heart.select_dtypes(include=[np.number])
categorical_heart = heart.select_dtypes(exclude=[np.number])

numeric_heart.shape[1]
categorical_heart.shape[1]


In [None]:
#If data.isnull().any()

In [None]:
num_pipeline = Pipeline([
               ('std_scaler', StandardScaler()),
    ])

In [None]:

num_attribs = integer_features
cat_attribs = categorical_features
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])