In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.validation import check_is_fitted
import glob

In [2]:
df = pd.read_csv("clean_dataset.csv", dtype = {"cnae": str})

In [3]:
df

Unnamed: 0,cnae,p49100_h1,p40800_h1,p40100_mas_40500_h1,p31200_h1,p32300_h1,p10000_h1,p20000_h1,target_status
0,6420,5.469000e+06,-9.649000e+06,5.257400e+07,4.561200e+07,1.474900e+07,1.236410e+08,2.838500e+07,0
1,6419,0.000000e+00,0.000000e+00,4.619200e+07,0.000000e+00,0.000000e+00,1.339125e+09,1.026990e+08,0
2,7010,1.911000e+06,-2.529000e+06,3.567900e+07,1.149100e+07,6.909000e+06,6.484900e+07,3.111500e+07,0
3,4299,1.237430e+06,-5.139340e+05,3.243692e+07,4.977184e+06,4.288033e+06,3.337327e+07,4.985911e+06,0
4,6420,4.554036e+06,-3.087892e+06,2.982766e+07,2.697055e+07,5.497509e+06,1.067062e+08,4.068739e+07,0
...,...,...,...,...,...,...,...,...,...
34396,4110,-2.780139e+01,-6.235400e+01,6.664871e+03,0.000000e+00,0.000000e+00,2.110689e+03,1.976485e+03,1
34397,4399,2.613884e+03,-2.394252e+03,6.660514e+03,4.961825e+04,3.071683e+03,5.841598e+04,3.148817e+03,1
34398,7120,-2.199949e+03,-7.873460e+02,6.640654e+03,1.396128e+03,1.176326e+03,1.643043e+04,1.105404e+04,1
34399,4771,5.225575e+02,-1.332870e+02,6.639432e+03,3.989042e+03,7.580835e+02,9.566275e+03,1.960267e+03,0


In [4]:
# _h0, _h1, _h2
# _h0: history 0, here h0 means the year 2017 (historia 0, aquí h0 significa el año 2017)
# _h1: history -1, here h1 means the year 2016 (historia -1, aquí h1 significa el año 2016)
# _h2: history -2, here h2 means the year 2015 (historia -2, aquí h2 significa el año 2015)

# Ebita Margin - Ebitda / Turn over (Ventas)
# p49100: Profit (Resultado del ejercicio)
# p40800: Amortization (Amortización) 
# p40100: Sales Turnover (Ingresos de Explotación)
# p40500: Other sales (Otros Ingresos)
df['ebitda_income'] = (df.p49100_h1+df.p40800_h1)/(df.p40100_mas_40500_h1) 

# Total Debt / Ebita 
# p31200: Short Term Debt / Deuda a corto plazo
# p32300: Long Term Debt / Deuda a largo plazo
# p49100: Profit (Resultado del ejercicio)
# p40800: Amortization (Amortización) 
df['debt_ebitda'] =(df.p31200_h1 + df.p32300_h1) /(df.p49100_h1+df.p40800_h1) 

# rraa_rrpp: Financial leveraging / apalancamiento financiero 
# p10000: Total Assets / Total activos
# p20000: Own Capital / Patrimonio neto
df['rraa_rrpp'] = (df.p10000_h1 - df.p20000_h1) /df.p20000_h1

# Log of Operating Income
df['log_operating_income'] = np.log(df.p40100_mas_40500_h1)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
df_clean = df[['ebitda_income','debt_ebitda','rraa_rrpp','log_operating_income','target_status', 'cnae']].replace([np.inf, -np.inf], np.nan).dropna()
X = df_clean[['ebitda_income','debt_ebitda','rraa_rrpp','log_operating_income', 'cnae']]
y = df_clean['target_status']

In [6]:
#Categrical features to pass down the categorical pipeline 
categorical_features = ["cnae"]

#Numerical features to pass down the numerical pipeline 
numerical_features = ['ebitda_income','debt_ebitda','rraa_rrpp','log_operating_income']

In [7]:
class CNAE_Transformer(BaseEstimator, TransformerMixin ):   

    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):  
        X = X.copy()
        X.loc[:, "sector"] = X.cnae.str[:2]
        X.sector = X.sector.str.strip()
        X = X.replace({"sector":""}, "missing")
        return X

In [8]:
class Mean_Imputer(BaseEstimator, TransformerMixin ):   

    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):  
        numeric_column_names = X.select_dtypes(include =["float64", "int"]).columns
        X = X.copy()
        X[numeric_column_names] = X[numeric_column_names].fillna(X.mean())
        return X

In [47]:

class GroupNormalizer(BaseEstimator, TransformerMixin):
    '''
    Class used for imputing missing values in a pd.DataFrame using either mean or median of a group.
    
    Parameters
    ----------    
    group_cols : list
        List of columns used for calculating the aggregated value 
    target : str
        The name of the column to impute
    metric : str

    Returns
    -------
    X : array-like
        The array with imputed values in the target column
    '''
    def __init__(self, group_cols, target):
        
        self.group_cols = group_cols
        self.target = target
    
    def fit(self, X, y=None):
        
        assert pd.isnull(X[self.group_cols]).any(axis=None) == False, 'There are missing values in group_cols'
        
        impute_map = X.groupby(self.group_cols)[self.target].agg([np.mean, np.std]) \
                                                            .reset_index(drop=False)
        
        self.impute_map_ = impute_map.fillna(impute_map.median())

        
        return self
    
    def normalizer(self, df):

        for sel in self.target:
            df[sel] = (df[sel] - df[(sel,'mean')])/df[(sel,'std')]
        return df
        
    
    def transform(self, X, y=None):
        
        # make sure that the imputer was fitted
        check_is_fitted(self, 'impute_map_')
        
       # X = X.copy()
        df_final = pd.DataFrame(columns = X.columns)
        #Concateno la tabla, con los valores de std y means calculados previamente
        #de esta forma no tengo que preocuparme luego por el acceso a los datos ya que los tengo en cada fila de la tabla
        df_final = pd.merge(X, self.impute_map_, on='sector')
        df_final = self.normalizer(df_final)
        #limpio el dataset antes de devolverlo
        df_final = df_final.iloc[:,list(range(len(self.target)+2))]
  
            
        return df_final

In [48]:
pp = Pipeline([("CNAE_Transformer", CNAE_Transformer()), ("Mean_Imputer", Mean_Imputer()), ("standarize", GroupNormalizer(["sector"], numerical_features))])
Z = pp.fit_transform(X)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [49]:
Z

Unnamed: 0,ebitda_income,debt_ebitda,rraa_rrpp,log_operating_income,cnae,sector
0,0.029133,-0.002067,-0.025911,3.569887,6420,64
1,0.029134,0.056674,-0.035501,3.278373,6420,64
2,0.029134,0.043782,-0.036295,3.154621,6420,64
3,0.029134,0.032551,-0.023272,3.134902,6420,64
4,0.029134,0.021225,-0.041034,3.068262,6420,64
...,...,...,...,...,...,...
33824,-1.154599,1.110003,1.154681,-1.154701,7500,75
33825,0.590545,-0.830537,-0.583200,0.577350,7500,75
33826,-0.707107,0.707107,0.707107,-0.707107,0910,09
33827,0.707107,-0.707107,-0.707107,0.707107,0910,09


----------

A partir de aquí borrador son borradores y pruebas

In [None]:
class Standarizer( BaseEstimator, TransformerMixin ):   

    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        self.statistics_group = dict()
        self.statistics_total = dict()
        self.numeric_column_names = X.select_dtypes(include =["float64", "int"]).columns
        for group, df in X.groupby("sector"):
            self.statistics_group[group] = {
                                "mean":{cn: df[cn].mean() for cn in self.numeric_column_names},
                                "std": {cn: df[cn].std() for cn in self.numeric_column_names}                        
                                }

        self.statistics_total = { "mean":{cn: df[cn].mean() for cn in self.numeric_column_names},
                             "std": {cn: df[cn].std() for cn in self.numeric_column_names}                        
                            }       
        return self

    def normalize(self, x, group):
        value_normalized = (x - self.statistics_group[group]["mean"]) / (self.statistics_group[group]["std"])         
        return value_normalized
        
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):      
        X.apply(lambda x: self.normalize(X[self.numeric_column_names],X["sector"]), axis = 1)
        return X

In [None]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        return X[ self._feature_names ] 