# Notebook de funciones

## Función de Gini

In [11]:
from sklearn.metrics import roc_auc_score

def GS(a, b):
    """
    Compute the Gini coefficient of a prediction against a binary target.

    :param a: binary target variable, where 0 = good and 1 = bad.
    :param b: prediction for ``a`` (may be continuous, integer or binary).
    :return: Gini coefficient, obtained as ``2 * AUC - 1`` where the AUC is
        the value returned by ``roc_auc_score(a, b)``.
    """
    auc = roc_auc_score(a, b)
    return 2 * auc - 1

## Función de entrenamiento de algoritmos

In [12]:
def train_method(x_train, y_train, x_test, y_test, method):
    """
    Train and evaluate a model with the selected method.

    Dispatches to one of the wrapper functions defined in this notebook
    (``LR``, ``LOGR``, ``DT``, ``LASSO``, ``RIDGE``, ``RFR``, ``RFC``, ``GBR``),
    forwarding the four data arguments unchanged.

    :param x_train: training features, required
    :param y_train: training target, required
    :param x_test: test features, required
    :param y_test: test target, required
    :param method: str, required. One of:
        'LR' (Linear Regression), 'LOGR' (Logistic Regression),
        'DT' (Decision Tree Classifier), 'LASSO' (Lasso Regression),
        'RIDGE' (Ridge Regression), 'RFR' (Random Forest Regressor),
        'RFC' (Random Forest Classifier), 'GBR' (Gradient Boosting Regression).
        Matching is case-sensitive.
    :return: whatever the selected wrapper returns
    :raises ValueError: if ``method`` is not one of the supported codes
    """
    if method == 'LR':  # Linear Regression
        return LR(x_train, y_train, x_test, y_test)

    elif method == 'LOGR':  # Logistic Regression
        return LOGR(x_train, y_train, x_test, y_test)

    elif method == 'DT':  # Decision Tree Classifier
        return DT(x_train, y_train, x_test, y_test)

    elif method == 'LASSO':  # Lasso Regression
        return LASSO(x_train, y_train, x_test, y_test)

    elif method == 'RIDGE':  # Ridge Regression
        return RIDGE(x_train, y_train, x_test, y_test)

    elif method == 'RFR':  # Random Forest Regressor
        return RFR(x_train, y_train, x_test, y_test)

    elif method == 'RFC':  # Random Forest Classifier
        return RFC(x_train, y_train, x_test, y_test)

    elif method == 'GBR':  # Gradient Boosting Regression
        return GBR(x_train, y_train, x_test, y_test)

    # Previously an unknown code fell through and silently returned None,
    # which only surfaced later as an obscure error in the caller.
    raise ValueError(
        "Unknown method {!r}; expected one of "
        "'LR', 'LOGR', 'DT', 'LASSO', 'RIDGE', 'RFR', 'RFC', 'GBR'".format(method)
    )

## Funciones resumen de algoritmos

In [13]:
from sklearn.linear_model import LinearRegression
def LR(X_train, y_train, X_test, y_test):
    """
    Linear Regression.

    Fits a ``LinearRegression`` with default settings on the training data
    and returns the result of ``dictionary_of_measures`` for the fitted model.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return dictionary_of_measures(regressor, X_train, y_train, X_test, y_test)

# Import from the public package: the ``sklearn.linear_model.logistic``
# submodule path is private and no longer importable in recent scikit-learn.
from sklearn.linear_model import LogisticRegression
def LOGR(X_train, y_train, X_test, y_test):
    """
    Logistic Regression.

    Fits a ``LogisticRegression`` with default settings on the training data
    and returns the result of ``dictionary_of_measures`` for the fitted model.
    """
    model = LogisticRegression().fit(X_train, y_train)
    return dictionary_of_measures(model, X_train, y_train, X_test, y_test)


from sklearn.tree import DecisionTreeClassifier
def DT(X_train, y_train, X_test, y_test):
    """
    Decision Tree Classifier.

    Fits a ``DecisionTreeClassifier`` (fixed ``random_state=99``) on the
    training data and returns the result of ``dictionary_of_measures`` for it.
    """
    tree = DecisionTreeClassifier(random_state=99)
    tree.fit(X_train, y_train)
    return dictionary_of_measures(tree, X_train, y_train, X_test, y_test)


from sklearn.linear_model import Lasso
def LASSO(X_train, y_train, X_test, y_test):
    """
    Lasso Regression.

    Fits a ``Lasso`` model with ``alpha=0.01`` on the training data and
    returns the result of ``dictionary_of_measures`` for the fitted model.
    """
    lasso = Lasso(alpha=0.01)
    lasso.fit(X_train, y_train)
    return dictionary_of_measures(lasso, X_train, y_train, X_test, y_test)

from sklearn.linear_model import Ridge
def RIDGE(X_train, y_train, X_test, y_test):
    """
    Ridge Regression.

    Fits a ``Ridge`` model with ``alpha=0.01`` on the training data and
    returns the result of ``dictionary_of_measures`` for the fitted model.
    """
    ridge = Ridge(alpha=0.01)
    ridge.fit(X_train, y_train)
    return dictionary_of_measures(ridge, X_train, y_train, X_test, y_test)

from sklearn.ensemble import RandomForestRegressor
def RFR(X_train, y_train, X_test, y_test):
    """
    Random Forest Regressor.

    Fits a ``RandomForestRegressor`` with 1000 trees and
    ``min_samples_split=2`` on the training data and returns the result of
    ``dictionary_of_measures`` for the fitted model.

    No ``random_state`` is passed, so results may differ between runs.
    """
    forest = RandomForestRegressor(n_estimators=1000, min_samples_split=2)
    forest.fit(X_train, y_train)
    return dictionary_of_measures(forest, X_train, y_train, X_test, y_test)


from sklearn.ensemble import RandomForestClassifier
def RFC(X_train, y_train, X_test, y_test):
    """
    Random Forest Classifier.

    Fits a ``RandomForestClassifier`` with 1000 trees and
    ``min_samples_split=2`` on the training data and returns the result of
    ``dictionary_of_measures`` for the fitted model.

    No ``random_state`` is passed, so results may differ between runs.
    """
    forest = RandomForestClassifier(n_estimators=1000, min_samples_split=2)
    forest.fit(X_train, y_train)
    return dictionary_of_measures(forest, X_train, y_train, X_test, y_test)

from sklearn.ensemble import GradientBoostingRegressor
def GBR(X_train, y_train, X_test, y_test):
    """
    Gradient Boosting Regression.

    Fits a ``GradientBoostingRegressor`` with ``n_estimators=1000`` and
    ``alpha=0.01`` on the training data and returns the result of
    ``dictionary_of_measures`` for the fitted model.
    """
    # NOTE(review): in scikit-learn, ``alpha`` is documented as the quantile
    # for the 'huber'/'quantile' losses, not a shrinkage factor -- confirm
    # that ``learning_rate`` was not the intended parameter.
    booster = GradientBoostingRegressor(n_estimators=1000, alpha=0.01)
    booster.fit(X_train, y_train)
    return dictionary_of_measures(booster, X_train, y_train, X_test, y_test)

## Función de cálculo de métricas de modelos

In [14]:
def _predicted_scores(model, X):
    """Return the scores for ``X`` that are fed into the Gini computation.

    Uses column 1 of ``model.predict_proba(X)`` when that call succeeds,
    otherwise falls back to ``model.predict(X)``.
    """
    try:
        # For classifiers, use the probability of the class in column 1.
        return model.predict_proba(X)[:, 1]
    except Exception:
        # Best-effort fallback (e.g. the model has no predict_proba).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return model.predict(X)


def dictionary_of_measures(model, X_train, y_train, X_test, y_test):
    """
    Evaluate an already fitted model on the training and test sets.

    :param model: fitted estimator exposing ``score`` and ``predict``
        (and optionally ``predict_proba``)
    :param X_train: training features
    :param y_train: training target (binary, as required by ``GS``)
    :param X_test: test features
    :param y_test: test target (binary, as required by ``GS``)
    :return: dict with keys
        - 'model': the model that was passed in
        - 'accuracy_train' / 'accuracy_test': ``model.score`` on each set
          (NOTE: for scikit-learn regressors ``score`` is R^2, not accuracy)
        - 'gini_train' / 'gini_test': ``GS`` of the target vs. the scores
    """
    # TRAIN MEASURES
    # The fallback path used to predict on X_test here, so the train Gini of
    # models without predict_proba compared y_train with test predictions.
    y_pred_train = _predicted_scores(model, X_train)
    a_train = model.score(X_train, y_train)
    gini_train = GS(y_train, y_pred_train)

    # TEST MEASURES
    y_pred_test = _predicted_scores(model, X_test)
    a_test = model.score(X_test, y_test)
    gini_test = GS(y_test, y_pred_test)

    return {'model': model, 'accuracy_train': a_train, 'accuracy_test': a_test,
            'gini_train': gini_train, 'gini_test': gini_test}

## Modelos de Machine Learning

In [15]:
# NOTE: ``classifier_list`` is a plain triple-quoted *string*, not a dict.
# Its text looks like a dict literal mapping short names to estimator
# constructor calls, but because it is a string none of the names inside it
# (SVC, LinearSVC, KNeighborsClassifier, SGDClassifier, Classifier, Layer,
# GradientBoostingClassifier, XGBClassifier, n_jobs, ...) are evaluated, so
# they do not need to be imported or defined for this cell to run.
# Nothing in the visible part of this notebook reads this variable;
# presumably it is kept as a reference of candidate configurations -- TODO confirm.
classifier_list = """{"svm_linear": SVC(probability=True, kernel='linear', C=1.0),
                       "svm_poly": SVC(probability=True, kernel='poly', C=1.0),
                       "svm_rbf": SVC(probability=True, kernel='rbf', C=1.0, gamma=0.01),
                       "linear_svc": LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.1, C=1.0, multi_class='ovr', fit_intercept=True,
                                               intercept_scaling=1, random_state=None, max_iter=3000),
                       "knn": KNeighborsClassifier(n_neighbors=100, weights='distance', leaf_size=30, n_jobs=n_jobs),
                       "random_forests": RandomForestClassifier(n_estimators=350, criterion='entropy', min_samples_split=2,
                                                                min_samples_leaf=1, max_leaf_nodes=600, n_jobs=n_jobs),
                       "logistic_regression": LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=2.4, fit_intercept=True, intercept_scaling=1,
                                                                 random_state=None, solver='liblinear', max_iter=1000, multi_class='ovr',
                                                                 warm_start=False, n_jobs=n_jobs),
                       "decision_trees": DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
                                                                min_samples_leaf=100, min_weight_fraction_leaf=0.0, max_features=None,
                                                                random_state=None, max_leaf_nodes=None, presort=False),
                       "sgd": SGDClassifier(alpha=.0001, n_iter=500, penalty="elasticnet", n_jobs=n_jobs),
                       "neural_network": Classifier(layers=[Layer("Sigmoid", units=14), Layer("Sigmoid", units=13), Layer("Sigmoid", units=12),
                                                            Layer("Sigmoid", units=10), Layer("Softmax")], learning_rate=0.01, n_iter=200,
                                                    batch_size=10, regularize='L1', n_stable=50, dropout_rate=0, verbose=True),
                       "GBC": GradientBoostingClassifier(max_depth=10, max_leaf_nodes=850, min_samples_leaf=15, learning_rate=0.1),
                       "XGB": XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
                                            max_depth=10, min_child_weight=2, missing=None, n_estimators=100, nthread=n_jobs, reg_alpha=0,
                                            objective='binary:logistic', reg_lambda=1, scale_pos_weight=1, seed=0, silent=True, subsample=1)}"""


NameError: name 'y_true' is not defined