In [1]:
import pandas as pd

In [2]:
# Load the three competition CSV files (paths are relative to the working directory).
df_train, df_test_nolabel, df_sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./../data/train.csv",
        "./../data/test_nolabel.csv",
        "./../data/sample_submission.csv",
    )
)

  df_train = pd.read_csv("./../data/train.csv")


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, \
    FunctionTransformer


def to_lowercase(df):
    """
    Lower-case every string column of ``df`` so that spellings which differ
    only by case end up in the same category.

    Returns a new DataFrame; the input is left untouched. Every column must
    support the ``.str`` accessor.
    """
    def _lower(column):
        return column.str.lower()

    return df.apply(_lower)

def limit_outlier(df, max_value):
    """
    Clamp every value of ``df`` to the range ``[0, max_value]`` to prevent
    outliers from distorting summary statistics (e.g. the mean and variance
    used for scaling).

    Returns a new object; the input is left untouched.
    """
    return df.clip(lower=0, upper=max_value)

def fix_new_exists(df):
    """
    Replace the value 0 in the first column of ``df`` with 1, leaving every
    other value (including missing values) as it is.

    The input frame is not modified; a corrected copy is returned. Only the
    first column is touched, so any additional columns pass through unchanged.
    """
    fixed = df.copy()
    # Positional boolean mask: rows whose first-column value equals 0.
    zero_rows = (fixed.iloc[:, 0] == 0).to_numpy()
    fixed.iloc[zero_rows, 0] = 1
    return fixed

def fix_franchise_code(df):
    """
    Collapse the ``FranchiseCode`` column to {0, 1}: codes ``<= 1`` become 0,
    everything else (including missing values) becomes 1.

    The input frame is not modified; a converted copy is returned.
    """
    fixed = df.copy()
    # Negate "<= 1" instead of testing "> 1" so that NaN, for which both
    # comparisons are False, is still encoded as 1.
    at_most_one = fixed["FranchiseCode"].le(1)
    fixed["FranchiseCode"] = (~at_most_one).astype("int64")
    return fixed

def fix_revlinecr(df):
    """
    Collapse the ``RevLineCr`` column to {0, 1}: the values ``"N"``, ``0`` and
    ``"0"`` become 0, everything else (including missing values) becomes 1.

    The input frame is not modified; a converted copy is returned.
    """
    # A text column read from CSV yields the string "0", not the integer 0,
    # so both spellings are treated as "no".
    negative_values = ("N", 0, "0")
    fixed = df.copy()
    fixed["RevLineCr"] = fixed["RevLineCr"].apply(
        lambda value: 0 if value in negative_values else 1
    )
    return fixed

def fix_lowdoc(df):
    """
    Collapse the ``LowDoc`` column to {0, 1}: the values ``"Y"`` and ``"S"``
    become 1, everything else (including missing values) becomes 0.

    The input frame is not modified; a converted copy is returned.
    """
    fixed = df.copy()
    fixed["LowDoc"] = fixed["LowDoc"].isin(["Y", "S"]).astype("int64")
    return fixed


def fix_disbursment_gross(df):
    """
    Convert the ``DisbursementGross`` column from formatted text to float.

    The first number found in each string (digits with optional thousands
    separators and an optional decimal part) is extracted, the commas are
    removed and the result is cast to float. Strings without a number
    become NaN.

    The values are read from the ``df`` argument itself, so the function
    works for any frame passed in. The input frame is not modified; a
    converted copy is returned.
    """
    fixed = df.copy()
    fixed["DisbursementGross"] = (
        fixed["DisbursementGross"]
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        .str.extract(r"(\d+(?:,\d+)*(?:\.\d+)?)")[0]
        # Plain substring replacement; no regex semantics are needed here.
        .str.replace(",", "", regex=False)
        .astype("float")
    )
    return fixed



# Column-wise preprocessing: each entry is (name, pipeline, columns it applies to).
preprocessor = ColumnTransformer(
    transformers=[
        # group a -> lower-case, then one-hot encode with min_frequency=3
        (
            "group_a",
            Pipeline(steps=[
                ("a1", FunctionTransformer(func=to_lowercase)),
                ("a2", OneHotEncoder(handle_unknown="ignore", min_frequency=3)),
            ]),
            ["Name"],
        ),
        # group b -> lower-case, then one-hot encode with min_frequency=300
        (
            "group_b",
            Pipeline(steps=[
                ("b1", FunctionTransformer(func=to_lowercase)),
                ("b2", OneHotEncoder(handle_unknown="ignore", min_frequency=300)),
            ]),
            ["City", "Bank", "BankState"],
        ),
        # group c -> clamp values to [0, 155], then standardise.
        # To use a different treatment, write a helper function like the ones
        # used by the groups below.
        (
            "group_c",
            Pipeline(steps=[
                ("c1", FunctionTransformer(func=limit_outlier, kw_args=dict(max_value=155))),
                ("c2", StandardScaler()),
            ]),
            ["NoEmp"],
        ),
        # group d -> maps the values {0, 1, 2} onto {1, 2}
        (
            "group_d",
            Pipeline(steps=[
                ("d1", FunctionTransformer(func=fix_new_exists)),
            ]),
            ["NewExist"],
        ),
        # group e -> clamp values to [0, 1027], then standardise.
        # To use a different treatment, write a helper function like the ones
        # used by the groups below.
        (
            "group_e",
            Pipeline(steps=[
                ("e1", FunctionTransformer(func=limit_outlier, kw_args=dict(max_value=1027))),
                ("e2", StandardScaler()),
            ]),
            ["CreateJob"],
        ),
        # group f -> convert the column to a 0/1 flag
        (
            "group_f",
            Pipeline(steps=[
                ("f1", FunctionTransformer(func=fix_franchise_code)),
            ]),
            ["FranchiseCode"],
        ),
        # group g -> convert the column to a 0/1 flag
        (
            "group_g",
            Pipeline(steps=[
                ("g1", FunctionTransformer(func=fix_revlinecr)),
            ]),
            ["RevLineCr"],
        ),
        # group h -> convert the column to a 0/1 flag
        (
            "group_h",
            Pipeline(steps=[
                ("h1", FunctionTransformer(func=fix_lowdoc)),
            ]),
            ["LowDoc"],
        ),
        # group i -> extract the numeric value with a regex, then standardise
        (
            "group_i",
            Pipeline(steps=[
                ("i1", FunctionTransformer(func=fix_disbursment_gross)),
                ("i2", StandardScaler()),
            ]),
            ["DisbursementGross"],
        ),
    ],
)

# Encoder for the target column.
preprocessor_label = LabelEncoder()

# Fit the feature preprocessor and the label encoder on the training data.
preprocessor.fit(df_train)
preprocessor_label.fit(df_train["Accept"])

In [4]:
# Apply the fitted preprocessor and label encoder to the training data to
# obtain the feature matrix X and the encoded target y.
X = preprocessor.transform(df_train)
y = preprocessor_label.transform(df_train["Accept"])

In [5]:
# Sanity check: show the first encoded sample and its label.
# Slice the sparse matrix before densifying so only one row is materialised;
# X.toarray() would allocate the full (rows x features) dense array.
print(X[:1].toarray()[0])
print(y[0])

[ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.         

In [6]:
# start training here