In [2]:
import np as np
import pandas as pd

In [3]:
# Load the three CSV files from the data directory one level above the notebook.
# Rows containing any missing value are dropped from the training frame and
# from the sample submission; the unlabeled test frame is kept as read.
df_train = pd.read_csv("./../data/train.csv").dropna()
df_test_nolabel = pd.read_csv("./../data/test_nolabel.csv")
df_sample_submission = pd.read_csv("./../data/sample_submission.csv").dropna()

In [4]:
# Preview the training frame (pandas truncates both rows and columns in the display).
df_train

Unnamed: 0,id,LoanNr_ChkDgt,Name,City,State,Bank,BankState,ApprovalDate,ApprovalFY,NoEmp,...,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,BalanceGross,Accept
0,2fa57387ae1,6213754009,SETANTA SETTERS,PITTSBURGH,PA,CITIZENS BANK NATL ASSOC,MA,1-Apr-03,2003,1,...,1,2,1,1,Y,N,30-Jun-03,"$5,440.00",$0.00,1
1,a66f9849d7f,1751224008,DURACLEAN BY SEITZER,MONTOURSVILLE,PA,MANUFACTURERS & TRADERS TR CO,NY,19-Dec-97,1998,20,...,0,0,25850,0,N,Y,28-Feb-98,"$33,800.00",$0.00,1
2,fb6a6d89487,6979444003,PC HAINES WALL & CEILING CONTR,SALUNGA,PA,CITIZENS BANK NATL ASSOC,RI,1-Dec-03,2004,9,...,0,9,1,2,Y,N,29-Feb-04,"$156,599.00",$0.00,1
3,0ac44b54067,5825754005,LESLIE A LANDIS,YORK,PA,MANUFACTURERS & TRADERS TR CO,MD,31-Oct-02,2003,3,...,2,5,1,1,0,N,31-Jan-03,"$40,000.00",$0.00,1
4,3a5ff625fa3,8989333001,"MAILBOXES, ETC.",PHILADELPHIA,PA,NEWTEK SMALL BUS. FINANCE INC.,NY,7-Feb-96,1996,3,...,0,0,50564,0,N,Y,31-Oct-96,"$100,000.00",$0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24614,ae972125b13,7387374007,HUNTINGDON CREAMERY,HUNTINGDON,PA,KISH BANK,PA,27-Apr-04,2004,2,...,2,0,1,1,0,N,31-May-04,"$20,000.00",$0.00,1
24615,95d6fb854b9,1175195009,MAJIK D.J. ENTERTAINMENT,FLEETWOOD,PA,CITIZENS BANK NATL ASSOC,RI,31-Jan-05,2005,2,...,0,2,1,1,N,N,30-Apr-06,"$10,000.00",$0.00,1
24616,5e446a47a0f,1797704005,MCCULLOUGH TOWING & STORAGE,PHILA,PA,SUSQUEHANNA BANK,PA,22-Jan-98,1998,3,...,0,0,1,0,N,Y,31-Mar-98,"$60,000.00",$0.00,1
24617,549dcfeb04e,3816914004,HALLS FLORIST,WILLIAMSPORT,PA,MANUFACTURERS & TRADERS TR CO,NY,29-Jun-00,2000,4,...,0,3,1,1,Y,N,31-Aug-04,"$10,000.00",$0.00,1


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, \
    FunctionTransformer


def to_lowercase(df):
    """
    Return a new frame in which every column of ``df`` has been lower-cased
    through the pandas ``.str`` accessor, so that differently-cased spellings
    of the same value end up in the same group.
    """
    def _lower(column):
        return column.str.lower()

    return df.apply(_lower)

def limit_outlier(df, max_value):
    """
    Clip every value of ``df`` into the closed range ``[0, max_value]`` so
    that extreme values cannot dominate statistics computed afterwards.
    """
    lower_bound, upper_bound = 0, max_value
    return df.clip(lower=lower_bound, upper=upper_bound)

def fix_new_exists(df):
    """
    Collapse the ``NewExist`` column to the two codes {1, 2}.

    Values 0 and 1 become 1; every other value becomes 2. A missing value
    (NaN) is not a member of ``[0, 1]`` and is therefore also mapped to 2,
    so no missing values leave this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``NewExist`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the recoded column; ``df`` itself is not modified.
    """
    recoded = df["NewExist"].apply(lambda value: 1 if value in [0, 1] else 2)
    # assign() returns a new frame, so the caller's data (often a slice of the
    # training frame) is never written to in place.
    return df.assign(NewExist=recoded)

def fix_franchise_code(df):
    """
    Reduce ``FranchiseCode`` to a {0, 1} flag.

    Codes less than or equal to 1 become 0; every other value becomes 1.
    A value for which ``value <= 1`` is false for any reason (for example
    NaN) therefore also becomes 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``FranchiseCode`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the recoded column; ``df`` itself is not modified.
    """
    flag = df["FranchiseCode"].apply(lambda code: 0 if code <= 1 else 1)
    # Build a new frame instead of writing into the caller's data.
    return df.assign(FranchiseCode=flag)

def fix_revlinecr(df):
    """
    Reduce ``RevLineCr`` to a {0, 1} flag and cast the frame to int64.

    ``"N"``, the integer ``0`` and the string ``"0"`` become 0; every other
    value becomes 1. The string form is listed explicitly because a column
    read from CSV that mixes letters and digits holds ``"0"`` as text, which
    does not compare equal to the integer ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``RevLineCr`` column. Every column of the frame
        must be castable to int64 after the recoding.

    Returns
    -------
    pandas.DataFrame
        A new int64 frame; ``df`` itself is not modified.
    """
    flag = df["RevLineCr"].apply(lambda value: 0 if value in ["N", "0", 0] else 1)
    # assign() avoids mutating the caller's frame; the dtype string keeps the
    # cast independent of whichever module is bound to the name ``np``.
    return df.assign(RevLineCr=flag).astype("int64")

def fix_lowdoc(df):
    """
    Reduce ``LowDoc`` to a {0, 1} flag and cast the frame to int64.

    ``"Y"`` and ``"S"`` become 1; every other value becomes 0.
    NOTE(review): ``"S"`` is treated the same as ``"Y"`` - confirm that this
    is the intended reading of that code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``LowDoc`` column. Every column of the frame must
        be castable to int64 after the recoding.

    Returns
    -------
    pandas.DataFrame
        A new int64 frame; ``df`` itself is not modified.
    """
    flag = df["LowDoc"].apply(lambda value: 1 if value in ["Y", "S"] else 0)
    # Build a new frame instead of writing into the caller's data.
    return df.assign(LowDoc=flag).astype("int64")


def fix_disbursment_gross(df):
    """
    Parse the ``DisbursementGross`` currency strings into floats.

    The first run of digits (with optional comma-separated groups and an
    optional decimal part) is extracted from each string, the commas are
    removed and the result is cast to float, e.g. ``"$5,440.00"`` -> 5440.0.
    Strings without any digits yield NaN.

    The values are read from the frame passed in, so the function works on
    any data set (training, test, ...) rather than on one fixed global frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string ``DisbursementGross`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the parsed column; ``df`` itself is not modified.
    """
    amount = (
        df["DisbursementGross"]
        # Raw string: "\d" and "\." are regex escapes, not Python escapes.
        .str.extract(r"(\d+(?:,\d+)*(?:\.\d+)?)")[0]
        .str.replace(",", "", regex=False)
        .astype("float")
    )
    return df.assign(DisbursementGross=amount)

def fix_urban_rural(df):
    """
    Pass the ``UrbanRural`` column through unchanged.

    No recoding is applied: the frame is returned exactly as received, so
    whatever codes the column holds reach the model as they are.

    TODO: a recoding to {0, 1} was intended but the mapping has not been
    implemented. Filtering rows is not an option here, because a transformer
    used inside a ColumnTransformer has to return one row per input row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``UrbanRural`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    return df


# NOTE(review): this list is not referenced by any of the visible cells, and it
# does not match the columns actually wired into the ColumnTransformer below.
training_numeric = ["NoEmp", "NewExist", "CreateJob", "FranchiseCode", "RevLineCr", "LowDoc", "DisbursementGross"]

# Feature preprocessing: each entry is (name, pipeline, columns). Columns that
# are not listed in any active entry are not passed to a transformer, and the
# output columns follow the order of the active entries.
preprocessor = ColumnTransformer(
    transformers=[
        # group 1 -> Set to lower and onehot with in freq 3
        # ("group_a", Pipeline([
        #     ("a1", FunctionTransformer(to_lowercase)),
        #     ("a2", OneHotEncoder(handle_unknown='ignore', sparse=True))
        # ]), ["Name"]),

        # group 2 -> Set to lower and onehot with in freq 300
        # ("group_b", Pipeline([
        #     ("b1", FunctionTransformer(to_lowercase)),
        #     ("b2", OneHotEncoder(handle_unknown='ignore', sparse=True))
        # ]), ["City", "Bank", "BankState"]),

        # group 3 -> clamps values to [0, 155] and standard-scales them
        # to use a different strategy, write a function like the one below...
        ("group_c", Pipeline([
            ("c1", FunctionTransformer(limit_outlier, kw_args={'max_value': 155})),
            ("c2", StandardScaler()),
        ]), ["NoEmp"]),

        # group 4 -> replaces {0, 1, 2} with {1, 2}
        # NOTE(review): fix_new_exists maps every value outside {0, 1} to 2, so
        # presumably no missing value ever reaches the imputer - confirm.
        ("group_d", Pipeline([
            ("d1", FunctionTransformer(fix_new_exists)),
            ("d2", SimpleImputer(strategy="most_frequent")),
        ]), ["NewExist"]),

        # group 5 -> clamps values to [0, 1027] and standard-scales them
        # to use a different strategy, write a function like the one below...
        ("group_e", Pipeline([
            ("e1", FunctionTransformer(limit_outlier, kw_args={'max_value': 1027})),
            ("e2", StandardScaler()),
        ]), ["CreateJob"]),

        # group 6 -> converts the column to a boolean flag
        ("group_f", Pipeline([
            ("f1", FunctionTransformer(fix_franchise_code)),
        ]), ["FranchiseCode"]),

        # # group 7 -> converts the column to a boolean flag
        # ("group_g", Pipeline([
        #     ("g1", FunctionTransformer(fix_revlinecr)),
        # ]), ["RevLineCr"]),
        #
        # # group 8 -> converts the column to a boolean flag
        # ("group_h", Pipeline([
        #     ("h1", FunctionTransformer(fix_lowdoc)),
        # ]), ["LowDoc"]),

        # group 9 -> extracts the amount with a regex, imputes the mean, then scales
        # NOTE(review): the imputer step is named "d2" although it belongs to
        # the "i" pipeline; the name is unique within this pipeline, so it works.
        ("group_i", Pipeline([
            ("i1", FunctionTransformer(fix_disbursment_gross)),
            ("d2", SimpleImputer(strategy="mean")),
            ("i2", StandardScaler()),
        ]), ["DisbursementGross"]),

        # group 10 -> meant to convert the column to a boolean flag
        # NOTE(review): fix_urban_rural returns its input unchanged, so the
        # column currently passes through as-is.
        ("group_j", Pipeline([
            ("fj", FunctionTransformer(fix_urban_rural)),
        ]), ["UrbanRural"]),
    ],
)

# Fit the feature preprocessor on the training frame and a label encoder on
# the "Accept" target column.
preprocessor_label = LabelEncoder()
preprocessor.fit(df_train)
preprocessor_label.fit(df_train["Accept"])

  mode = stats.mode(array)


LabelEncoder()

In [6]:
# NOTE(review): this list is not referenced by any of the visible cells; the
# columns actually used are the ones listed in the ColumnTransformer.
training_cols = ["Name", "City", "Bank", "BankState", "NoEmp", "NewExist", "CreateJob", "FranchiseCode", "RevLineCr", "LowDoc", "DisbursementGross"]

In [7]:
# Apply the fitted preprocessor and label encoder to the training frame to
# obtain the feature matrix and the encoded target.
X_train = preprocessor.transform(df_train)
y_train = preprocessor_label.transform(df_train["Accept"])

In [18]:
# Sanity check: show the first transformed feature row and its encoded label.
# NOTE(review): this cell's execution count (18) is out of sequence with its
# neighbours, i.e. it was re-run after later cells - re-run the notebook top
# to bottom before sharing.
print(X_train[0])
print(y_train[0])

[-0.49026163  1.         -0.04053341  0.         -0.61370357  1.        ]
1


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Shallow random-forest baseline. random_state pins the randomness of the
# forest so that a fresh Restart & Run All reproduces the same model and the
# same scores.
model = RandomForestClassifier(
    max_depth=2, min_samples_leaf=2, n_estimators=80, random_state=42
)
model.fit(X_train, y_train)

# These metrics are computed on the same rows the model was fitted on, so they
# are an optimistic estimate rather than a measure of generalization.
y_train_pred = model.predict(X_train)
print("Accuracy in training ", metrics.accuracy_score(y_train, y_train_pred))
print("f-score in training ", metrics.f1_score(y_train, y_train_pred))


Accuracy in training  0.8551732652893242
f-score in training  0.9210508603863325


In [10]:
# # Define the model architecture
# model = tf.keras.models.Sequential([
#   tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
#   tf.keras.layers.Dropout(0.2),
#   tf.keras.layers.Dense(32, activation='relu'),
#   tf.keras.layers.Dropout(0.2),
#   tf.keras.layers.Dense(1, activation='sigmoid')
# ])
#
# # Compile the model with binary crossentropy loss and accuracy metric
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#
# # Train the model on the training data
# model.fit(X_train, y_train, epochs=10, batch_size=32)


In [11]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the model on the training features; one score
# per fold is returned.
scores = cross_val_score(model, X_train, y_train, cv=10)
print("Scores in every iteration", scores)
# Reports the mean score with a band of two standard deviations. With two
# decimals the band is printed as 0.00 for the fold scores shown in the output.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Scores in every iteration [0.85843621 0.85308642 0.85802469 0.85390947 0.8563786  0.85432099
 0.85473251 0.85061728 0.85837793 0.85384932]
Accuracy: 0.86 (+/- 0.00)


In [12]:
# X_test is an alias of the unlabeled test frame (no copy is made).
X_test = df_test_nolabel

# Reuse the preprocessor fitted on the training data, then predict the label
# for every test row.
X_test_transform = preprocessor.transform(X_test)
y_test_pred = model.predict(X_test_transform)

In [13]:
# Submission table: one row per test record, pairing its id with the
# predicted "Accept" value.
sample_out = pd.DataFrame({"id": X_test["id"], "Accept": y_test_pred})

In [14]:
from datetime import datetime

# Timestamp (local time, minute resolution) used to give each export its own
# file name.
now = datetime.now()
date_string = now.strftime("%Y%m%d_%H%M")

# Write the submission without the index column, then display it.
sample_out.to_csv(f"./../data/submission_{date_string}.csv", index=False)
sample_out

Unnamed: 0,id,Accept
0,82e99051e9c,1
1,df1cfad8fb6,1
2,2d3d3198980,1
3,3f56f41d280,1
4,cca77d2e8a4,1
...,...,...
3269,19ecb5cd698,1
3270,d1ac6d902de,1
3271,e8c19edd044,1
3272,7139f39bee2,1
