In [3]:
import np as np
import pandas as pd
from sklearn import set_config
import warnings
from sklearn.utils import resample
import tensorflow as tf

set_config(display='diagram')
warnings.filterwarnings('ignore')

In [4]:
df_train = pd.read_csv("./../data/train.csv").dropna()
df_test_nolabel = pd.read_csv("./../data/test_nolabel.csv")
# df_sample_submission = pd.read_csv("./../data/sample_submission.csv").dropna()

In [5]:
df_train

Unnamed: 0,id,LoanNr_ChkDgt,Name,City,State,Bank,BankState,ApprovalDate,ApprovalFY,NoEmp,...,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,BalanceGross,Accept
0,2fa57387ae1,6213754009,SETANTA SETTERS,PITTSBURGH,PA,CITIZENS BANK NATL ASSOC,MA,1-Apr-03,2003,1,...,1,2,1,1,Y,N,30-Jun-03,"$5,440.00",$0.00,1
1,a66f9849d7f,1751224008,DURACLEAN BY SEITZER,MONTOURSVILLE,PA,MANUFACTURERS & TRADERS TR CO,NY,19-Dec-97,1998,20,...,0,0,25850,0,N,Y,28-Feb-98,"$33,800.00",$0.00,1
2,fb6a6d89487,6979444003,PC HAINES WALL & CEILING CONTR,SALUNGA,PA,CITIZENS BANK NATL ASSOC,RI,1-Dec-03,2004,9,...,0,9,1,2,Y,N,29-Feb-04,"$156,599.00",$0.00,1
3,0ac44b54067,5825754005,LESLIE A LANDIS,YORK,PA,MANUFACTURERS & TRADERS TR CO,MD,31-Oct-02,2003,3,...,2,5,1,1,0,N,31-Jan-03,"$40,000.00",$0.00,1
4,3a5ff625fa3,8989333001,"MAILBOXES, ETC.",PHILADELPHIA,PA,NEWTEK SMALL BUS. FINANCE INC.,NY,7-Feb-96,1996,3,...,0,0,50564,0,N,Y,31-Oct-96,"$100,000.00",$0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24614,ae972125b13,7387374007,HUNTINGDON CREAMERY,HUNTINGDON,PA,KISH BANK,PA,27-Apr-04,2004,2,...,2,0,1,1,0,N,31-May-04,"$20,000.00",$0.00,1
24615,95d6fb854b9,1175195009,MAJIK D.J. ENTERTAINMENT,FLEETWOOD,PA,CITIZENS BANK NATL ASSOC,RI,31-Jan-05,2005,2,...,0,2,1,1,N,N,30-Apr-06,"$10,000.00",$0.00,1
24616,5e446a47a0f,1797704005,MCCULLOUGH TOWING & STORAGE,PHILA,PA,SUSQUEHANNA BANK,PA,22-Jan-98,1998,3,...,0,0,1,0,N,Y,31-Mar-98,"$60,000.00",$0.00,1
24617,549dcfeb04e,3816914004,HALLS FLORIST,WILLIAMSPORT,PA,MANUFACTURERS & TRADERS TR CO,NY,29-Jun-00,2000,4,...,0,3,1,1,Y,N,31-Aug-04,"$10,000.00",$0.00,1


In [6]:
# Separate majority and minority classes
df_majority = df_train[df_train["Accept"] == 1]
df_minority = df_train[df_train["Accept"] == 0]

# Downsample majority class
df_majority_downsampled = resample(df_majority, replace=False, n_samples=(len(df_minority)+300), random_state=42)

# Combine minority class with downsampled majority class
df_train_resampled = np.concatenate((df_majority_downsampled, df_minority), axis=0)
df_train_resampled = pd.DataFrame(df_train_resampled, columns=df_train.columns)

df_train = df_train_resampled

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, \
    FunctionTransformer


def to_lowercase(df):
    """
    set all strings to lower case to be easier to group
    """
    df = df.apply(lambda col: col.str.lower())
    return df

def limit_outlier(df, max_value):
    """
    clamps the max and min value of a series, for prevent outliers
    from destroying statistic values
    """
    df = df.clip(0, max_value)
    return df

def fix_new_exists(df):
    """
    fixes newExists col to boolean values
    """
    df["NewExist"] = df["NewExist"].apply(lambda row: 0 if row in [0, 1] else 1)
    return df

def fix_franchise_code(df):
    """
    fixes FranchiseCode to {0, 1}
    """
    df["FranchiseCode"] = df["FranchiseCode"].apply(lambda row: 0 if row <= 1 else 1)
    return df

def fix_revlinecr(df):
    """
    fixes RevLineCr to {0, 1}
    """
    df["RevLineCr"] = df["RevLineCr"].apply(lambda row: 0 if row in ["N", 0] else 1)
    return df

def fix_lowdoc(df):
    """
    fixes LowDoc to {0, 1}
    """
    df["LowDoc"] = df["LowDoc"].apply(lambda row: 1 if row in ["Y", "S"] else 0)
    return df


def fix_disbursment_gross(df):
    """
    fixes disbursment extracting real value from string
    """
    df["DisbursementGross"] = df["DisbursementGross"]\
        .str.extract("(\d+(?:,\d+)*(?:\.\d+)?)")[0]\
        .str.replace(",", "")\
        .astype("float")
    return df

def fix_urban_rural(df):
    """
    fixes UrbanRural to {0, 1}
    """
    df["UrbanRural"] = df["UrbanRural"].apply(lambda row: 1 if row in ["Y", "S"] else 0)
    return df

def get_approval_year(df):
    """
    extracts year from date ApprovalDate
    """
    df['ApprovalDate'] = df['ApprovalDate'].astype('datetime64[ns]').dt.year
    return df


preprocessor = ColumnTransformer(
    transformers=[
        # group 1 -> Set to lower and onehot with in freq 3
        # ("group_a", Pipeline([
        #     ("a1", FunctionTransformer(to_lowercase)),
        #     ("a2", OneHotEncoder(handle_unknown='ignore', sparse=True))
        # ]), ["Name"]),

        # group 2 -> Set to lower and onehot with in freq 300
        ("group_b", Pipeline([
            ("b1", FunctionTransformer(to_lowercase)),
            ("b2", OneHotEncoder(handle_unknown='ignore', sparse=False))
        # ]), ["City", "Bank", "BankState"]),
        ]), ["BankState"]),

        # group 3 -> clamps values [0, 155] and scales in standard dist
        # si se quiere cambiar por otras opciones crear una función como la de abajo...
        ("group_c", Pipeline([
            ("c1", FunctionTransformer(limit_outlier, kw_args={'max_value': 155})),
            ("c2", StandardScaler()),
        ]), ["NoEmp"]),

        # # group 4 -> sustituye {0, 1, 2} por {1, 2}
        ("group_d", Pipeline([
            ("d1", FunctionTransformer(fix_new_exists)),
            ("d2", SimpleImputer(strategy="most_frequent")),
        ]), ["NewExist"]),

        # group 5 -> clamps values [0, 1027] and scales in standard dist
        # si se quiere cambiar por otras opciones crear una función como la de abajo...
        ("group_e", Pipeline([
            ("e1", FunctionTransformer(limit_outlier, kw_args={'max_value': 1027})),
            ("e2", StandardScaler()),
        ]), ["CreateJob", "RetainedJob"]),

        # group 6 -> transforma la columna a bool
        # ("group_f", Pipeline([
        #     ("f1", FunctionTransformer(fix_franchise_code)),
        # ]), ["FranchiseCode"]),

        # # group 7 -> transforma la columna a bool
        # ("group_g", Pipeline([
        #     ("g1", FunctionTransformer(fix_revlinecr)),
        # ]), ["RevLineCr"]),
        #
        # group 8 -> transforma la columna a bool
        ("group_h", Pipeline([
            ("h1", FunctionTransformer(fix_lowdoc)),
        ]), ["LowDoc"]),

        # # group 9 -> extrae el valor con un regex y lo escala
        ("group_i", Pipeline([
            ("i1", FunctionTransformer(fix_disbursment_gross)),
            ("d2", SimpleImputer(strategy="mean")),
            ("i2", StandardScaler()),
        ]), ["DisbursementGross"]),
        #
        # # group 10 -> transforma la columna a bool
        # ("group_j", Pipeline([
        #     ("fj", FunctionTransformer(fix_urban_rural)),
        # ]), ["UrbanRural"]),

        # group 11 -> approval year
        ("group_k", Pipeline([
            ("k1", FunctionTransformer(get_approval_year)),
            ("i2", StandardScaler()),
        ]), ["ApprovalDate"]),
    ],
    remainder='drop'
)

# preprocessor_label = LabelEncoder()
preprocessor.fit(df_train)
# preprocessor_label.fit(df_train["Accept"])

In [8]:
X_train = preprocessor.transform(df_train)
y_train = df_train["Accept"].values.astype("int")

In [9]:
print(X_train[1])
print(df_train.iloc[1])
print(y_train[0])

[ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          1.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.36178324  0.         -0.07349567 -0.1693337
  0.          0.5163589  -0.66347525]
id                                  bdb0d0a4eb5
LoanNr_ChkDgt                        3018354008
Name                              LEANAPE GROUP
City                                   PERKASIE
State                                        PA
Bank                 UNIVEST BANK AND TRUST CO.
BankState                                    PA
ApprovalDate                          18-Jun-99
ApprovalFY                                 1999
NoEmp                                        13
NewExist                                    1.0
CreateJob                            

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics


model = GradientBoostingClassifier(
    learning_rate=0.1,
    min_samples_split=5,
    n_estimators=60,
    min_samples_leaf=50,
    max_depth=15,
    max_features='sqrt',
    subsample=0.8,
)

x = df_train.drop('Accept', axis=1)
y = df_train['Accept'].values.astype('int')
x_train, x_test, y_train, y_test = train_test_split(df_train, y, test_size=0.25, random_state=33)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model)
])

pipeline.fit(x_train, y_train)
print(pipeline.score(x_test, y_test))
y_test_pred = pipeline.predict(x_test)

print("Accuracy in training ", metrics.accuracy_score(y_test, y_test_pred))
print("f-score in training ", metrics.f1_score(y_test, y_test_pred))

pipeline

0.6909739928607853
Accuracy in training  0.6909739928607853
f-score in training  0.7154929577464789


In [11]:
a = preprocessor.transform(x_train)
print(a[0])

[ 0.          0.          0.          0.          0.          1.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.         -0.4794085
  0.         -0.07270161 -0.13426438  0.         -0.52613313  0.72273033]


In [12]:
input_shape = preprocessor.transform(x_train).shape
x_train_processed = preprocessor.transform(x_train)

# Define the model architecture
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, activation='relu', input_shape=(input_shape[1],)),
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with binary crossentropy loss and accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the training data
model.fit(x_train_processed, y_train, epochs=100, batch_size=64)


Epoch 1/100


2023-04-02 11:54:55.521775: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x29b4c72e0>

In [13]:
x_test_processed = preprocessor.transform(x_test)
y_test_prob = model.predict(x_test_processed)
y_test_pred = np.round(y_test_prob).ravel()

print("Accuracy in testing ", metrics.accuracy_score(y_test, y_test_pred))
print("f-score in testing ", metrics.f1_score(y_test, y_test_pred))

Accuracy in testing  0.6731259561448241
f-score in testing  0.6865525672371637


In [14]:
print(x_test.shape)
print(y_test.shape)


(1961, 21)
(1961,)


In [18]:
x_test_nolabel_processed = preprocessor.transform(df_test_nolabel)
y_test_nolabel_prob = model.predict(x_test_nolabel_processed)
y_test_nolabel_pred = np.round(y_test_nolabel_prob).ravel()

sample_out = pd.DataFrame()
sample_out["id"] = df_test_nolabel["id"]
sample_out["Accept"] = y_test_nolabel_pred.astype("int")



In [19]:
from datetime import datetime

now = datetime.now()
date_string = now.strftime("%Y%m%d_%H%M")

sample_out.to_csv(f"./../data/submission_{date_string}.csv", index=False)
sample_out

Unnamed: 0,id,Accept
0,82e99051e9c,1
1,df1cfad8fb6,0
2,2d3d3198980,0
3,3f56f41d280,0
4,cca77d2e8a4,0
...,...,...
3269,19ecb5cd698,0
3270,d1ac6d902de,1
3271,e8c19edd044,1
3272,7139f39bee2,1


In [20]:
sample_out.groupby("Accept").size()

Accept
0    1574
1    1700
dtype: int64