In [9]:
import os
import pandas as pd
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [10]:
# The project root (credit-mlops) is the parent of the directory the notebook runs from
notebook_dir = os.getcwd()
ROOT_DIR = os.path.abspath(os.path.join(notebook_dir, os.pardir))
print(ROOT_DIR)

/workspaces/credit-mlops


In [11]:
# Load the processed dataset from <ROOT_DIR>/data/processed and preview its first two rows
processed_csv_path = os.path.join(ROOT_DIR, 'data', 'processed', 'processed_data.csv')
df = pd.read_csv(processed_csv_path)
df.head(2)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12


## Normalizing data (standardization)

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics with a StandardScaler
# (standardization, not min-max scaling)
scaler = StandardScaler().fit(df)

# Replace the DataFrame with its standardized values.
# Note: from here on `df` is the array returned by the scaler, not a DataFrame.
df = scaler.transform(df)

## Autoencoder

### Implement autoencoder

In [13]:
# Width of the input/output layers (one unit per column of the scaled data)
# and of the bottleneck that produces the reduced representation
input_dim = df.shape[1]
encoding_dim = 10

# Input layer: one unit per feature
input_layer = Input(shape=(input_dim, ))

# Encoder: tanh bottleneck with an L1 penalty on its activations
# (10e-5 is the same value as 1e-4)
encoder_layer = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(10e-5))(input_layer)

# Decoder: linear output layer.
# The reconstruction targets are the standardized features produced by the
# scaling cell, so they contain negative values. A ReLU output is always >= 0
# and could never reproduce those, so the output must be unbounded.
decoder_layer = Dense(input_dim, activation="linear")(encoder_layer)

# Assemble the full autoencoder (input -> bottleneck -> reconstruction)
autoencoder = Model(inputs=input_layer, outputs=decoder_layer)

# Train to minimise the mean squared reconstruction error
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Stop once the validation loss has not improved by at least 1e-4 for
# 5 consecutive epochs, and roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=5, verbose=1, mode='min', restore_best_weights=True)

# Fit the model to reconstruct its own input; 20% of the rows are held out for validation
autoencoder.fit(df, df, epochs=100, batch_size=32, shuffle=True, validation_split=0.2, verbose=1, callbacks=[early_stopping])

# Persist the trained model.
# NOTE(review): despite the `.pkl` suffix this is not a pickle file; the recorded
# run shows Keras writing a model directory (".../autoencoder.pkl/assets").
# The path is kept unchanged so existing consumers still find the model.
# Newer Keras releases appear to require a `.keras` or `.h5` suffix — confirm before upgrading.
autoencoder.save(os.path.join(ROOT_DIR, 'models', 'autoencoder.pkl'))

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

INFO:tensorflow:Assets written to: /workspaces/credit-mlops/models/autoencoder.pkl/assets


### Apply the encoder model to get the reduced dimension

In [14]:
# Build the encoder as a sub-model that stops at the layer with index 1
# of the trained autoencoder
bottleneck_layer = autoencoder.get_layer(index=1)
encoder_model = Model(inputs=autoencoder.input, outputs=bottleneck_layer.output)

# Run the scaled data through the encoder to obtain the reduced representation
reduced_data = encoder_model.predict(df)

# Wrap the encoded values in a DataFrame with columns latent_1 ... latent_<encoding_dim>
latent_columns = [f"latent_{i+1}" for i in range(encoding_dim)]
reduced_df = pd.DataFrame(reduced_data, columns=latent_columns)

# Write the reduced dataset next to the processed data, without the index column
featurized_csv_path = os.path.join(ROOT_DIR, 'data', 'processed', 'featurized_data.csv')
reduced_df.to_csv(featurized_csv_path, index=False)



In [15]:
# Sanity check: print the (rows, columns) shape of the reduced dataset
reduced_shape = reduced_df.shape
print(reduced_shape)

(7346, 10)
