In [43]:
import os
import sys
import numpy as np

sys.path.append(os.path.abspath(os.path.join('../')))


from src.utils.data_extractor import DataExtractor
from src.utils.eda_analyzer import EDAAnalyzer
from src.utils.data_preprocessor import DataPreprocessor
from src.utils.model_trainer import LightGBMAutoML


loan_filepath = "../data/loan.csv"
loan_df = DataExtractor.extract_csv(file_path=loan_filepath)


pre_processor = DataPreprocessor(loan_df)
replacements = [
    {
        'column': 'fpStatus',
        'from': np.nan,
        'to': 'NoAchAttempt'
    },
    {
        'column': 'nPaidOff',
        'from': np.nan,
        'to': 0
    }  
    ]

# Create a preprocessing pipeline
X_train, X_test, y_train, y_test =  (
    pre_processor
    .drop_rows_with_nulls_in_columns(columns=['loanId', 'apr', 'loanAmount', 'payFrequency', 'loanStatus'])
    .replace_values(replacements)
    .handle_missing_values(strategy='most_frequent', columns=['state'])
 
    .drop_columns(["loanId", 'applicationDate', 'originatedDate', 'clarityFraudId', 'state'])
    .drop_duplicates()
    .split_data(target_column="loanStatus")
)

2024-09-18 01:25:49,030 - INFO - Extracted data from CSV file: ../data/loan.csv
2024-09-18 01:25:49,604 - INFO - Dropped rows with null values in column loanId. Before: (577682, 19), After: (577426, 19)
2024-09-18 01:25:49,813 - INFO - Dropped rows with null values in column apr. Before: (577426, 19), After: (573760, 19)
2024-09-18 01:25:50,026 - INFO - Dropped rows with null values in column loanAmount. Before: (573760, 19), After: (573731, 19)
2024-09-18 01:25:50,250 - INFO - Dropped rows with null values in column payFrequency. Before: (573731, 19), After: (572512, 19)
2024-09-18 01:25:50,497 - INFO - Dropped rows with null values in column loanStatus. Before: (572512, 19), After: (572377, 19)
2024-09-18 01:25:50,498 - INFO - Final shape after dropping rows with null values in columns: ['loanId', 'apr', 'loanAmount', 'payFrequency', 'loanStatus']. Initial shape: (577682, 19), Final shape: (572377, 19)
2024-09-18 01:25:50,576 - INFO - Replaced nan with NoAchAttempt in column: fpStatu

In [108]:
X_train.dtypes

anon_ssn                            category
payFrequency                        category
apr                                  float64
originated                              bool
nPaidOff                             float64
approved                                bool
isFunded                               int64
loanAmount                           float64
originallyScheduledPaymentAmount     float64
leadType                            category
leadCost                               int64
fpStatus                            category
hasCF                                  int64
dtype: object

In [124]:
categorical_features = [col for col in X_train.columns if X_train[col].dtype == 'category']

for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')
    

sample = X_test.head(1).to_dict(orient='records')[0]
columns = list(sample.keys())
values = list(sample.values())

In [126]:
sample

{'anon_ssn': '562ce550f0b6dfb9372d44ec79e8c908',
 'payFrequency': 'B',
 'apr': 359.0,
 'originated': False,
 'nPaidOff': 0.0,
 'approved': False,
 'isFunded': 0,
 'loanAmount': 500.0,
 'originallyScheduledPaymentAmount': 1018.77,
 'leadType': 'bvMandatory',
 'leadCost': 3,
 'fpStatus': 'NoAchAttempt',
 'hasCF': 0}

In [110]:
categorical_features

['anon_ssn', 'payFrequency', 'leadType', 'fpStatus']

In [125]:
import requests
import json


sample = X_test.head(1).to_dict(orient='records')[0]
columns = list(sample.keys())
values = list(sample.values())

# Sample input data
data = {
    "dataframe_split": {
        "columns": columns,
        "data": [values]
    }
    
}

headers = {
    'Content-Type': 'application/json',
}

# Send a request to the model's local endpoint
response = requests.post(
    url="http://127.0.0.1:8001/invocations",
    headers=headers,
    json=data
)
# dataframe_split
# dataframe_records

# Print the prediction result
print("Prediction response:", response.json())

{'dataframe_split': {'columns': ['anon_ssn', 'payFrequency', 'apr', 'originated', 'nPaidOff', 'approved', 'isFunded', 'loanAmount', 'originallyScheduledPaymentAmount', 'leadType', 'leadCost', 'fpStatus', 'hasCF'], 'data': [['562ce550f0b6dfb9372d44ec79e8c908', 'B', 359.0, False, 0.0, False, 0, 500.0, 1018.77, 'bvMandatory', 3, 'NoAchAttempt', 0]]}}
Prediction response: {'predictions': ['Withdrawn Application']}


In [28]:
import mlflow.pyfunc

model_uri = "mlruns/0/cbc495a818554cf5b9381401491d928b/artifacts/model"
model = mlflow.pyfunc.load_model(model_uri).get_raw_model()
model

In [29]:
model.

AttributeError: 'AutoML' object has no attribute 'model_version'

In [23]:
model.__version__

'2.2.0'

In [9]:
import mlflow
from mlflow.utils import env_manager as _EnvManager


mlflow.models.build_docker(
    model_uri="runs:/ff624604f8eb44d59b1211e52f1fe564/model",
    name="loan_predictor",
    enable_mlserver=False,
    install_java=True,
    install_mlflow=True,
    env_manager=_EnvManager.CONDA
)

Downloading artifacts:  86%|████████▌ | 6/7 [00:00<00:00, 721.79it/s] 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 309.93it/s]
2024/09/18 17:17:36 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 362.70it/s] 
2024/09/18 17:17:36 INFO mlflow.pyfunc.backend: Building docker image with name loan_predictor
#0 building with "desktop-linux" instance using docker driver

#1 [internal] load .dockerignore
#1 transferring context: 2B done
#1 DONE 0.0s

#2 [internal] load build definition from Dockerfile
#2 transferring dockerfile: 2.00kB done
#2 DONE 0.0s

#3 [internal] load metadata for docker.io/library/ubuntu:20.04
#3 ERROR: failed to do request: Head "https://registry-1.docker.io/v2/library/ubuntu/manifests/20.04": dialing registry-1.docker.io:443 with direct connection: resolving host registry-1.docker.io: lookup registry-1.docker.io on 127.0.0.53:53: no such host
------
 > [internal] load metadata for docker.io/library/ubuntu:20.04:
------
Dockerfi

RuntimeError: Docker build failed.

In [122]:
mod.predict(X_test.head(1))
# Print the mapping
# print(f"Class labels: {class_labels}")

# Now map the predicted value [2] back to the category
# predicted_class = class_labels[response[0]]
# print(f"Predicted category: {predicted_class}")

array(['Withdrawn Application'], dtype=object)

In [121]:
mod.classes_

array(['CSR Voided New Loan', 'Charged Off Paid Off',
       'Credit Return Void', 'Customer Voided New Loan',
       'Customver Voided New Loan', 'External Collection',
       'Internal Collection', 'New Loan', 'Paid Off Loan',
       'Pending Application', 'Pending Application Fee',
       'Pending Paid Off', 'Pending Rescind', 'Rejected', 'Returned Item',
       'Settled Bankruptcy', 'Settlement Paid Off', 'Voided New Loan',
       'Withdrawn Application'], dtype=object)

In [111]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train, categorical_feature=categorical_features)

In [112]:
categorical_features


['anon_ssn', 'payFrequency', 'leadType', 'fpStatus']

In [114]:

clf.classes_

array(['CSR Voided New Loan', 'Charged Off Paid Off',
       'Credit Return Void', 'Customer Voided New Loan',
       'Customver Voided New Loan', 'External Collection',
       'Internal Collection', 'New Loan', 'Paid Off Loan',
       'Pending Application', 'Pending Application Fee',
       'Pending Paid Off', 'Pending Rescind', 'Rejected', 'Returned Item',
       'Settled Bankruptcy', 'Settlement Paid Off', 'Voided New Loan',
       'Withdrawn Application'], dtype=object)