In [50]:
import pandas as pd 
import numpy as np

# Reading Data 

In [12]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Set the experiment name
experiment_name = "Credit_Card_Approval"

# MLflow setup
mlflow.set_experiment(experiment_name)
# mlflow.set_tracking_uri("sqlite:///credit_card_approval.db")

# Read the data
data = pd.read_excel(r"C:\Users\ZiedTriki\OneDrive - Brand Delta\Desktop\Zied DS\final_mlops_credit_default\src\Credit_Card_Approval_prediction.xlsx")

# Identify features and target
X = data.drop(columns=['Credit_Card_Approval', 'Ind_ID', 'EMAIL_ID'], axis=1)
y = data['Credit_Card_Approval']

# Initial Processing : 
X['Birthday_count'].fillna(0, inplace=True)  # Replace NaN with 0
X['Age'] = np.abs(np.floor(X['Birthday_count'] / 365)).astype(int)
X = X.drop(columns=['Birthday_count'], axis=1)
X['Employed_days'] = np.abs(X['Employed_days'])

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Define preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('count_frequency', CountFrequencyEncoder())
])

# Create the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ])

# Fit the preprocessor and transform the data
X_processed = preprocessor.fit_transform(X)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['Birthday_count'].fillna(0, inplace=True)  # Replace NaN with 0


In [10]:
X_processed

array([[0.00000e+00, 1.80000e+05, 3.65243e+05, ..., 1.04900e+03,
        1.38000e+03, 7.56000e+02],
       [0.00000e+00, 3.15000e+05, 5.86000e+02, ..., 1.04900e+03,
        1.38000e+03, 7.56000e+02],
       [0.00000e+00, 3.15000e+05, 5.86000e+02, ..., 1.04900e+03,
        1.38000e+03, 7.56000e+02],
       ...,
       [2.00000e+00, 1.80000e+05, 2.47700e+03, ..., 1.04900e+03,
        1.38000e+03, 1.36000e+02],
       [0.00000e+00, 2.70000e+05, 6.45000e+02, ..., 1.01000e+02,
        1.38000e+03, 8.60000e+01],
       [0.00000e+00, 2.25000e+05, 2.85900e+03, ..., 1.04900e+03,
        1.38000e+03, 7.56000e+02]])

In [None]:

# Evaluate the model with selected features
X_selected = rfe.transform(X_processed)

# Apply SMOTE for imbalance handling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_selected, y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2,  stratify=y_resampled, random_state=42)

# Define the objective function for Hyperopt
def objective(params):
    with mlflow.start_run(nested=True):
        mlflow.set_tag('Model', 'XGBoost_Credit_Card_Approval')
        mlflow.log_params(params)
        
        model = XGBClassifier(
            n_estimators=int(params['n_estimators']),
            max_depth=int(params['max_depth']),
            learning_rate=params['learning_rate'],
            reg_alpha=params['reg_alpha'],
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )
        
        model.fit(X_train, y_train)

        preds_train = model.predict(X_train)
        preds_test = model.predict(X_test)

        accuracy_train = accuracy_score(y_train, preds_train)
        accuracy_test = accuracy_score(y_test, preds_test)
        
        mlflow.log_metric("Train_Accuracy", accuracy_train)
        mlflow.log_metric("Test_Accuracy", accuracy_test)
        
        
        with open("preprocessor.pkl", "wb") as f_out:
            pickle.dump(preprocessor, f_out)
        mlflow.log_artifact("preprocessor.pkl", artifact_path="preprocessor")

        mlflow.xgboost.log_model(model, artifact_path="Models_Mlflow")
        
        return {'loss': -accuracy_test, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 300, 1100, 200)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5 ),
    'reg_alpha': hp.uniform('reg_alpha', 0, 5),
  
}

# Perform hyperparameter optimization
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
    rstate=np.random.default_rng(42)
)

# Train the final model with the best parameters
best_model = XGBClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    reg_alpha=best_params['reg_alpha'],
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

best_model.fit(X_train, y_train)

# Evaluate the final model
preds_train = best_model.predict(X_train)
preds_test = best_model.predict(X_test)

accuracy_train = accuracy_score(y_train, preds_train)
accuracy_test = accuracy_score(y_test, preds_test)

report = classification_report(y_test, preds_test)

print(f"Best parameters: {best_params}")
print(f"Train Accuracy: {accuracy_train}")
print(f"Test Accuracy: {accuracy_test}")
print("Classification Report:")
print(report)

# Log the final model with MLflow
with mlflow.start_run() as run:
    with open("preprocessor.pkl", "wb") as f_out:
        pickle.dump(preprocessor, f_out)
    mlflow.log_artifact("preprocessor.pkl", artifact_path="preprocessor")

    mlflow.xgboost.log_model(best_model, artifact_path="Models_Mlflow")

    mlflow.log_params(best_params)
    mlflow.log_metric("Train_Accuracy", accuracy_train)
    mlflow.log_metric("Test_Accuracy", accuracy_test)

    mlflow.log_text(report, "classification_report.txt")

# Save the model to a file
model_path = "xgboost_credit_card_approval.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

print(f"Model saved to {model_path}")


In [1]:
import pickle 
import pandas as pd

with open(r"C:\Users\ZiedTriki\OneDrive - Brand Delta\Desktop\Zied DS\final_mlops_credit_default\project\mlruns\541889191873401641\41d597bc4c8941659cbd007ddab032bc\artifacts\preprocessor\preprocessor.pkl", "rb") as f:
    preprocessor = pickle.load(f)

new_data = [
    {
        "Annual_income": 50000,
        "Employed_days": 1095,
        "Family_Members": 3,
        "Type_Income": "Pensioner",
        "Housing_type": "Municipal apartment"
    }
]

print(preprocessor.feature_names_in_)


['GENDER' 'Car_Owner' 'Propert_Owner' 'CHILDREN' 'Annual_income'
 'Type_Income' 'EDUCATION' 'Marital_status' 'Housing_type' 'Employed_days'
 'Mobile_phone' 'Work_Phone' 'Phone' 'Type_Occupation' 'Family_Members'
 'Age']


In [2]:
import pandas as pd
import numpy as np
import pickle
from flask import Flask, request, jsonify
import mlflow
from mlflow import MlflowClient




from pprint import pprint

client = MlflowClient()
for rm in client.search_registered_models():
    pprint(dict(rm), indent=4)

{   'aliases': {},
    'creation_timestamp': 1719713719763,
    'description': '',
    'last_updated_timestamp': 1719713719786,
    'latest_versions': [   <ModelVersion: aliases=[], creation_timestamp=1719713719786, current_stage='None', description='', last_updated_timestamp=1719713719786, name='Challenger_Credit_Card_Approval_Model', run_id='48968404727d4175922735ebc9e98f46', run_link='', source='file:///C:/Users/ZiedTriki/OneDrive%20-%20Brand%20Delta/Desktop/Zied%20DS/final_mlops_credit_default/project/mlruns/871424863682295937/48968404727d4175922735ebc9e98f46/artifacts/Models_Mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>],
    'name': 'Challenger_Credit_Card_Approval_Model',
    'tags': {}}
{   'aliases': {'champion': '1'},
    'creation_timestamp': 1719713689280,
    'description': '',
    'last_updated_timestamp': 1719715091547,
    'latest_versions': [   <ModelVersion: aliases=['champion'], creation_timestamp=1719713689304, current_stage='None',

In [13]:
data = pd.read_excel(r"C:\Users\ZiedTriki\OneDrive - Brand Delta\Desktop\Zied DS\final_mlops_credit_default\src\Credit_Card_Approval_prediction.xlsx")
data.columns

Index(['Ind_ID', 'GENDER', 'Car_Owner', 'Propert_Owner', 'CHILDREN',
       'Annual_income', 'Type_Income', 'EDUCATION', 'Marital_status',
       'Housing_type', 'Birthday_count', 'Employed_days', 'Mobile_phone',
       'Work_Phone', 'Phone', 'EMAIL_ID', 'Type_Occupation', 'Family_Members',
       'Credit_Card_Approval'],
      dtype='object')

In [2]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor 
from sklearn.feature_extraction import DictVectorizer


In [4]:
preprocessor = DictVectorizer()
model = RandomForestRegressor()

pipeline = make_pipeline(preprocessor,
                         model)

In [5]:
pipeline