# Final Arc

In [3]:
# General
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

# Prepo 1 (Visualisasi)
import missingno as mno
from fast_ml.feature_selection import get_duplicate_features

# Prepo 2
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import FunctionTransformer

# Model
from imblearn.over_sampling import ADASYN
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [4]:
feature = pd.read_csv('Credit_card.csv') 
label = pd.read_csv('Credit_card_label.csv')

In [5]:
# Gabungkan data feature dengan label dengan Ind_ID
data = pd.merge(feature, label, on='Ind_ID')
data

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,label
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2,1
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2,1
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,5028645,F,N,Y,0,,Commercial associate,Higher education,Married,House / apartment,-11957.0,-2182,1,0,0,0,Managers,2,0
1544,5023655,F,N,N,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,-10229.0,-1209,1,0,0,0,Accountants,1,0
1545,5115992,M,Y,Y,2,180000.0,Working,Higher education,Married,House / apartment,-13174.0,-2477,1,0,0,0,Managers,4,0
1546,5118219,M,Y,N,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,-15292.0,-645,1,1,1,0,Drivers,2,0


In [6]:
# Menghapus kolom Ind_ID
data = data.drop(columns=['Ind_ID'])

In [7]:
# Ubah tipe Mobile_phone, Work_phone, Phone, Email, dan label menjadi kategori
data['Mobile_phone'] = data['Mobile_phone'].astype('category')
data['Work_Phone'] = data['Work_Phone'].astype('category')
data['Phone'] = data['Phone'].astype('category')
data['EMAIL_ID'] = data['EMAIL_ID'].astype('category')
data['label'] = data['label'].astype('category')

data = data.drop(columns=['Mobile_phone'])
data = data.drop(columns=['Propert_Owner'])

In [8]:
data

Unnamed: 0,GENDER,Car_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,label
0,M,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,0,0,0,,2,1
1,F,Y,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,0,,2,1
2,F,Y,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,0,,2,1
3,F,Y,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,0,,2,1
4,F,Y,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,0,,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,F,N,0,,Commercial associate,Higher education,Married,House / apartment,-11957.0,-2182,0,0,0,Managers,2,0
1544,F,N,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,-10229.0,-1209,0,0,0,Accountants,1,0
1545,M,Y,2,180000.0,Working,Higher education,Married,House / apartment,-13174.0,-2477,0,0,0,Managers,4,0
1546,M,Y,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,-15292.0,-645,1,1,0,Drivers,2,0


In [9]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

X = data.drop(columns="label")
y = data["label"]

# Define the feature engineering function
def feature_engineering(df):
    df["Years_Employed"] = df["Employed_days"] // 365
    df["Birthday_count"] = df["Birthday_count"].abs()
    df["Income_per_Family_Member"] = df["Annual_income"] / df["Family_Members"]
    df["Children_per_Family_Member"] = df["CHILDREN"] / df["Family_Members"]
    df["Is_Employed"] = df["Employed_days"] > 0
    return df

# Create the FunctionTransformer
feature_engineering_transformer = FunctionTransformer(feature_engineering)

# Update the feature lists
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Define the numeric transformer
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Define the categorical transformer
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Define the preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Create and apply pipeline
pipeline = Pipeline(
    steps=[
        ("feature_engineering", feature_engineering_transformer),
        ("preprocessor", preprocessor),
    ]
)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocess the data
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

In [11]:
X_train_processed.info()

AttributeError: 'csr_matrix' object has no attribute 'info'

In [None]:
data['Car_Owner'].unique()

array(['Y', 'N'], dtype=object)

In [None]:
import joblib

# Load the model
model = joblib.load("CatBoost_final.pkl")

# Print the model's expected features
print(model.feature_names_)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55']


In [None]:
data['Work_Phone'].unique()

[0, 1]
Categories (2, int64): [0, 1]

In [None]:
data['Phone'].unique()

[0, 1]
Categories (2, int64): [0, 1]

In [None]:
data['EMAIL_ID'].unique()

[0, 1]
Categories (2, int64): [0, 1]

In [None]:
pipeline

In [None]:
import joblib

# Assuming 'pipeline' is your preprocessing pipeline and 'xgb_model' is your trained model

# Save the preprocessing pipeline
joblib.dump(pipeline, 'preprocessing_pipeline.pkl')

['preprocessing_pipeline.pkl']

In [None]:
!pip install streamlit



In [None]:
input_df = get_user_input()

# Ensure the required feature is present
required_feature = 'Type_Occupation_Secretaries'
if required_feature not in input_df.columns:
    input_df[required_feature] = 0  # or another appropriate default value

# Preprocess the input data
input_processed = pipeline.transform(input_df)

# Load the model
model = joblib.load("CatBoost_final_2.pkl")

# Make predictions
prediction = model.predict(input_processed)
prediction_prob = model.predict_proba(input_processed)

# Display the prediction and probabilities
st.write(f"Prediction: {'Positive' if prediction[0] == 1 else 'Negative'}")
st.write(f"Prediction Probability: {prediction_prob[0]}")

if __name__ == "__main__":
    main()

NameError: name 'get_user_input' is not defined