In [1]:
import pandas as pd
import numpy as np
import pickle

# **Load Model**

In [79]:
filename = 'final_model_gbc_bank_marketing_campaign.sav'
loaded_model = pickle.load(open(filename, 'rb'))

In [80]:
loaded_model

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



# **Create Observation**

In [81]:
new_cust = pd.DataFrame({
    'age': [56],
    'housing' : ['no'],
    'loan' : ['no'],
    'contact' : ['unknown'],
    'balance' : [1000],
    'job' : ['admin'],
    'month' : ['jan'],
    'campaign' : [2],
    'pdays' : [-1],
    'poutcome' : ['failure']
    })
new_cust

Unnamed: 0,age,housing,loan,contact,balance,job,month,campaign,pdays,poutcome
0,56,no,no,unknown,1000,admin,jan,2,-1,failure


In [73]:
new_cust2 = pd.DataFrame({
    'age': [24],
    'housing' : ['no'],
    'loan' : ['no'],
    'contact' : ['cellular'],
    'balance' : [448],
    'job' : ['management'],
    'day' : [5],
    'month' : ['apr'],
    'campaign' : [1],
    'pdays' : [-1],
    'poutcome' : ['unknown']
    })
new_cust2

Unnamed: 0,age,housing,loan,contact,balance,job,day,month,campaign,pdays,poutcome
0,24,no,no,cellular,448,management,5,apr,1,-1,unknown


In [74]:
loaded_model.predict(new_cust)

array(['no'], dtype=object)

In [75]:
loaded_model.predict(new_cust2)

array(['yes'], dtype=object)

In [82]:
df = pd.read_csv("D:\DTI\ML Code\Capstone\data_bank_marketing_campaign.csv")
df.head()

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome,deposit
0,55,admin.,1662,no,no,cellular,jun,2,-1,unknown,yes
1,39,self-employed,-3058,yes,yes,cellular,apr,3,-1,unknown,yes
2,51,admin.,3025,no,no,cellular,may,1,352,other,yes
3,38,services,-87,yes,no,cellular,may,1,-1,unknown,no
4,36,housemaid,205,yes,no,telephone,nov,4,-1,unknown,no


In [83]:
# Remove the column 'deposit'
X_test = df.drop('deposit',axis=1)

In [84]:
predictions = loaded_model.predict(X_test)

ValueError: Found unknown categories ['admin.'] in column 0 during transform

In [None]:
y_true = df['deposit']
y_pred = loaded_model.predict(X_test)

In [64]:
# Save the DataFrame to a CSV file
df.to_csv('x_test_bank_marketing.csv', index=False)

In [65]:
# Load the trained model and preprocessing pipeline using pickle
filename = 'final_model_gbc_bank_marketing_campaign.sav'
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
import pickle

In [63]:
# Separate the features and the target variable
X = df.drop('deposit', axis=1)
y = df['deposit']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=0
)

# Define the preprocessing steps with handle_unknown='ignore'
transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['job', 'housing', 'loan', 'contact', 'month', 'poutcome'])],
    remainder='passthrough'
)

# Create a pipeline that includes the transformer, scaler, and model
pipeline = Pipeline(steps=[
    ('preprocessor', transformer),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', GradientBoostingClassifier(random_state=0))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Save the entire pipeline (including the preprocessor and scaler)
filename = 'final_model_gbc_bank_marketing_campaign.sav'
with open(filename, 'wb') as file:
    pickle.dump(pipeline, file)

print("Model and preprocessing pipeline saved to 'final_model_gbc_bank_marketing_campaign.sav'.")

Model and preprocessing pipeline saved to 'final_model_gbc_bank_marketing_campaign.sav'.


In [85]:
from sklearn.metrics import mean_squared_error, f1_score, recall_score, classification_report, RocCurveDisplay


In [87]:
transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['job', 'housing', 'loan', 'contact', 'month', 'poutcome'])],
    remainder='passthrough'
)

In [89]:
X_train

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome
4378,30,technician,1011,yes,no,unknown,may,1,-1,unknown
7733,33,services,1082,yes,yes,cellular,nov,1,-1,unknown
1837,51,housemaid,618,yes,yes,cellular,feb,1,-1,unknown
4187,36,management,102,yes,no,cellular,apr,1,-1,unknown
192,45,admin.,0,no,no,cellular,oct,1,-1,unknown
...,...,...,...,...,...,...,...,...,...,...
3444,42,admin.,1143,yes,yes,telephone,nov,1,-1,unknown
1958,36,admin.,8785,yes,no,cellular,nov,2,-1,unknown
2342,56,technician,1561,no,no,cellular,aug,1,-1,unknown
4084,51,services,-553,yes,no,unknown,jun,8,-1,unknown


In [90]:
data = pd.DataFrame(transformer.fit_transform(X_train))
column = transformer.get_feature_names_out()

X_train_processed = pd.DataFrame(data= data, columns = column)

In [93]:
data = pd.DataFrame(transformer.transform(X_test))
column = transformer.get_feature_names_out()

X_test_processed = pd.DataFrame(data= data, columns = column)

In [94]:
model = GradientBoostingClassifier()

model.fit(X_train_processed, y_train)
y_pred = model.transform(X_test_processed)

print(classification_report(y_test, y_pred))

ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [68]:
# Make predictions using the loaded pipeline
predictions = loaded_model.predict(X)

# Create a DataFrame with predictions
df_predictions = pd.DataFrame(predictions, columns=['predictions'])

df_predictions


Unnamed: 0,predictions
0,yes
1,no
2,yes
3,no
4,no
...,...
7808,yes
7809,yes
7810,no
7811,no
