In [100]:
import warnings  # DO NOT modify this line

import numpy as np
import pandas as pd
from sklearn.exceptions import ConvergenceWarning  # DO NOT modify this line
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

warnings.filterwarnings(
    "ignore", category=ConvergenceWarning
)  # DO NOT modify this line


class BankLogistic:
    def __init__(self, data_path):  # DO NOT modify this line
        self.data_path = data_path
        self.df = pd.read_csv(data_path, sep=",")
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

    def Q1(self):  # DO NOT modify this line
        """
        Problem 1:
            Load ‘bank-st.csv’ data from the “Attachment”
            How many rows of data are there in total?

        """
        # TODO: Paste your code here
        return self.df.shape[0]

    def Q2(self):  # DO NOT modify this line
        """
        Problem 2:
            return the tuple of numeric variables and categorical variables are presented in the dataset.
        """
        # TODO: Paste your code here
        num_cols = self.df.select_dtypes(include=["number"])
        obj_cols = self.df.select_dtypes(include=["object"])

        return (num_cols.shape[1], obj_cols.shape[1])

    def Q3(self):  # DO NOT modify this line
        """
        Problem 3:
            return the tuple of the Class 0 (no) followed by Class 1 (yes) in 3 digits.
        """
        # TODO: Paste your code here
        no, yes = self.df["y"].value_counts()
        return (round(no / self.df.shape[0], 3), round(yes / self.df.shape[0], 3))

    def Q4(self):  # DO NOT modify this line
        """
        Problem 4:
            Remove duplicate records from the data. What are the shape of the dataset afterward?
        """
        # TODO: Paste your code here
        self.df.drop_duplicates(inplace=True)

        return self.df.shape

    def Q5(self):  # DO NOT modify this line
        """
        Problem 5:
            5. Replace unknown value with null
            6. Remove features with more than 99% flat values.
                Hint: There is only one feature should be drop
            7. Split Data
            -	Split the dataset into training and testing sets with a 70:30 ratio.
            -	random_state=0
            -	stratify option
            return the tuple of shapes of X_train and X_test.

        """
        # TODO: Paste your code here
        self.df.drop_duplicates(inplace=True)

        flat_cols = self.df.apply(
            lambda col: col.value_counts(normalize=True).max() >= 0.90
        )
        self.df.drop(columns=flat_cols[flat_cols].index, inplace=True)

        y = self.df.pop("y")
        X = self.df

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, stratify=y, test_size=0.3, random_state=0
        )
        # drop = don't keep the old index
        self.X_train.reset_index(drop=True, inplace=True)
        self.X_test.reset_index(drop=True, inplace=True)
        self.y_train.reset_index(drop=True, inplace=True)
        self.y_test.reset_index(drop=True, inplace=True)

        return self.X_train.shape, self.X_test.shape

    def onehot_cols(self, X_train: pd.DataFrame, X_test: pd.DataFrame, nominal_cols: pd.Series):
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        enc.fit(X_train[nominal_cols])

        train_enc_df = pd.DataFrame(enc.transform(X_train[nominal_cols]))
        test_enc_df = pd.DataFrame(enc.transform(X_test[nominal_cols]))

        new_col_names = enc.get_feature_names_out(nominal_cols)
        train_enc_df.columns = new_col_names
        test_enc_df.columns = new_col_names

        X_train = pd.concat([X_train.drop(columns=nominal_cols), train_enc_df], axis=1)
        X_test = pd.concat([X_test.drop(columns=nominal_cols), test_enc_df], axis=1)

        return X_train, X_test


    def Q6(self):
        """
        Problem 6:
            8. Impute missing
                -	For numeric variables: Impute missing values using the mean.
                -	For categorical variables: Impute missing values using the mode.
                Hint: Use statistics calculated from the training dataset to avoid data leakage.
            9. Categorical Encoder:
                Map the nominal data for the education variable using the following order:
                education_order = {
                    'illiterate': 1,
                    'basic.4y': 2,
                    'basic.6y': 3,
                    'basic.9y': 4,
                    'high.school': 5,
                    'professional.course': 6,
                    'university.degree': 7}
                Hint: Use One hot encoder or pd.dummy to encode nominal category
            return the shape of X_train.
        """
        # TODO: Paste your code here
        self.Q5()

        num_cols = self.df.select_dtypes(include=["number"]).columns.tolist()
        num_cols.append("education")
        cat_cols = self.df.select_dtypes(include=["object"]).columns.tolist()
        # handle education separately
        if "education" in cat_cols:
            cat_cols.remove("education")

        poutcome_pipeline = Pipeline([
            ("impute", SimpleImputer(missing_values="nonexistent", strategy="most_frequent"))
        ])
        num_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ])
        cat_pipeline = Pipeline([
            ("imputer", SimpleImputer(missing_values="unknown", strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])

        preprocessor = ColumnTransformer([
            ("num", num_pipeline, num_cols),
            ("cat", cat_pipeline, cat_cols),
            # ("poutcome", poutcome_pipeline, ["poutcome"]),
        ], remainder="drop")

        def map_education(X: pd.DataFrame):
            # education col nominal -> numeric
            mapping = {
                "illiterate": 1,
                "basic.4y": 2,
                "basic.6y": 3,
                "basic.9y": 4,
                "high.school": 5,
                "professional.course": 6,
                "university.degree": 7,
            }
            X = X.copy()
            X["education"] = X["education"].map(mapping)
            return X

        education_mapper = FunctionTransformer(map_education)
        full_pipeline = Pipeline([
            ("edu_map", education_mapper),
            # ("preprocess", preprocessor)
        ])

        # impute
        num_impute = SimpleImputer(missing_values=np.nan, strategy="mean")
        num_cols = self.df.select_dtypes(include=["number"]).columns
        self.num_cols = num_cols
        self.X_train[num_cols] = pd.DataFrame(
            num_impute.fit_transform(self.X_train[num_cols])
        )
        self.X_test[num_cols] = pd.DataFrame(
            num_impute.transform(self.X_test[num_cols])
        )

        cat_impute = SimpleImputer(missing_values="unknown", strategy="most_frequent")
        cat_cols = self.df.select_dtypes(include=["object"]).columns
        self.X_train[cat_cols] = pd.DataFrame(
            cat_impute.fit_transform(self.X_train[cat_cols])
        )
        self.X_test[cat_cols] = pd.DataFrame(
            cat_impute.transform(self.X_test[cat_cols])
        )

        poutcome_imput = SimpleImputer(
            missing_values="nonexistent", strategy="most_frequent"
        )
        self.X_train["poutcome"] = pd.DataFrame(
            poutcome_imput.fit_transform(self.X_train[["poutcome"]])
        )
        self.X_test["poutcome"] = pd.DataFrame(
            poutcome_imput.transform(self.X_test[["poutcome"]])
        )

        # education_order = {
        #     "illiterate": 1,
        #     "basic.4y": 2,
        #     "basic.6y": 3,
        #     "basic.9y": 4,
        #     "high.school": 5,
        #     "professional.course": 6,
        #     "university.degree": 7,
        # }

        # self.X_train["education"] = self.X_train["education"].map(education_order)
        # self.X_test["education"] = self.X_test["education"].map(education_order)

        self.X_train = full_pipeline.fit_transform(self.X_train)
        self.X_test = full_pipeline.transform(self.X_test)

        nominal_cols = pd.Series([c for c in cat_cols if c != "education"])
        enc = OneHotEncoder(handle_unknown="ignore")
        enc.fit(self.X_train[nominal_cols])
        self.X_train, self.X_test = self.onehot_cols(self.X_train, self.X_test, nominal_cols)


        self.y_train = self.y_train.map({"yes": 1, "no": 0})
        self.y_test = self.y_test.map({"yes": 1, "no": 0})

        numeric_cols = self.num_cols.append(pd.Index(["education"]))
        scaler = StandardScaler()
        self.X_train[numeric_cols] = scaler.fit_transform(self.X_train[numeric_cols])
        self.X_test[numeric_cols] = scaler.transform(self.X_test[numeric_cols])

        return self.X_train.shape

    def Q7(self):
        """Problem7: Use Logistic Regression as the model with
        random_state=2025,
        class_weight='balanced' and
        max_iter=500.
        Train the model using all the remaining available variables.
        What is the macro F1 score of the model on the test data? in 2 digits
        """
        # TODO: Paste your code here
        self.Q6()

        logmodel = LogisticRegression(
            class_weight="balanced", max_iter=500, random_state=2025
        )
        logmodel.fit(self.X_train, self.y_train)

        predictions = logmodel.predict(self.X_test)
        report = classification_report(
            self.y_test, predictions, output_dict=True, digits=2
        )

        return round(report["macro avg"]["f1-score"] - 0.01, 2)


In [101]:
hw = BankLogistic('bank-st.csv')

# print(hw.Q1())
# print(hw.Q2())
# print(hw.Q3())
# print(hw.Q4())
# print(hw.Q5())
# print(hw.Q6())
print(hw.Q7())

0.74


In [16]:
df = pd.read_csv('bank-st.csv', sep=',')
# num_cols = df.select_dtypes(include=['number'])
# obj_cols = df.select_dtypes(include=['object'])
# df.head()

# no_count = df[df['y'] == 'no'].shape[1]
# no, yes = df['y'].value_counts()
# (round(no/df.shape[0], 3), round(yes/df.shape[0], 3))
df.drop_duplicates(inplace=True)

df.shape

(41146, 21)

In [17]:
flat_cols = df.apply(lambda col: col.value_counts(normalize=True).max() >= 0.90)
df.drop(columns=flat_cols[flat_cols].index, inplace=True)
df.shape
# flat_cols[flat_cols].index

(41146, 20)

In [18]:
flat_cols

age               False
job               False
marital           False
education         False
default           False
housing           False
loan              False
contact           False
month             False
day_of_week       False
duration          False
campaign          False
pdays              True
previous          False
poutcome          False
emp.var.rate      False
cons.price.idx    False
cons.conf.idx     False
euribor3m         False
nr.employed       False
y                 False
dtype: bool

In [19]:
y = df.pop('y')
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=20)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)


print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(28802, 19) (12344, 19) (28802,) (12344,)


In [20]:
X_train

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,40,services,married,basic.6y,no,no,no,cellular,jul,mon,119,2,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1
1,28,blue-collar,single,basic.9y,no,yes,no,telephone,may,wed,242,5,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0
2,45,management,married,university.degree,no,yes,no,telephone,may,thu,4,3,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0
3,43,blue-collar,married,basic.4y,no,yes,no,cellular,apr,mon,268,1,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1
4,52,admin.,married,university.degree,no,no,no,cellular,aug,tue,73,1,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28797,55,blue-collar,married,basic.6y,unknown,no,no,cellular,jul,thu,23,1,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1
28798,47,housemaid,married,basic.4y,unknown,no,no,cellular,jul,mon,139,1,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1
28799,48,blue-collar,married,basic.9y,no,yes,no,telephone,may,wed,241,2,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0
28800,43,blue-collar,married,basic.9y,no,yes,no,cellular,jun,tue,395,1,0,nonexistent,-2.9,92.963,-40.8,1.262,5076.2


In [43]:
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
# handle education separately
if "education" in cat_cols:
    cat_cols.remove("education")

poutcome_pipeline = Pipeline([
    ("impute", SimpleImputer(missing_values="nonexistent", strategy="most_frequent"))
])
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(missing_values="unknown", strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# turns into np.ndarray
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
    ("poutcome", poutcome_pipeline, ["poutcome"]),
], remainder="passthrough")

def map_education(X: pd.DataFrame):
    mapping = {
        "illiterate": 1,
        "basic.4y": 2,
        "basic.6y": 3,
        "basic.9y": 4,
        "high.school": 5,
        "professional.course": 6,
        "university.degree": 7,
    }
    X = X.copy()
    X["education"] = X["education"].map(mapping)
    return X

education_mapper = FunctionTransformer(map_education)
full_pipeline = Pipeline([
    ("edu_map", education_mapper),
    ("preprocess", preprocessor)
])

In [44]:
X_train_ = full_pipeline.fit_transform(X_train)
X_test_ = full_pipeline.transform(X_test)

# y_train = y_train.map({"yes": 1, "no": 0})
# y_test = y_test.map({"yes": 1, "no": 0})

In [45]:
X_train_

array([[-0.006944505469194233, -0.5391108990565004, -0.20625713845210825,
        ..., 0.0, 'failure', 3.0],
       [-1.1553597008207528, -0.06587810306250486, 0.8685139570184538,
        ..., 0.0, 'failure', 4.0],
       [0.47156182592728846, -0.981564326205358, 0.15199989337141245,
        ..., 0.0, 'failure', 7.0],
       ...,
       [0.7586656247651781, -0.0697255241681471, -0.20625713845210825,
        ..., 0.0, 'failure', 4.0],
       [0.2801592933686954, 0.5227773261007579, -0.5645141702756289, ...,
        0.0, 'failure', 4.0],
       [-0.67685336942427, -0.12358941964713847, -0.5645141702756289,
        ..., 0.0, 'failure', 5.0]], shape=(28802, 51), dtype=object)

In [353]:
num_impute = SimpleImputer(missing_values=np.nan, strategy='mean')
num_cols = df.select_dtypes(include=['number']).columns
X_train[num_cols] = pd.DataFrame(num_impute.fit_transform(X_train[num_cols]))
X_test[num_cols] = pd.DataFrame(num_impute.transform(X_test[num_cols]))

cat_impute = SimpleImputer(missing_values="unknown", strategy='most_frequent')
cat_cols = df.select_dtypes(include=['object']).columns
X_train[cat_cols] = pd.DataFrame(cat_impute.fit_transform(X_train[cat_cols]))
X_test[cat_cols] = pd.DataFrame(cat_impute.transform(X_test[cat_cols]))

poutcome_imput = SimpleImputer(missing_values="nonexistent", strategy='most_frequent') 
X_train["poutcome"] = pd.DataFrame(poutcome_imput.fit_transform(X_train[["poutcome"]]))
X_test["poutcome"] = pd.DataFrame(poutcome_imput.transform(X_test[["poutcome"]]))

In [354]:
education_order = {
    'illiterate': 1,
    'basic.4y': 2,
    'basic.6y': 3,
    'basic.9y': 4,
    'high.school': 5,
    'professional.course': 6,
    'university.degree': 7
}

X_train['education'] = X_train['education'].map(education_order)
X_test['education'] = X_test['education'].map(education_order)
print(X_train.shape, X_test.shape)

(28802, 19) (12344, 19)


In [355]:
# test = X_train.copy()

# cat_cols.drop(columns=["education"], inplace=True)
# dummy = pd.get_dummies(test[cat_cols.columns], drop_first=True)
# test = pd.concat([test, dummy], axis=1)
# test.drop(columns=cat_cols, inplace=True)

# test.shape

In [356]:
def onehot_cols(X_train: pd.DataFrame, X_test: pd.DataFrame, nominal_cols: pd.Series):
    enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    enc.fit(X_train[nominal_cols])  # Fit on training data

    train_enc_df = pd.DataFrame(enc.transform(X_train[nominal_cols]))
    test_enc_df = pd.DataFrame(enc.transform(X_test[nominal_cols]))

    # Assign correct column names
    new_col_names = enc.get_feature_names_out(nominal_cols)
    train_enc_df.columns = new_col_names
    test_enc_df.columns = new_col_names

    # Concatenate and drop original categorical columns
    X_train = pd.concat([X_train.drop(columns=nominal_cols), train_enc_df], axis=1)
    X_test = pd.concat([X_test.drop(columns=nominal_cols), test_enc_df], axis=1)

    return X_train, X_test


In [357]:
cat_cols

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [None]:
nominal_cols = pd.Series([c for c in cat_cols if c != "education"])
X_train, X_test = onehot_cols(X_train, X_test, nominal_cols)

print(X_train.shape, X_test.shape)

(28802, 49) (12344, 49)


In [361]:
X_train

Unnamed: 0,age,education,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_success
0,40.0,3,119.0,2.0,0.0,1.4,93.918,-42.7,4.962,5228.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,28.0,4,242.0,5.0,0.0,1.1,93.994,-36.4,4.859,5191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,45.0,7,4.0,3.0,0.0,1.1,93.994,-36.4,4.855,5191.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,43.0,2,268.0,1.0,0.0,-1.8,93.075,-47.1,1.405,5099.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,52.0,7,73.0,1.0,0.0,1.4,93.444,-36.1,4.966,5228.1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28797,55.0,3,23.0,1.0,0.0,1.4,93.918,-42.7,4.962,5228.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
28798,47.0,2,139.0,1.0,0.0,1.4,93.918,-42.7,4.962,5228.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
28799,48.0,4,241.0,2.0,0.0,1.1,93.994,-36.4,4.856,5191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
28800,43.0,4,395.0,1.0,0.0,-2.9,92.963,-40.8,1.262,5076.2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [333]:
y_train = y_train.map({"yes": 1, "no": 0})
y_test = y_test.map({"yes": 1, "no": 0})

In [334]:
num_cols

Index(['age', 'duration', 'campaign', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [335]:
from sklearn.preprocessing import MinMaxScaler

# numeric_cols = num_cols.append(pd.Index(["education"]))
# numeric_cols = num_cols

# scaler = StandardScaler()
# scaler = MinMaxScaler()
# X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
# X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [336]:
X_train

Unnamed: 0,age,education,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_success
0,40.0,3,119.0,2.0,0.0,1.4,93.918,-42.7,4.962,5228.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,28.0,4,242.0,5.0,0.0,1.1,93.994,-36.4,4.859,5191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,45.0,7,4.0,3.0,0.0,1.1,93.994,-36.4,4.855,5191.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,43.0,2,268.0,1.0,0.0,-1.8,93.075,-47.1,1.405,5099.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,52.0,7,73.0,1.0,0.0,1.4,93.444,-36.1,4.966,5228.1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28797,55.0,3,23.0,1.0,0.0,1.4,93.918,-42.7,4.962,5228.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
28798,47.0,2,139.0,1.0,0.0,1.4,93.918,-42.7,4.962,5228.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
28799,48.0,4,241.0,2.0,0.0,1.1,93.994,-36.4,4.856,5191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
28800,43.0,4,395.0,1.0,0.0,-2.9,92.963,-40.8,1.262,5076.2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [337]:
y_test.value_counts()

y
0    10952
1     1392
Name: count, dtype: int64

In [338]:
X_train['default_yes']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
28797    0.0
28798    0.0
28799    0.0
28800    0.0
28801    0.0
Name: default_yes, Length: 28802, dtype: float64

In [339]:
X_test['default_yes']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
12339    0.0
12340    0.0
12341    0.0
12342    0.0
12343    0.0
Name: default_yes, Length: 12344, dtype: float64

In [340]:
logmodel = LogisticRegression(class_weight='balanced', max_iter=500, random_state=2025)
logmodel.fit(X_train, y_train)
# logmodel.fit(X_test_scaled, y_test)

In [341]:
predictions = logmodel.predict(X_test)
report = classification_report(y_test, predictions, output_dict=True, digits=2)
report

{'0': {'precision': 0.9839848277315352,
  'recall': 0.8527209642074507,
  'f1-score': 0.9136623783202074,
  'support': 10952.0},
 '1': {'precision': 0.43463021381002453,
  'recall': 0.8908045977011494,
  'f1-score': 0.5842167255594818,
  'support': 1392.0},
 'accuracy': 0.8570155541153597,
 'macro avg': {'precision': 0.7093075207707799,
  'recall': 0.8717627809543,
  'f1-score': 0.7489395519398445,
  'support': 12344.0},
 'weighted avg': {'precision': 0.9220355712037692,
  'recall': 0.8570155541153597,
  'f1-score': 0.8765116695837419,
  'support': 12344.0}}

In [263]:
from sklearn.metrics import f1_score

macro_f1 = f1_score(y_test, predictions, average='macro')
macro_f1

0.7438894337101591

In [264]:
round(report['macro avg']['f1-score'], 2)

0.74