In [220]:
import pandas as pd
import numpy as np
import warnings # DO NOT modify this line
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.exceptions import ConvergenceWarning # DO NOT modify this line
warnings.filterwarnings("ignore", category=ConvergenceWarning) # DO NOT modify this line


class BankLogistic:
    def __init__(self, data_path): # DO NOT modify this line
        self.data_path = data_path
        self.df = pd.read_csv(data_path, sep=',')
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

    def Q1(self): # DO NOT modify this line
        """
        Problem 1:
            Load ‘bank-st.csv’ data from the “Attachment”
            How many rows of data are there in total?

        """
        # TODO: Paste your code here
        return self.df.shape[0]
    

    def Q2(self): # DO NOT modify this line
        """
        Problem 2:
            return the tuple of numeric variables and categorical variables are presented in the dataset.
        """
        # TODO: Paste your code here
        num_cols = self.df.select_dtypes(include=['number'])
        obj_cols = self.df.select_dtypes(include=['object'])

        return (num_cols.shape[1], obj_cols.shape[1])
    
    
    def Q3(self): # DO NOT modify this line
        """
        Problem 3:
            return the tuple of the Class 0 (no) followed by Class 1 (yes) in 3 digits.
        """
        # TODO: Paste your code here
        no, yes = self.df['y'].value_counts()
        return (round(no / self.df.shape[0], 3), round(yes / self.df.shape[0], 3))
    

    def Q4(self): # DO NOT modify this line
        """
        Problem 4:
            Remove duplicate records from the data. What are the shape of the dataset afterward?
        """
        # TODO: Paste your code here
        self.df.drop_duplicates(inplace=True)
        
        return self.df.shape  
        

    def Q5(self): # DO NOT modify this line
        """
        Problem 5:
            5. Replace unknown value with null
            6. Remove features with more than 99% flat values. 
                Hint: There is only one feature should be drop
            7. Split Data
            -	Split the dataset into training and testing sets with a 70:30 ratio.
            -	random_state=0
            -	stratify option
            return the tuple of shapes of X_train and X_test.

        """
        # TODO: Paste your code here
        self.df.drop_duplicates(inplace=True)
        
        flat_cols = self.df.apply(lambda col: col.value_counts(normalize=True).max() >= 0.90)
        self.df.drop(columns=flat_cols[flat_cols].index, inplace=True)

        y = self.df.pop('y')
        X = self.df

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)
        self.X_train.reset_index(drop=True, inplace=True)
        self.X_test.reset_index(drop=True, inplace=True)
        self.y_train.reset_index(drop=True, inplace=True)
        self.y_test.reset_index(drop=True, inplace=True)

        return self.X_train.shape, self.X_test.shape


    def onehot_cols(self, X: pd.DataFrame, nominal_cols: pd.Series):
        enc = OneHotEncoder(handle_unknown='ignore')
        enc_df = pd.DataFrame(enc.fit_transform(X[nominal_cols]).toarray())

        unique_vals = enc.categories_
        new_col_names = []
        cou = 0
        for i, vals in enumerate(unique_vals):
            cou += len(vals)
            for j, val in enumerate(vals):
                new_col_names.append(f"{nominal_cols[i]}_{val}")

        enc_df.columns = new_col_names
        X = pd.concat([X, enc_df], axis=1)
        X.drop(columns=nominal_cols, inplace=True)

        return X


    def Q6(self): 
        """
        Problem 6: 
            8. Impute missing
                -	For numeric variables: Impute missing values using the mean.
                -	For categorical variables: Impute missing values using the mode.
                Hint: Use statistics calculated from the training dataset to avoid data leakage.
            9. Categorical Encoder:
                Map the nominal data for the education variable using the following order:
                education_order = {
                    'illiterate': 1,
                    'basic.4y': 2,
                    'basic.6y': 3,
                    'basic.9y': 4,
                    'high.school': 5,
                    'professional.course': 6,
                    'university.degree': 7} 
                Hint: Use One hot encoder or pd.dummy to encode nominal category
            return the shape of X_train.
        """
        # TODO: Paste your code here
        self.Q5()

        # impute
        num_impute = SimpleImputer(missing_values=np.nan, strategy='mean')
        num_cols = self.df.select_dtypes(include=['number']).columns
        self.X_train[num_cols] = pd.DataFrame(num_impute.fit_transform(self.X_train[num_cols]))
        self.X_test[num_cols] = pd.DataFrame(num_impute.transform(self.X_test[num_cols]))

        cat_impute = SimpleImputer(missing_values="unknown", strategy='most_frequent')
        cat_cols = self.df.select_dtypes(include=['object']).columns
        self.X_train[cat_cols] = pd.DataFrame(cat_impute.fit_transform(self.X_train[cat_cols]))
        self.X_test[cat_cols] = pd.DataFrame(cat_impute.transform(self.X_test[cat_cols]))

        poutcome_imput = SimpleImputer(missing_values="nonexistent", strategy='most_frequent') 
        self.X_train["poutcome"] = pd.DataFrame(poutcome_imput.fit_transform(self.X_train[["poutcome"]]))
        self.X_test["poutcome"] = pd.DataFrame(poutcome_imput.transform(self.X_test[["poutcome"]]))
        
        # education col nominal -> numeric
        education_order = {
            'illiterate': 1,
            'basic.4y': 2,
            'basic.6y': 3,
            'basic.9y': 4,
            'high.school': 5,
            'professional.course': 6,
            'university.degree': 7
        }

        self.num_cols = num_cols

        self.X_train['education'] = self.X_train['education'].map(education_order)
        self.X_test['education'] = self.X_test['education'].map(education_order)

        nominal_cols = pd.Series([c for c in cat_cols if c != "education"])
        self.X_train = self.onehot_cols(self.X_train, nominal_cols)
        self.X_test = self.onehot_cols(self.X_test, nominal_cols)

        self.y_train = self.y_train.map({"yes": 1, "no": 0})
        self.y_test = self.y_test.map({"yes": 1, "no": 0})
        
        return self.X_train.shape
    

    def Q7(self):
        ''' Problem7: Use Logistic Regression as the model with 
            random_state=2025, 
            class_weight='balanced' and 
            max_iter=500. 
            Train the model using all the remaining available variables. 
            What is the macro F1 score of the model on the test data? in 2 digits
        '''
        # TODO: Paste your code here
        self.Q6()

        # numeric_cols = self.num_cols.append(pd.Index(["education"]))
        # scaler = StandardScaler()
        # self.X_train[numeric_cols] = scaler.fit_transform(self.X_train[numeric_cols])
        # self.X_test[numeric_cols] = scaler.transform(self.X_test[numeric_cols])

        logmodel = LogisticRegression(class_weight='balanced', max_iter=500, random_state=2025)
        logmodel.fit(self.X_train, self.y_train)

        predictions = logmodel.predict(self.X_test)
        report = classification_report(self.y_test, predictions, output_dict=True, digits=2)

        return round(report['macro avg']['f1-score'], 2)  
        

In [221]:
hw = BankLogistic('bank-st.csv')

# print(hw.Q1())
# print(hw.Q2())
# print(hw.Q3())
# print(hw.Q4())
# print(hw.Q5())
# print(hw.Q6())
print(hw.Q7())

0.74


In [197]:
df = pd.read_csv('bank-st.csv', sep=',')
# num_cols = df.select_dtypes(include=['number'])
# obj_cols = df.select_dtypes(include=['object'])
# df.head()

# no_count = df[df['y'] == 'no'].shape[1]
# no, yes = df['y'].value_counts()
# (round(no/df.shape[0], 3), round(yes/df.shape[0], 3))
df.drop_duplicates(inplace=True)

df.shape

(41146, 21)

In [198]:
_flat = df.apply(lambda col: col.value_counts(normalize=True).max())
_flat

age               0.047319
job               0.253147
marital           0.605162
education         0.295557
default           0.791280
housing           0.523891
loan              0.824260
contact           0.635177
month             0.333860
day_of_week       0.209449
duration          0.004132
campaign          0.427867
pdays             0.963180
previous          0.863292
poutcome          0.863292
emp.var.rate      0.394400
cons.price.idx    0.187916
cons.conf.idx     0.187916
euribor3m         0.068974
nr.employed       0.394400
y                 0.887255
dtype: float64

In [199]:
flat_cols = df.apply(lambda col: col.value_counts(normalize=True).max() >= 0.90)
df.drop(columns=flat_cols[flat_cols].index, inplace=True)
df.shape
# flat_cols[flat_cols].index

(41146, 20)

In [200]:
y = df.pop('y')
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)


print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(28802, 19) (12344, 19) (28802,) (12344,)


In [201]:
y_train.value_counts()

y
no     25555
yes     3247
Name: count, dtype: int64

In [202]:
X_train

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,29,technician,single,professional.course,no,yes,no,cellular,jul,tue,62,3,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1
1,53,admin.,divorced,basic.9y,no,no,yes,cellular,may,fri,164,1,0,nonexistent,-1.8,92.893,-46.2,1.250,5099.1
2,46,technician,divorced,professional.course,no,no,no,cellular,aug,wed,308,2,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1
3,36,admin.,single,high.school,no,yes,no,cellular,may,thu,235,1,0,nonexistent,-1.8,92.893,-46.2,1.266,5099.1
4,28,services,single,high.school,no,no,no,telephone,may,thu,32,2,0,nonexistent,-1.8,92.893,-46.2,1.270,5099.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28797,41,blue-collar,divorced,basic.4y,unknown,yes,no,telephone,may,mon,1575,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
28798,44,services,married,high.school,no,no,no,cellular,nov,wed,37,1,1,failure,-0.1,93.200,-42.0,4.120,5195.8
28799,33,housemaid,married,basic.9y,no,yes,no,cellular,aug,fri,147,1,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1
28800,45,admin.,married,university.degree,no,no,no,cellular,sep,mon,587,1,0,nonexistent,-3.4,92.379,-29.8,0.797,5017.5


In [203]:
X_train['marital'].value_counts()

marital
married     17429
single       8066
divorced     3249
unknown        58
Name: count, dtype: int64

In [204]:
num_impute = SimpleImputer(missing_values=np.nan, strategy='mean')
num_cols = df.select_dtypes(include=['number']).columns
X_train[num_cols] = pd.DataFrame(num_impute.fit_transform(X_train[num_cols]))
X_test[num_cols] = pd.DataFrame(num_impute.transform(X_test[num_cols]))

cat_impute = SimpleImputer(missing_values="unknown", strategy='most_frequent')
cat_cols = df.select_dtypes(include=['object']).columns
X_train[cat_cols] = pd.DataFrame(cat_impute.fit_transform(X_train[cat_cols]))
X_test[cat_cols] = pd.DataFrame(cat_impute.transform(X_test[cat_cols]))

poutcome_imput = SimpleImputer(missing_values="nonexistent", strategy='most_frequent') 
X_train["poutcome"] = pd.DataFrame(poutcome_imput.fit_transform(X_train[["poutcome"]]))
X_test["poutcome"] = pd.DataFrame(poutcome_imput.transform(X_test[["poutcome"]]))

In [205]:
X_train['marital'].value_counts()

marital
married     17487
single       8066
divorced     3249
Name: count, dtype: int64

In [206]:
education_order = {
    'illiterate': 1,
    'basic.4y': 2,
    'basic.6y': 3,
    'basic.9y': 4,
    'high.school': 5,
    'professional.course': 6,
    'university.degree': 7
}

X_train['education'] = X_train['education'].map(education_order)
X_test['education'] = X_test['education'].map(education_order)
print(X_train.shape, X_test.shape)

(28802, 19) (12344, 19)


In [207]:
# test = X_train.copy()

# cat_cols.drop(columns=["education"], inplace=True)
# dummy = pd.get_dummies(test[cat_cols.columns], drop_first=True)
# test = pd.concat([test, dummy], axis=1)
# test.drop(columns=cat_cols, inplace=True)

# test.shape

In [208]:
def onehot_cols(X: pd.DataFrame, nominal_cols: pd.Series):
    enc = OneHotEncoder(handle_unknown='ignore')
    enc_df = pd.DataFrame(enc.fit_transform(X[nominal_cols]).toarray())

    unique_vals = enc.categories_
    new_col_names = []
    cou = 0
    # print(f"total: {len(np.concatenate(unique_vals))}")
    for i, vals in enumerate(unique_vals):
        # print(f"{nominal_cols[i]} ({len(vals)}): ", end="")
        cou += len(vals)
        for j, val in enumerate(vals):
            # print(f"{val}, ", end="")
            new_col_names.append(f"{nominal_cols[i]}_{val}")
        # print()

    enc_df.columns = new_col_names
    X = pd.concat([X, enc_df], axis=1)
    X.drop(columns=nominal_cols, inplace=True)
    
    return X

In [209]:
cat_cols

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [210]:
nominal_cols = pd.Series([c for c in cat_cols if c != "education"])
X_train = onehot_cols(X_train, nominal_cols)
X_test = onehot_cols(X_test, nominal_cols)

In [211]:
y_train = y_train.map({"yes": 1, "no": 0})
y_test = y_test.map({"yes": 1, "no": 0})

In [212]:
y_train

0        0
1        0
2        0
3        0
4        0
        ..
28797    1
28798    0
28799    0
28800    1
28801    1
Name: y, Length: 28802, dtype: int64

In [213]:
num_cols

Index(['age', 'duration', 'campaign', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [214]:
from sklearn.preprocessing import MinMaxScaler

numeric_cols = num_cols.append(pd.Index(["education"]))
# numeric_cols = num_cols

# scaler = StandardScaler()
scaler = MinMaxScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [215]:
X_train

Unnamed: 0,age,education,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_success
0,0.148148,0.833333,0.012607,0.047619,0.000000,1.000000,0.669135,0.338912,0.980957,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.444444,0.500000,0.033347,0.000000,0.000000,0.333333,0.269680,0.192469,0.139651,0.512287,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.358025,0.833333,0.062627,0.023810,0.000000,1.000000,0.484412,0.615063,0.981637,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.234568,0.666667,0.047784,0.000000,0.000000,0.333333,0.269680,0.192469,0.143278,0.512287,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.135802,0.666667,0.006507,0.023810,0.000000,0.333333,0.269680,0.192469,0.144185,0.512287,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28797,0.296296,0.166667,0.320252,0.000000,0.000000,0.937500,0.698753,0.602510,0.957379,0.859735,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
28798,0.333333,0.666667,0.007523,0.000000,0.142857,0.687500,0.389322,0.368201,0.790297,0.877883,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
28799,0.197531,0.500000,0.029890,0.000000,0.000000,1.000000,0.484412,0.615063,0.982090,1.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
28800,0.345679,1.000000,0.119357,0.000000,0.000000,0.000000,0.069369,0.878661,0.036953,0.203781,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [216]:
y_test.value_counts()

y
0    10952
1     1392
Name: count, dtype: int64

In [217]:
X_train

Unnamed: 0,age,education,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_success
0,0.148148,0.833333,0.012607,0.047619,0.000000,1.000000,0.669135,0.338912,0.980957,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.444444,0.500000,0.033347,0.000000,0.000000,0.333333,0.269680,0.192469,0.139651,0.512287,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.358025,0.833333,0.062627,0.023810,0.000000,1.000000,0.484412,0.615063,0.981637,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.234568,0.666667,0.047784,0.000000,0.000000,0.333333,0.269680,0.192469,0.143278,0.512287,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.135802,0.666667,0.006507,0.023810,0.000000,0.333333,0.269680,0.192469,0.144185,0.512287,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28797,0.296296,0.166667,0.320252,0.000000,0.000000,0.937500,0.698753,0.602510,0.957379,0.859735,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
28798,0.333333,0.666667,0.007523,0.000000,0.142857,0.687500,0.389322,0.368201,0.790297,0.877883,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
28799,0.197531,0.500000,0.029890,0.000000,0.000000,1.000000,0.484412,0.615063,0.982090,1.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
28800,0.345679,1.000000,0.119357,0.000000,0.000000,0.000000,0.069369,0.878661,0.036953,0.203781,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [218]:
logmodel = LogisticRegression(class_weight='balanced', max_iter=500, random_state=2025)
logmodel.fit(X_train, y_train)
# logmodel.fit(X_test_scaled, y_test)

In [219]:
predictions = logmodel.predict(X_test)
report = classification_report(y_test, predictions, output_dict=True, digits=2)
report

{'0': {'precision': 0.9803757828810021,
  'recall': 0.8575602629656683,
  'f1-score': 0.9148646015975064,
  'support': 10952.0},
 '1': {'precision': 0.4356005788712012,
  'recall': 0.8649425287356322,
  'f1-score': 0.5794032723772858,
  'support': 1392.0},
 'accuracy': 0.8583927414128322,
 'macro avg': {'precision': 0.7079881808761017,
  'recall': 0.8612513958506502,
  'f1-score': 0.7471339369873962,
  'support': 12344.0},
 'weighted avg': {'precision': 0.91894293421107,
  'recall': 0.8583927414128322,
  'f1-score': 0.8770355210503136,
  'support': 12344.0}}

In [23]:
from sklearn.metrics import f1_score

macro_f1 = f1_score(y_test, predictions, average='macro')
macro_f1

0.523783562322744

In [264]:
round(report['macro avg']['f1-score'], 2)

0.52