In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Устанавливаем seed для воспроизводимости результатов, как указано в задании

In [25]:
# Single seed reused by every split and model in this notebook so that a
# fresh "Restart & Run All" reproduces the same numbers.
RANDOM_STATE = 1

# Show floats with a thousands separator and four decimal places.
pd.set_option('display.float_format', '{:,.4f}'.format)

# Подготовка набора данных

In [26]:
# Load the semicolon-separated bank marketing data.
df = pd.read_csv('bank-full.csv', sep=';')

# Columns kept for the analysis; 'y' is the target.
selected_cols = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
    'poutcome', 'y'
]
# Take an explicit copy of the column subset: assigning a column into a bare
# slice of the raw frame raises pandas' SettingWithCopyWarning.
df = df[selected_cols].copy()

# Encode the target as 1 for 'yes' and 0 for every other value.
df['y'] = (df['y'] == 'yes').astype(int)

# 60/20/20 split: hold out 20% for test, then 25% of the remaining 80%
# (i.e. 20% of the total) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=RANDOM_STATE)

# Every row must land in exactly one of the three partitions.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

# Positional indices, so the frames line up with the NumPy target arrays below
# (cross-validation later indexes both by position).
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_full_train = df_full_train.reset_index(drop=True)

# Pull the target out of each feature frame; pop() returns the column and
# removes it, so the target cannot leak into the feature matrix.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_full_train = df_full_train.pop('y').values

# Вопрос 1

In [27]:
# All numerical feature columns (reused later when building the feature matrix).
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
# Subset of numerical columns compared in question 1.
numerical_q1 = ['age', 'duration', 'campaign', 'pdays']

# Score each feature on its own by using its raw values as the ranking score.
results_q1 = {}
for col in numerical_q1:
    auc = roc_auc_score(y_train, df_train[col])
    # An AUC below 0.5 means the feature ranks the classes in reverse;
    # 1 - auc is the AUC of the negated feature, so keep the larger of the two.
    auc = max(auc, 1 - auc)
    results_q1[col] = auc

# Feature with the smallest direction-corrected AUC (first one wins on ties).
least_important_feature = min(results_q1.items(), key=lambda item: item[1])[0]


# Подготовка данных для обучения модели (One-Hot Encoding)

In [28]:
# Categorical feature columns; DictVectorizer one-hot encodes their string values.
categorical = [
    'job', 'marital', 'education', 'housing', 'loan',
    'contact', 'month', 'poutcome',
]

# Convert each row to a {column: value} dict, the input format DictVectorizer expects.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Fit the encoder on the training rows only, then reuse the learned columns
# for validation so both matrices share the same feature layout.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Вопрос 2

In [29]:
# Logistic regression on the one-hot encoded training matrix.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=RANDOM_STATE,
)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (y == 1).
val_probabilities = model.predict_proba(X_val)
y_proba_val = val_probabilities[:, 1]

# Hard predictions at the 0.5 threshold, and their accuracy on validation.
y_pred_val_05 = (y_proba_val >= 0.5).astype(int)
accuracy_q2 = accuracy_score(y_val, y_pred_val_05)


# Вопрос 3

In [30]:
# Precision and recall of the 0.5-threshold validation predictions.
P_q3, R_q3 = (
    precision_score(y_val, y_pred_val_05),
    recall_score(y_val, y_pred_val_05),
)
# Share of precision in the precision + recall total.
metric_q3 = P_q3 / (P_q3 + R_q3)


# Вопрос 4

In [31]:
# Candidate decision thresholds: 0.00, 0.01, ..., 1.00.
thresholds = np.linspace(0.0, 1.0, num=101)

# F1 on the validation set at each threshold.
f1_scores = []
for threshold in thresholds:
    y_pred_at_threshold = (y_proba_val >= threshold).astype(int)
    f1_scores.append(f1_score(y_val, y_pred_at_threshold))

# argmax returns the first maximum, so ties resolve to the lowest threshold.
best_f1_position = np.argmax(f1_scores)
optimal_threshold_q4 = thresholds[best_f1_position]


# Вопрос 5

In [32]:
def train_and_evaluate_cv(df_full_train, y_full_train, C_list, n_splits=5):
    """Cross-validate a liblinear logistic regression for each value of C.

    Uses a shuffled KFold seeded with the module-level ``RANDOM_STATE``, so the
    folds are the same for every C. Features are the module-level
    ``categorical + numerical`` columns, one-hot encoded with a DictVectorizer
    that is fitted on the training part of each fold only.

    Parameters
    ----------
    df_full_train : pandas.DataFrame
        Feature frame; rows are selected by position (``iloc``).
    y_full_train : numpy.ndarray
        Target values aligned by position with ``df_full_train``.
    C_list : iterable of float
        Inverse regularisation strengths to evaluate.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    list of dict
        One entry per element of ``C_list``, in the same order, with keys
        ``'C'``, ``'mean_auc'``, ``'std_auc'`` and ``'auc_scores'`` (the
        per-fold validation ROC AUC values, in fold order).
    """
    C_list = list(C_list)
    if not C_list:
        return []

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    feature_cols = categorical + numerical

    # One list of per-fold AUCs per entry of C_list. Indexed by position rather
    # than keyed by value so that repeated C values stay separate.
    auc_scores_per_C = [[] for _ in C_list]

    for train_idx, val_idx in kfold.split(df_full_train):
        # The split and the one-hot encoding do not depend on C, so build them
        # once per fold instead of once per (C, fold) pair.
        train_dicts = df_full_train.iloc[train_idx][feature_cols].to_dict(orient='records')
        val_dicts = df_full_train.iloc[val_idx][feature_cols].to_dict(orient='records')

        dv_fold = DictVectorizer(sparse=False)
        X_train_fold = dv_fold.fit_transform(train_dicts)
        X_val_fold = dv_fold.transform(val_dicts)

        y_train_fold = y_full_train[train_idx]
        y_val_fold = y_full_train[val_idx]

        for auc_scores, C in zip(auc_scores_per_C, C_list):
            model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=RANDOM_STATE)
            model.fit(X_train_fold, y_train_fold)

            # Column 1 holds the probability of the positive class.
            y_proba_val_fold = model.predict_proba(X_val_fold)[:, 1]
            auc_scores.append(roc_auc_score(y_val_fold, y_proba_val_fold))

    return [
        {
            'C': C,
            'mean_auc': np.mean(auc_scores),
            'std_auc': np.std(auc_scores),
            'auc_scores': auc_scores,
        }
        for C, auc_scores in zip(C_list, auc_scores_per_C)
    ]


# Cross-validate the single setting C=1.0; the helper returns one result dict
# per C, so unpack the only element.
(cv_results_q5,) = train_and_evaluate_cv(
    df_full_train,
    y_full_train,
    C_list=[1.0],
)
# Standard deviation of the ROC AUC across folds.
std_auc_q5 = cv_results_q5['std_auc']


# Вопрос 6

In [33]:
# Grid of inverse regularisation strengths to compare.
C_values_q6 = [1e-6, 0.001, 0.01, 0.1, 1, 10, 200]
cv_results_q6 = train_and_evaluate_cv(
    df_full_train,
    y_full_train,
    C_list=C_values_q6,
)

# One row per C, with its mean/std AUC and the per-fold scores.
results_df_q6 = pd.DataFrame(cv_results_q6)

# idxmax returns the first row with the highest mean AUC, so ties resolve to
# the smallest C because the grid is in ascending order.
best_row_label_q6 = results_df_q6['mean_auc'].idxmax()
best_C_row_q6 = results_df_q6.loc[best_row_label_q6]
best_C_q6 = best_C_row_q6['C']


# Финальные результаты

================================================================================
ФИНАЛЬНЫЕ РЕЗУЛЬТАТЫ
================================================================================
Вопрос 1
| Признак   |      AUC |
|:----------|---------:|
| age       | 0.512186 |
| duration  | 0.8147   |
| campaign  | 0.571454 |
| pdays     | 0.590128 |

Наименее важный признак (с наименьшим AUC): age

--------------------------------------------------------------------------------
Вопрос 2
Рассчитанная Accuracy: 0.8978
Наиболее близкий ответ: 0.90

--------------------------------------------------------------------------------
Вопрос 3
Precision (P): 0.6576
Recall (R):    0.3306
P / (P + R):   0.6655
Наиболее близкий ответ: 0.6

--------------------------------------------------------------------------------
Вопрос 4
Максимальный F1-score: 0.5680
Оптимальный порог F1: 0.22

--------------------------------------------------------------------------------
Вопрос 5
Оценки AUC по фолдам: [0.90041211 0.90074742 0.91164211 0.90887743 0.91236001]
Стандартное отклонение STD(AUC): 0.005217547683025359
Наиболее близкий ответ: 0.006

--------------------------------------------------------------------------------
Вопрос 6
|       C |   Средний AUC |   STD(AUC) |
|--------:|--------------:|-----------:|
|   1e-06 |      0.701491 | 0.00944803 |
|   0.001 |      0.862979 | 0.0073505  |
|   0.01  |      0.902913 | 0.00427877 |
|   0.1   |      0.90692  | 0.00531896 |
|   1     |      0.906808 | 0.00521755 |
|  10     |      0.906665 | 0.00533371 |
| 200     |      0.906788 | 0.00541947 |

Наилучший параметр C (по максимальному среднему AUC): 0.1
================================================================================
