In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

df = pd.read_csv('bank-full.csv', delimiter=';')

features = ['age', 'job', 'marital', 'education', 'balance', 'housing',
           'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
           'previous', 'poutcome', 'y']
df = df[features]

print(f"Размер датасета: {df.shape}")
df.head()

Размер датасета: (45211, 15)


Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [28]:
print("Пропущенные значения:")
print(df.isnull().sum())

education_counts = df['education'].value_counts()
print("Распределение значений в столбце education:")
print(education_counts)

print(f"\nСамое частое значение: {education_counts.index[0]}")

Пропущенные значения:
age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
Распределение значений в столбце education:
education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

Самое частое значение: secondary


In [29]:
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

correlation_matrix = df[numeric_features].corr()

print("Корреляционная матрица:")
print(correlation_matrix.round(3))

corr_pairs = correlation_matrix.unstack()
corr_pairs = corr_pairs[corr_pairs.index.get_level_values(0) != corr_pairs.index.get_level_values(1)]

corr_pairs_sorted = corr_pairs.abs().sort_values(ascending=False)

print("Пары признаков с наибольшей корреляцией:")
for i in range(5):
    pair = corr_pairs_sorted.index[i]
    value = corr_pairs.loc[pair]
    print(f"{pair}: {value:.3f}")

max_corr_pair = corr_pairs_sorted.index[0]
max_corr_value = corr_pairs.loc[max_corr_pair]
print(f"\nНаибольшая корреляция: {max_corr_pair} = {max_corr_value:.3f}")


Корреляционная матрица:
            age  balance    day  duration  campaign  pdays  previous
age       1.000    0.098 -0.009    -0.005     0.005 -0.024     0.001
balance   0.098    1.000  0.005     0.022    -0.015  0.003     0.017
day      -0.009    0.005  1.000    -0.030     0.162 -0.093    -0.052
duration -0.005    0.022 -0.030     1.000    -0.085 -0.002     0.001
campaign  0.005   -0.015  0.162    -0.085     1.000 -0.089    -0.033
pdays    -0.024    0.003 -0.093    -0.002    -0.089  1.000     0.455
previous  0.001    0.017 -0.052     0.001    -0.033  0.455     1.000
Пары признаков с наибольшей корреляцией:
('previous', 'pdays'): 0.455
('pdays', 'previous'): 0.455
('campaign', 'day'): 0.162
('day', 'campaign'): 0.162
('age', 'balance'): 0.098

Наибольшая корреляция: ('previous', 'pdays') = 0.455


In [30]:
df['y_encoded'] = df['y'].map({'yes': 1, 'no': 0})

X = df.drop(['y', 'y_encoded'], axis=1)
y = df['y_encoded']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"Размер тренировочного набора: {X_train.shape}")
print(f"Размер валидационного набора: {X_val.shape}")
print(f"Размер тестового набора: {X_test.shape}")

categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

X_train_encoded = pd.get_dummies(X_train[categorical_features], drop_first=True)

print(f"Размерность после one-hot кодирования: {X_train_encoded.shape}")

mi_scores = mutual_info_classif(X_train_encoded, y_train, random_state=42)

mi_df = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'mi_score': mi_scores
})
mi_df = mi_df.sort_values('mi_score', ascending=False)

print("Взаимная информация для one-hot закодированных признаков:")
print(mi_df.head(10))

mi_by_original_feature = {}

for cat_feature in categorical_features:
    cat_cols = [col for col in X_train_encoded.columns if col.startswith(cat_feature + '_')]
    if cat_cols:
        cat_mi = mi_df[mi_df['feature'].isin(cat_cols)]['mi_score'].sum()
    else:
        cat_cols = [col for col in X_train_encoded.columns if col == cat_feature]
        if cat_cols:
            cat_mi = mi_df[mi_df['feature'].isin(cat_cols)]['mi_score'].sum()
        else:
            cat_mi = 0

    mi_by_original_feature[cat_feature] = cat_mi

mi_original_df = pd.DataFrame({
    'feature': list(mi_by_original_feature.keys()),
    'mi_score': list(mi_by_original_feature.values())
}).sort_values('mi_score', ascending=False)

print("Взаимная информация по исходным категориальным признакам:")
for feature, score in mi_by_original_feature.items():
    print(f"{feature}: {score:.2f}")


max_mi_feature = mi_original_df.iloc[0]['feature']
print(f"\nПризнак с наибольшей взаимной информацией: {max_mi_feature}")

def prepare_data(X):
    X_encoded = pd.get_dummies(X, drop_first=True)
    return X_encoded

X_train_encoded = prepare_data(X_train)
X_val_encoded = prepare_data(X_val)
X_test_encoded = prepare_data(X_test)

all_columns = X_train_encoded.columns
X_val_encoded = X_val_encoded.reindex(columns=all_columns, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)

X shape: (45211, 14)
y shape: (45211,)
Размер тренировочного набора: (27126, 14)
Размер валидационного набора: (9042, 14)
Размер тестового набора: (9043, 14)
Размерность после one-hot кодирования: (27126, 33)
Взаимная информация для one-hot закодированных признаков:
                feature  mi_score
31     poutcome_success  0.027390
16          housing_yes  0.015617
32     poutcome_unknown  0.015568
18      contact_unknown  0.014032
28            month_oct  0.007311
26            month_may  0.006068
11      marital_married  0.005793
14   education_tertiary  0.004337
13  education_secondary  0.003980
25            month_mar  0.003404
Взаимная информация по исходным категориальным признакам:
job: 0.01
marital: 0.01
education: 0.01
housing: 0.02
contact: 0.01
month: 0.03
poutcome: 0.04

Признак с наибольшей взаимной информацией: poutcome


In [31]:
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42
)

model.fit(X_train_encoded, y_train)

y_val_pred = model.predict(X_val_encoded)

accuracy = accuracy_score(y_val, y_val_pred)
print(f"Точность на валидационном наборе: {accuracy:.3f}")

from sklearn.metrics import confusion_matrix, classification_report

print("\nМатрица ошибок:")
print(confusion_matrix(y_val, y_val_pred))

print("\nОтчет по классификации:")
print(classification_report(y_val, y_val_pred))

base_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Исходная точность: {base_accuracy:.4f}")

features_to_test = ['age', 'balance', 'marital', 'previous']
differences = {}

for feature in features_to_test:
    print(f"\nИсключаем признак: {feature}")

    X_train_reduced = X_train.drop(feature, axis=1)
    X_val_reduced = X_val.drop(feature, axis=1)

    X_train_reduced_encoded = prepare_data(X_train_reduced)
    X_val_reduced_encoded = prepare_data(X_val_reduced)

    reduced_columns = X_train_reduced_encoded.columns
    X_val_reduced_encoded = X_val_reduced_encoded.reindex(columns=reduced_columns, fill_value=0)

    model_reduced = LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000,
        random_state=42
    )
    model_reduced.fit(X_train_reduced_encoded, y_train)

    y_val_pred_reduced = model_reduced.predict(X_val_reduced_encoded)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    difference = base_accuracy - accuracy_reduced
    differences[feature] = difference

    print(f"Точность без {feature}: {accuracy_reduced:.4f}")
    print(f"Разница: {difference:.4f}")

results_df = pd.DataFrame({
    'feature': differences.keys(),
    'accuracy_difference': differences.values(),
    'abs_difference': [abs(x) for x in differences.values()]
}).sort_values('abs_difference')

print("\nРезультаты feature elimination:")
print(results_df)



Точность на валидационном наборе: 0.904

Матрица ошибок:
[[7801  183]
 [ 689  369]]

Отчет по классификации:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      7984
           1       0.67      0.35      0.46      1058

    accuracy                           0.90      9042
   macro avg       0.79      0.66      0.70      9042
weighted avg       0.89      0.90      0.89      9042

Исходная точность: 0.9036

Исключаем признак: age
Точность без age: 0.9027
Разница: 0.0009

Исключаем признак: balance
Точность без balance: 0.9036
Разница: 0.0000

Исключаем признак: marital
Точность без marital: 0.9023
Разница: 0.0012

Исключаем признак: previous
Точность без previous: 0.9026
Разница: 0.0010

Результаты feature elimination:
    feature  accuracy_difference  abs_difference
1   balance             0.000000        0.000000
0       age             0.000885        0.000885
3  previous             0.000995        0.000995
2   marital            

In [32]:

C_values = [0.01, 0.1, 1, 10, 100]
accuracy_results = []

print("Результаты для разных значений C:")
for C in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42
    )
    model.fit(X_train_encoded, y_train)

    y_val_pred = model.predict(X_val_encoded)

    accuracy = accuracy_score(y_val, y_val_pred)
    accuracy_results.append(accuracy)

    print(f"C = {C}: точность = {accuracy:.3f}")

best_idx = np.argmax(accuracy_results)
best_C = C_values[best_idx]
best_accuracy = accuracy_results[best_idx]

print(f"\nЛучшее значение C: {best_C} с точностью {best_accuracy:.3f}")




Результаты для разных значений C:
C = 0.01: точность = 0.899
C = 0.1: точность = 0.903
C = 1: точность = 0.904
C = 10: точность = 0.903
C = 100: точность = 0.902

Лучшее значение C: 1 с точностью 0.904


Ответы на вопросы

1 - secondary

2 - pdays и previous

3 - poutcome

4 - 0.9

5 - balance

6 - 1
