# Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

# Загрузка данных

In [2]:
# Load the semicolon-separated dataset file into a DataFrame.
df = pd.read_csv('bank-full.csv', sep=';')


# Выбор нужных столбцов

In [3]:
# Columns kept for the analysis ('y' is later used as the target).
columns = ['age', 'job', 'marital', 'education', 'balance', 'housing',
           'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
           'previous', 'poutcome', 'y']
# Take an explicit copy: the notebook later assigns to df['y'], and writing
# into a column subset of the loaded frame can raise SettingWithCopyWarning
# (invisible here because warnings are globally silenced).
df = df[columns].copy()

# Проверка пропущенных значений

In [4]:
# Count missing values per column and print the summary.
missing_counts = df.isna().sum()
print("Пропущенные значения:")
print(missing_counts)

Пропущенные значения:
age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


# Вопрос 1: Самое частое значение education

In [5]:
print("\nВопрос 1:")
# mode() returns the most frequent value(s) on a fresh 0..k-1 index;
# the first entry is the one reported.
education_modes = df['education'].mode()
education_mode = education_modes.iloc[0]
print(f"Самое частое значение education: {education_mode}")


Вопрос 1:
Самое частое значение education: secondary


# Вопрос 2: Корреляционная матрица

In [6]:
print("\nВопрос 2:")
# Numeric features whose pairwise correlations are computed.
numeric_columns = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]
numeric_frame = df[numeric_columns]
correlation_matrix = numeric_frame.corr()


Вопрос 2:


# Находим пару с наибольшей корреляцией (исключая диагональ)

In [7]:
# Scan the upper triangle of the correlation matrix (diagonal excluded) and
# keep the pair with the largest absolute correlation. The strict comparison
# keeps the first pair found when several share the same strength.
corr_names = list(correlation_matrix.columns)
corr_strength = correlation_matrix.abs().to_numpy()

max_corr = 0
max_pair = ('', '')
for row, first_name in enumerate(corr_names):
    for col in range(row + 1, len(corr_names)):
        strength = corr_strength[row, col]
        if strength > max_corr:
            max_corr, max_pair = strength, (first_name, corr_names[col])

print(f"Наибольшая корреляция между: {max_pair}")

Наибольшая корреляция между: ('pdays', 'previous')


# Кодирование целевой переменной

In [8]:
# Encode the target as 0/1. The identity entries (1 -> 1, 0 -> 0) make the
# cell safe to re-run: with a plain {'yes': 1, 'no': 0} mapping, a second
# execution would turn the already-encoded column into NaN.
target_encoding = {'yes': 1, 'no': 0, 1: 1, 0: 0}
encoded_target = df['y'].map(target_encoding)
# Fail loudly instead of silently passing NaN labels on to the models.
if encoded_target.isna().any():
    raise ValueError("Column 'y' contains values other than 'yes'/'no'")
df['y'] = encoded_target.astype('int64')

# Разделение данных

In [9]:
# Separate the target column from the feature columns.
y = df['y']
X = df.drop(columns='y')

# First hold out 20% of the rows as the test set, then take 25% of the
# remaining 80% (0.25 * 0.8 = 0.2 of all rows) as the validation set.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

print("\nРазмеры наборов данных:")
print(f"Тренировочный: {X_train.shape}")
print(f"Валидационный: {X_val.shape}")
print(f"Тестовый: {X_test.shape}")


Размеры наборов данных:
Тренировочный: (27126, 14)
Валидационный: (9042, 14)
Тестовый: (9043, 14)


# Вопрос 3: Mutual Information

In [10]:
print("\nВопрос 3:")
# Categorical features considered for the mutual-information comparison.
categorical_columns = [
    'job', 'marital', 'education', 'housing',
    'contact', 'month', 'poutcome',
]


Вопрос 3:


# One-hot кодирование для расчета mutual information

In [11]:
# One-hot encode the categorical features of the training split.
X_train_encoded = pd.get_dummies(X_train[categorical_columns])
# The dummy columns are binary indicators, so declare them discrete. With the
# default setting, dense input is treated as continuous and scored with the
# nearest-neighbour estimator, which is not intended for 0/1 features.
mi_scores = mutual_info_classif(
    X_train_encoded, y_train, discrete_features=True, random_state=42
)
# One row per dummy column, with the score rounded to two decimals.
mi_df = pd.DataFrame({'feature': X_train_encoded.columns, 'mi_score': mi_scores})
mi_df['mi_score'] = mi_df['mi_score'].round(2)


# Группируем по исходным категориальным признакам

In [12]:
# Collect the MI scores of the encoded columns under the categorical feature
# they were derived from.
feature_groups = {}
# to_numpy() keeps the scores as NumPy floats, matching what a .values lookup yields.
for feature, score in zip(mi_df['feature'], mi_df['mi_score'].to_numpy()):
    # Resolve the source column by prefix instead of splitting on the first
    # underscore, so a source column whose own name contains '_' is still
    # recognised. The longest match wins if several prefixes fit.
    candidates = [
        col for col in categorical_columns
        if feature == col or feature.startswith(col + '_')
    ]
    if candidates:
        original_feature = max(candidates, key=len)
    else:
        # Unknown column: fall back to the text before the first underscore.
        original_feature = feature.split('_')[0]
    feature_groups.setdefault(original_feature, []).append(score)

# Берем максимальное значение MI для каждого исходного признака

In [13]:
# Reduce every group of scores to its largest value, then pick the feature
# whose largest score is the highest (first one wins on ties).
max_mi_scores = {}
for group_name, group_scores in feature_groups.items():
    max_mi_scores[group_name] = max(group_scores)
max_mi_feature = max(max_mi_scores, key=lambda name: max_mi_scores[name])

print(f"Признак с наибольшей взаимной информацией: {max_mi_feature}")
print("Все MI scores:", max_mi_scores)


Признак с наибольшей взаимной информацией: poutcome
Все MI scores: {'job': np.float64(0.0), 'marital': np.float64(0.01), 'education': np.float64(0.01), 'housing': np.float64(0.01), 'contact': np.float64(0.01), 'month': np.float64(0.01), 'poutcome': np.float64(0.03)}


# Вопрос 4: Логистическая регрессия

In [14]:
print("\nВопрос 4:")

categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# One-hot encode the categorical features of both splits in one call each;
# the resulting columns are named '<column>_<category>'.
train_dummies = pd.get_dummies(X_train[categorical_columns])
val_dummies = pd.get_dummies(X_val[categorical_columns])
# Align validation to the training dummy columns: categories seen only in
# validation are dropped, categories missing from validation become all-zero.
val_dummies = val_dummies.reindex(columns=train_dummies.columns, fill_value=0)

# Numeric features first, followed by the dummy columns.
X_train_prepared = pd.concat([X_train[numerical_columns], train_dummies], axis=1)
X_val_prepared = pd.concat([X_val[numerical_columns], val_dummies], axis=1)

# Fit the baseline classifier on the training split.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_prepared, y_train)

# Score it on the validation split.
y_val_pred = model.predict(X_val_prepared)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Точность на валидационном наборе: {accuracy:.2f}")


Вопрос 4:
Точность на валидационном наборе: 0.90


# Вопрос 5: Feature Elimination

In [15]:
print("\nВопрос 5:")

# Reference accuracy of the model trained on all features.
base_accuracy = accuracy
features_to_test = ['age', 'balance', 'marital', 'previous']
accuracy_differences = {}

for feature in features_to_test:
    # A categorical feature is represented by its '<feature>_...' dummy
    # columns; a numeric feature is a single column of the same name.
    if feature in categorical_columns:
        cols_to_drop = [col for col in X_train_prepared.columns if col.startswith(feature + '_')]
    else:
        cols_to_drop = [feature]
    X_train_reduced = X_train_prepared.drop(columns=cols_to_drop)
    X_val_reduced = X_val_prepared.drop(columns=cols_to_drop)

    # Retrain with the same hyperparameters, minus the feature under test.
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    difference = base_accuracy - accuracy_reduced
    accuracy_differences[feature] = difference
    # Show the difference with 6 decimals: at 3 decimals several features
    # print as 0.000 and the reported winner cannot be checked from the output.
    print(f"Без {feature}: точность = {accuracy_reduced:.3f}, разница = {difference:.6f}")

# Smallest absolute change in accuracy; the first feature wins on exact ties.
min_diff_feature = min(accuracy_differences, key=lambda x: abs(accuracy_differences[x]))
print(f"Признак с наименьшей разницей: {min_diff_feature}")



Вопрос 5:
Без age: точность = 0.901, разница = 0.000
Без balance: точность = 0.901, разница = 0.000
Без marital: точность = 0.901, разница = 0.000
Без previous: точность = 0.901, разница = 0.001
Признак с наименьшей разницей: age


# Вопрос 6: Регуляризация

In [16]:
print("\nВопрос 6:")

# Candidate values of C, listed in ascending order (in scikit-learn a smaller
# C means stronger regularization).
C_values = [0.01, 0.1, 1, 10]
best_accuracy = 0
best_C = None

for C in C_values:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train_prepared, y_train)

    y_val_pred_reg = model_reg.predict(X_val_prepared)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)

    print(f"C = {C}: точность = {accuracy_reg:.3f}")

    # Compare at the same 3-decimal precision that is reported, so the choice
    # never hinges on differences invisible in the output. Because C_values is
    # ascending and the comparison is strict, ties go to the smallest C.
    if round(accuracy_reg, 3) > round(best_accuracy, 3):
        best_accuracy = accuracy_reg
        best_C = C

print(f"Лучшее значение C: {best_C}")



Вопрос 6:
C = 0.01: точность = 0.898
C = 0.1: точность = 0.901
C = 1: точность = 0.901
C = 10: точность = 0.901
Лучшее значение C: 0.1


# Ответы:
### 1. Ответ: secondary
### 2. Ответ: pdays и previous
### 3. Ответ: poutcome
### 4. Ответ: 0.9
### 5. Ответ: age
### 6. Ответ: 0.1