In [21]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2

In [34]:
def load_and_preprocess_data(file_path):
    """Read the credit CSV and rename its German column names to English.

    The column names are printed before and after renaming as a diagnostic
    aid. Columns that are not listed in the mapping keep their original
    name, and mapping keys that are absent from the file are ignored by
    ``DataFrame.rename``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with renamed columns.
    """
    raw_frame = pd.read_csv(file_path)

    # Diagnostic: column names exactly as they appear in the file.
    print("Текущие столбцы:", raw_frame.columns.tolist(), sep="\n")

    # German source name -> English name used by the rest of the notebook.
    german_to_english = dict((
        ("laufkont", "checking_accounts"),
        ("laufzeit", "duration"),
        ("moral", "credit_history"),
        ("verw", "purpose"),
        ("hoehe", "amount"),
        ("sparkont", "savings"),
        ("beszeit", "employment_duration"),
        ("rate", "installment_rate"),
        ("famges", "personal_status_sex"),
        ("buerge", "other_debtors"),
        ("wohnzeit", "present_residence"),
        ("verm", "property"),
        ("alter", "age"),
        ("weitkred", "other_installment_plans"),
        ("wohn", "housing"),
        ("bishkred", "number_credits"),
        ("beruf", "job"),
        ("pers", "people_liable"),
        ("telef", "telephone"),
        ("gastarb", "foreign_worker"),
        ("kredit", "risk"),
    ))
    renamed_frame = raw_frame.rename(columns=german_to_english)

    # Diagnostic: column names after the rename.
    print("Столбцы после переименования:", renamed_frame.columns.tolist(), sep="\n")

    return renamed_frame

def encode_features(data):
    """One-hot encode the categorical predictors and label-encode the target.

    Every column except ``risk`` is used as a predictor. The columns listed
    in ``categorical_features`` are one-hot encoded; all remaining predictor
    columns are passed through unchanged (``remainder="passthrough"``).
    The ``risk`` column is encoded with ``LabelEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical columns named below plus ``risk``.

    Returns
    -------
    tuple
        ``(X_encoded, y_encoded)`` where ``X_encoded`` is a dense array of
        encoded predictors and ``y_encoded`` is the label-encoded target.

    Raises
    ------
    KeyError
        If any expected categorical column is missing from ``data``.
    """
    # NOTE(review): "other_installment_plans" is not in this list, so it is
    # passed through as a plain numeric column - confirm that is intended.
    categorical_features = [
        "checking_accounts", "credit_history", "purpose", "savings",
        "employment_duration", "installment_rate", "personal_status_sex",
        "other_debtors", "present_residence", "property", "housing",
        "number_credits", "job", "people_liable", "telephone", "foreign_worker"
    ]

    # Fail early with a readable message instead of an error from inside sklearn.
    missing_features = [feature for feature in categorical_features if feature not in data.columns]
    if missing_features:
        raise KeyError(f"Missing categorical features: {missing_features}")

    ct = ColumnTransformer(
        transformers=[("encoder", OneHotEncoder(), categorical_features)],
        remainder="passthrough",
    )
    X_encoded = ct.fit_transform(data.drop(["risk"], axis=1))

    # ColumnTransformer returns a sparse matrix only when the overall density
    # is below its sparse_threshold; otherwise the result is already a dense
    # ndarray, which has no .toarray(). Convert only when conversion is needed.
    if hasattr(X_encoded, "toarray"):
        X_encoded = X_encoded.toarray()

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(data["risk"])

    return X_encoded, y_encoded

def split_data(X, y, test_size=0.2, random_state=42):
    """Split predictors and target into train and test partitions.

    Thin wrapper around ``train_test_split`` that forwards ``test_size``
    and ``random_state`` unchanged.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    # Return a tuple so callers receive the same container type as before.
    return tuple(partitions)

def train_and_evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and report its test performance.

    Prints the value of ``model.score`` on the test set, then displays the
    classification report as a DataFrame with one row per report entry.
    The model is fitted in place; nothing is returned.

    Parameters
    ----------
    X_train, y_train
        Training predictors and target passed to ``model.fit``.
    X_test, y_test
        Held-out predictors and target used for scoring and the report.
    model
        Estimator exposing ``fit``, ``score`` and ``predict``.
    """
    model.fit(X_train, y_train)

    # Score reported by the estimator itself on the held-out data.
    accuracy = model.score(X_test, y_test)
    print(f"Accuracy: {accuracy:.4f}")

    # Per-class metrics; transposed so each class/average is a row.
    predictions = model.predict(X_test)
    report_dict = classification_report(y_test, predictions, output_dict=True)
    display(pd.DataFrame(report_dict).T)

def feature_selection(X_train, X_test, y_train, k=5):
    """Keep the ``k`` highest-scoring features according to ``chi2``.

    The selector is fitted on the training data only and then applied to
    both the training and the test predictors.

    NOTE(review): scikit-learn documents chi2 as requiring non-negative
    feature values - confirm that holds for the inputs passed here.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    k_best = SelectKBest(chi2, k=k)
    k_best.fit(X_train, y_train)

    reduced_train = k_best.transform(X_train)
    reduced_test = k_best.transform(X_test)
    return reduced_train, reduced_test

# Location of the local CSV file holding the credit data.
file_path = "german_credit_data.csv"

# Read the CSV and rename its columns.
data = load_and_preprocess_data(file_path)

# encode_features raises KeyError when expected categorical columns are
# absent; in that case the error is printed and the modelling steps are skipped.
try:
    X_encoded, y_encoded = encode_features(data)
except KeyError as e:
    print(f"Error: {e}")
else:
    # Hold out part of the data for evaluation (split_data defaults apply).
    X_train, X_test, y_train, y_test = split_data(X=X_encoded, y=y_encoded)

    # A single estimator instance is used for both runs; it is refit each time.
    gnb = GaussianNB()

    # Baseline: every encoded feature.
    print("Обучение и оценка модели без отбора признаков:")
    train_and_evaluate_model(X_train, X_test, y_train, y_test, model=gnb)

    # Reduce the feature set (feature_selection default k applies).
    X_train_selected, X_test_selected = feature_selection(
        X_train=X_train, X_test=X_test, y_train=y_train
    )

    # Same model, refit on the selected features only.
    print("Обучение и оценка модели с отбором признаков:")
    train_and_evaluate_model(X_train_selected, X_test_selected, y_train, y_test, model=gnb)

Текущие столбцы:
['laufkont', 'laufzeit', 'moral', 'verw', 'hoehe', 'sparkont', 'beszeit', 'rate', 'famges', 'buerge', 'wohnzeit', 'verm', 'alter', 'weitkred', 'wohn', 'bishkred', 'beruf', 'pers', 'telef', 'gastarb', 'kredit']
Столбцы после переименования:
['checking_accounts', 'duration', 'credit_history', 'purpose', 'amount', 'savings', 'employment_duration', 'installment_rate', 'personal_status_sex', 'other_debtors', 'present_residence', 'property', 'age', 'other_installment_plans', 'housing', 'number_credits', 'job', 'people_liable', 'telephone', 'foreign_worker', 'risk']
Обучение и оценка модели без отбора признаков:
Accuracy: 0.7000


Unnamed: 0,precision,recall,f1-score,support
0,0.513514,0.612903,0.558824,62.0
1,0.809524,0.73913,0.772727,138.0
accuracy,0.7,0.7,0.7,0.7
macro avg,0.661519,0.676017,0.665775,200.0
weighted avg,0.717761,0.7,0.706417,200.0


Обучение и оценка модели с отбором признаков:
Accuracy: 0.7400


Unnamed: 0,precision,recall,f1-score,support
0,0.589286,0.532258,0.559322,62.0
1,0.798611,0.833333,0.815603,138.0
accuracy,0.74,0.74,0.74,0.74
macro avg,0.693948,0.682796,0.687462,200.0
weighted avg,0.73372,0.74,0.736156,200.0
