In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
file_path = 'C:/Users/afodor/Downloads/DataSet_Hitelbiralat_joados.csv'
df = pd.read_csv(file_path)
y = df['target']

dummy_cols_list = ['sex', 'marital_status', 'residence_type']
dummy_names = []

for col in dummy_cols_list:
    dummy_df = pd.get_dummies(df.loc[:, col], prefix=col, drop_first=True)
    dummy_names += dummy_df.columns.tolist()
    df = pd.concat([df, dummy_df], axis=1)

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in ['target', 'id_client', 'id_shop', 'education']]

X_cols = numeric_cols + dummy_names

X = df[X_cols]
X


Unnamed: 0,age,quant_dependants,area_code_residencial_phone,payment_day,shop_rank,months_in_residence,months_in_the_job,profession_code,mate_income,quant_banking_accounts,personal_net_income,cod_application_booth,quant_additional_cards_in_the_application,sex_M,marital_status_D,marital_status_O,marital_status_S,marital_status_V,residence_type_C,residence_type_O,residence_type_P
0,44,0,31,12,0,12,48,731,0.0,0,300.0,0,0,False,False,True,False,False,False,False,True
1,18,0,31,20,0,216,12,853,0.0,0,300.0,0,0,False,False,False,True,False,False,False,True
2,22,0,31,8,0,48,12,40,0.0,0,229.0,0,0,False,False,False,False,False,False,False,True
3,47,0,31,25,0,180,24,35,0.0,0,304.0,0,0,False,False,False,False,False,False,False,True
4,28,0,31,25,0,12,12,24,0.0,0,250.0,0,0,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,21,0,31,12,0,120,12,218,0.0,0,234.0,0,0,True,False,False,True,False,False,False,True
49996,40,0,31,1,0,120,72,717,0.0,0,1500.0,0,0,False,False,False,False,False,False,False,True
49997,23,0,31,28,0,264,12,991,0.0,0,240.0,0,0,False,False,False,True,False,False,False,True
49998,38,0,5,28,0,48,204,40,0.0,0,616.0,0,0,False,False,False,False,False,False,False,True


In [2]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
model = LogisticRegression()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [3]:
auc_scores = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    auc_scores.append(auc)


print("AUC értékek:", auc_scores)

mean_auc = np.mean(auc_scores)

max_deviation = max(auc_scores, key=lambda x: abs(x - mean_auc))
deviation_percentage = (mean_auc - max_deviation) / mean_auc * 100

print(f"AUC átlagos értéke: {mean_auc:.4f}")
print(f"A legnagyobb eltérő: {max_deviation:.4f}, ami {deviation_percentage:.4f}% eltérést jelent.")

#Az átlagos AUC érték: 0.6541
#A legjobban eltérő AUC érték: 0.6348
#Maximum 2.9585%-ban tértünk el a 0.6541-es AUC értéktől, nevezetesen lefele, azaz a legnagyobb eltérés a 0.6348 értéknél volt megfigyelhető

AUC értékek: [0.6614207941349282, 0.6592555691673516, 0.6483575948728055, 0.6394899081630043, 0.662250177164279, 0.6568050988373144, 0.6631136084661552, 0.6645873315680367, 0.6512720775366319, 0.6347805900296182]
AUC átlagos értéke: 0.6541
A legnagyobb eltérő: 0.6348, ami 2.9585% eltérést jelent.
