In [None]:
pip install imbalanced-learn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib 
from scipy.stats.contingency import chi2_contingency
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, precision_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from keras import models
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential 
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.callbacks import EarlyStopping

from imblearn.over_sampling import SMOTE
from collections import Counter




import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format) 

In [None]:
# Load the labelled (train) and unlabelled (test) files and tag every row with
# its origin, so both can be cleaned together and separated again later.
train_df = pd.read_csv("/kaggle/input/credit-score-classification/train.csv")
train_df["is_train"] = True

test_df = pd.read_csv("/kaggle/input/credit-score-classification/test.csv")
test_df["is_train"] = False

# Both frames come back from read_csv with a default 0..n-1 index. Without
# ignore_index=True the combined frame would carry duplicate row labels, which
# makes label-based operations (.loc, .drop, index alignment) ambiguous.
df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
df0 = df.copy()

In [None]:
df.duplicated().sum() 

In [None]:
df.info() 

# Feature Extraction  

In [None]:
# Discard rows whose Payment_Behaviour is the literal '!@9#%8'.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the previous frame.
df = df[df['Payment_Behaviour'] != '!@9#%8'].copy()

# Split on the first underscore: the prefix becomes Spending_Level, the
# remainder is refined into Payment_Value below.
df[['Spending_Level', 'Payment_Value']] = df['Payment_Behaviour'].str.split('_', n=1, expand=True)

# Drop the last two underscore-separated tokens in one pass, then remove the
# literal 'spent_' prefix (regex=False: plain substring replacement).
# presumably the raw values look like 'High_spent_Small_value_payments' - verify against the data
df['Payment_Value'] = (
    df['Payment_Value']
    .str.rsplit('_', n=2)
    .str[0]
    .str.replace('spent_', '', regex=False)
)

In [None]:
def convert_to_months(age_str):
    """Convert a credit-history age string to a total number of months.

    The expected layout is five whitespace-separated tokens,
    ``"<years> Years and <months> Months"``; the first and fourth tokens are
    read as integers.

    Args:
        age_str: The raw value; may be missing (NaN/None).

    Returns:
        int: ``years * 12 + months``. Missing values and values that do not
        follow the expected layout both yield 0, so 0 is the single
        "unknown" sentinel for this column.
    """
    if pd.isna(age_str):
        return 0

    parts = str(age_str).split()
    try:
        # Token 0 is the year count, token 3 the month count.
        years, months = int(parts[0]), int(parts[3])
    except (IndexError, ValueError):
        # Too few tokens or non-integer tokens: treat like a missing value
        # instead of aborting the whole .apply() pass.
        return 0

    return years * 12 + months

df['Credit_History_Age'] = df['Credit_History_Age'].apply(convert_to_months) 
# Missing ages (NaN) have been converted to 0 months, so 0 now acts as the
# "missing" marker in Credit_History_Age.

In [None]:
drop_columns = ['ID', 'Customer_ID', 'Name', 'SSN', 'Month','Type_of_Loan', 'Payment_Behaviour'] 

In [None]:
df.drop(drop_columns, axis=1, inplace=True) 

# EDA

In [None]:
df.shape

In [None]:
df.info() 

In [None]:
df.head() 

In [None]:
# Columns treated as numeric features.
num_cols = [
    "Age",
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
    "Delay_from_due_date",
    "Num_of_Delayed_Payment",
    "Changed_Credit_Limit",
    "Num_Credit_Inquiries",
    "Outstanding_Debt",
    "Credit_Utilization_Ratio",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Monthly_Balance",
    "Credit_History_Age",
]

# Columns treated as categorical features.
cat_cols = [
    "Occupation",
    "Credit_Mix",
    "Spending_Level",
    "Payment_Value",
    "Payment_of_Min_Amount",
]

# Prediction target.
target_col = ["Credit_Score"]

In [None]:
class DataCleaner:
    """Coerce a list of columns to numeric dtype and impute missing values.

    The cleaner works directly on the DataFrame it is given (columns are
    overwritten in place) and also returns that same DataFrame.

    Args:
        dataframe (pandas.DataFrame): Frame holding the columns to clean.
        numeric_columns (list): Names of the columns that should be numeric.
    """

    # First number found in a cell: optional sign, integer part, optional
    # fraction, optional exponent. Keeping sign and fraction matters: a
    # digits-only pattern would turn "-2.01" into 2 and "809.98" into 809.
    _NUMBER_PATTERN = r'(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)'

    def __init__(self, dataframe, numeric_columns):
        self.dataframe = dataframe
        self.numeric_columns = numeric_columns

    def clean_numeric_columns(self):
        """Make every configured column numeric and fill gaps with its median.

        Non-numeric columns are converted by extracting the first number from
        each cell's text; cells without any number become NaN. All NaN values
        are then replaced by the column median, computed over every row of the
        frame.

        Returns:
            pandas.DataFrame: The same frame that was passed to the
            constructor, with the configured columns cleaned.
        """
        for col in self.numeric_columns:
            values = self.dataframe[col]

            if not pd.api.types.is_numeric_dtype(values):
                # astype(str) lets cells that already hold numbers (mixed
                # object columns) go through the same extraction instead of
                # being turned into NaN by the .str accessor.
                extracted = values.astype(str).str.extract(self._NUMBER_PATTERN, expand=False)
                values = pd.to_numeric(extracted, errors='coerce')

            self.dataframe[col] = values.fillna(values.median())

        return self.dataframe



cleaner = DataCleaner(df, num_cols)
df = cleaner.clean_numeric_columns()


## Numeric Columns Analysis & Cleaning 

In [None]:
def plot_histograms(df, columns, rows=2, cols=3):
    """Draw one histogram (with KDE curve) per column on a rows x cols grid.

    Args:
        df (pandas.DataFrame): Source data.
        columns (list): Column names to plot, in grid order.
        rows (int, optional): Number of grid rows. Defaults to 2.
        cols (int, optional): Number of grid columns. Defaults to 3.

    Columns beyond the grid capacity (rows * cols) are not drawn; grid cells
    without a column are removed from the figure.
    """
    columns = list(columns)

    # squeeze=False always yields a 2-D array of Axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 4*rows), squeeze=False)
    axes = axes.flatten()

    # zip stops at the shorter sequence, which caps the plots at the grid size.
    for ax, col in zip(axes, columns):
        sns.histplot(df[col], kde=True, ax=ax, bins=100)
        ax.set_title(col)
        ax.set_xlabel('')

    # Remove unused subplots; slicing by len(columns) is also safe when
    # `columns` is empty (no loop variable is needed).
    for ax in axes[len(columns):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

plot_histograms(df, num_cols, rows=5, cols=4)

In [None]:
df = df[df["Credit_History_Age"] != 0]

In [None]:
corr = df[num_cols].corr() 

In [None]:
# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is annotated only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

heatmap_options = dict(
    annot=True,
    mask=mask,
    fmt=".3f",
    annot_kws={"fontsize": 10},
    cmap="RdBu",  # red/blue colour palette
    vmin=-1,      # minimum of the colour scale
    vmax=1,       # maximum of the colour scale
)

plt.figure(figsize=(20, 10))
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Matrix with RdBu Palette")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

def create_boxplots(dataframe, numeric_columns, target_column, figsize=(18, 14), dpi=300):
    """Draw one box plot per numeric column, grouped by the target column.

    Plots are laid out four per row on a single figure.

    Args:
        dataframe (pandas.DataFrame): Source data; it is copied, not modified.
        numeric_columns (list): Columns plotted on the y axis, one per subplot.
        target_column (str): Column used for the x-axis groups.
        figsize (tuple, optional): Figure size passed to plt.figure.
        dpi (int, optional): Figure resolution passed to plt.figure.
    """
    # Work on a private copy with a fresh 0..n-1 index.
    plot_data = dataframe.copy().reset_index(drop=True)

    # Ceiling division: number of rows needed with four plots per row.
    row_count = (len(numeric_columns) + 3) // 4
    fig = plt.figure(figsize=figsize, dpi=dpi)

    # These rc calls change matplotlib's global settings (tick/text colour and
    # the default figure dpi), not just this figure.
    matplotlib.rc(("xtick", "ytick", "text"), c="k")
    matplotlib.rc("figure", dpi=80)

    for position, column in enumerate(numeric_columns, start=1):
        ax = fig.add_subplot(row_count, 4, position)
        sns.boxplot(data=plot_data, x=target_column, y=column, ax=ax, width=0.8, palette="Set2")
        ax.set_title(column, fontsize=10)
        ax.tick_params(axis='x', rotation=0)

    plt.tight_layout(pad=0.3)
    plt.show()
create_boxplots(df, num_cols, 'Credit_Score')     

## Categorical Columns Analysis & Cleaning 

In [None]:
def analyze_categorical_columns(df, categorical_columns, plot=True):
    """
    Print a short summary of each categorical column and optionally plot it.

    For every column the number of distinct values, the distinct values
    themselves and the frequency of each value are printed. With ``plot``
    enabled, a horizontal count plot annotated with the bar counts follows.

    Args:
        df (pandas.DataFrame): The input DataFrame.
        categorical_columns (list): List of categorical column names.
        plot (bool, optional): Whether to display plots. Defaults to True.
    """
    separator = "##################################################"

    for col in categorical_columns:
        series = df[col]
        frequencies = series.value_counts()

        print(f"\nAnalysis for column: {col}")
        print(separator)
        print(f"Unique values count: {series.nunique()}")
        print(f"Unique values: {series.unique()}")
        print(f"Value counts:\n{frequencies}")

        if plot:
            plt.figure(figsize=(6, 3))
            # Bars are ordered from most to least frequent category.
            ax = sns.countplot(y=col, data=df, order=frequencies.index, palette="coolwarm")

            # Write each bar's count just past its right end, vertically centred.
            for bar in ax.patches:
                label_position = (bar.get_width() + 5, bar.get_y() + bar.get_height() / 2)
                ax.annotate(f'{bar.get_width():.0f}', label_position,
                            ha='center', va='center')

            plt.title(f'{col} - Category Frequencies')
            plt.show()

        print(separator)

analyze_categorical_columns(df, cat_cols, False) 

In [None]:
def remove_underscore_rows(df, column_name):
    """Return `df` without the rows whose `column_name` value contains '_'.

    Missing values count as "no underscore" and are kept. The number of
    removed rows is printed. The input frame itself is not modified.

    Args:
        df (pandas.DataFrame): Frame to filter.
        column_name (str): Name of the string column to inspect.

    Returns:
        pandas.DataFrame: The rows of `df` that passed the filter.
    """
    has_underscore = df[column_name].str.contains('_', na=False)
    kept_rows = df[~has_underscore]

    dropped_count = len(df) - len(kept_rows)
    print(f"{column_name} column removed {dropped_count} rows.")

    return kept_rows

for col in cat_cols:
    df = remove_underscore_rows(df, col)  

analyze_categorical_columns(df, cat_cols, True)

In [None]:
replace = {"NM": "No"}  
df["Payment_of_Min_Amount"] = df["Payment_of_Min_Amount"].replace(replace)

In [None]:
df["Payment_of_Min_Amount"].value_counts() 

# Feature Selection

## Hypothesis Testing with Chi-Square and F-Tests

This Python code defines a `chi_2_test` function that conducts hypothesis testing to assess the relationship between categorical and numerical columns in a DataFrame and the target variable 'Credit_Score.'

- **Function Purpose**: The `chi_2_test` function first prepares a copy of the DataFrame with only training data and selects categorical and numerical columns of interest. It then initializes an empty NumPy array to store test results. The function performs two types of tests: Chi-Square tests for categorical columns and F-tests (ANOVA) for numerical columns to assess their significance in explaining 'Credit_Score' variations.

- **Hypothesis Testing**: The Chi-Square test evaluates whether each categorical variable is independent of 'Credit_Score,' while the F-test (one-way ANOVA) checks whether the mean of each numerical variable differs across the 'Credit_Score' classes. The results include the test statistic and p-value for each column, helping identify significant factors influencing 'Credit_Score.'

This function provides valuable insights into how different features relate to 'Credit_Score,' aiding in feature selection and understanding the dataset's predictive power in credit scoring analysis or modeling.


In [None]:
def chi_2_test(df: pd.DataFrame, cat_cols: list, num_cols: list):
    """Test each feature's association with 'Credit_Score' on the training rows.

    Categorical columns get a chi-square test of independence on their
    contingency table with 'Credit_Score'; numerical columns get the ANOVA
    F-test from ``f_classif``.

    Args:
        df: Frame containing the feature columns plus 'Credit_Score' and a
            boolean 'is_train' column; only rows with ``is_train`` set are used.
        cat_cols: Names of the categorical feature columns.
        num_cols: Names of the numerical feature columns.

    Returns:
        pandas.DataFrame: One row per feature (categorical first, then
        numerical) with columns 'column', 't-statistic' and 'p-value'. The
        't-statistic' column holds the chi-square statistic for categorical
        features and the F statistic for numerical ones, so the two groups are
        not on a common scale.
    """
    # Assuming that 'Credit_Score' and 'is_train' columns are in the DataFrame.
    df_copy = df.loc[df["is_train"]].copy()

    summary = np.empty((len(cat_cols) + len(num_cols), 3), dtype="object")

    # Integer class codes for f_classif, kept 1-D as its API expects.
    y, _ = df_copy["Credit_Score"].factorize(sort=False)

    # Chi-square test for categorical columns.
    for row, col in enumerate(cat_cols):
        cross = pd.crosstab(index=df_copy[col], columns=df_copy["Credit_Score"])
        statistic, pvalue, *_ = chi2_contingency(cross)
        summary[row, :] = [col, statistic, pvalue]

    # F-test for numerical columns; these rows follow the categorical ones.
    for row, col in enumerate(num_cols, start=len(cat_cols)):
        statistic, pvalue = f_classif(df_copy[[col]], y)
        summary[row, :] = [col, statistic[0], pvalue[0]]

    # The object array would give object-dtype columns; cast the statistics to
    # float so sorting, styling and plotting treat them as numbers.
    return pd.DataFrame(
        data=summary,
        columns=["column", 't-statistic', "p-value"]
    ).astype({'t-statistic': float, "p-value": float})

# Assuming cat_cols and num_cols are already defined
chi2_summary = chi_2_test(df, cat_cols, num_cols).sort_values(by="t-statistic", ascending=False)


In [None]:
chi2_summary.style.bar("t-statistic").background_gradient(
    "Blues", subset="t-statistic")

In [None]:
# Horizontal bar chart of the test statistic for every feature.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(data=chi2_summary, y="column", x="t-statistic", ax=ax)
ax.set_title("Test statistic per feature (chi-square / ANOVA F)")
plt.setp([ax.get_xticklabels(), ax.get_yticklabels()], size=8)
# plt.show() matches the other plotting cells; Figure.show() is meant for GUI
# backends rather than the notebook's inline rendering.
plt.show()

These results show which features are significant for the target label (Credit Score) and which are not.

In [None]:
insignificant_columns = [
    'Annual_Income', 'Age', 'Num_Credit_Card', 'Monthly_Balance', 'Num_of_Loan', 'Total_EMI_per_month', 'Interest_Rate'
]

# Dropping insignificiant columns
df = df.drop(columns=insignificant_columns)  

In [None]:
df[df["is_train"] == False]["Credit_Score"].isna().sum()

In [None]:
df[df["is_train"] == True]["Credit_Score"].isna().sum()

In [None]:
df[df["is_train"] == True]["Credit_Score"].value_counts() 

In [None]:
df.to_csv("clean_data.csv", index=False)   

In [None]:
df = pd.read_csv("clean_data.csv") 

# Data Preprocess

In [None]:
# Separate labelled rows from unlabelled ones; the flag column is not a feature.
is_train_row = df['is_train'] == True
train_data = df[is_train_row].drop('is_train', axis=1)
test_data = df[df['is_train'] == False].drop('is_train', axis=1)

# Features / target for the labelled rows, features only for the test rows.
y = train_data['Credit_Score']
X = train_data.drop('Credit_Score', axis=1)
X_test = test_data.drop('Credit_Score', axis=1)

# 80/20 train/validation split that keeps the class proportions of y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [None]:
for index, class_name in enumerate(label_encoder.classes_):
    print(f"Class '{class_name}' is encoded as {index}")

In [None]:
df["Credit_Score"].value_counts() 

In [None]:
num_cols = X.select_dtypes(exclude=['object']).columns.tolist()
one_hot_cols = ["Occupation", "Payment_of_Min_Amount"]
ordinal_cols = ["Credit_Mix", "Spending_Level", "Payment_Value"]

ordinal_categories = [
    ['Bad', 'Standard', 'Good'],  
    ['Low', 'High'],    
    ['Small', 'Medium', 'Large']
]


In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), num_cols),  
        ('one_hot_enc', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),  
        ('ordinal_enc', OrdinalEncoder(categories=ordinal_categories, handle_unknown="use_encoded_value", unknown_value=-1), ordinal_cols)
    ]
)

In [None]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

X_train = pipeline.fit_transform(X_train)
X_val = pipeline.transform(X_val)
X_test = pipeline.transform(X_test)

# SMOTE  

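Since our data is imbalanced and the model is trained and monitored on accuracy only (Keras's built-in `Recall` metric is not directly usable with sparse integer labels for multi-class problems), we will oversample the minority classes with SMOTE.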

In [None]:
from collections import Counter
print("Before:", Counter(y_train_encoded))
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_encoded)
print("After:", Counter(y_train_resampled))

# ANN Model 

In [None]:
from tensorflow.keras.layers import BatchNormalization

In [None]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print the confusion matrix and classification report for two data sets.

    Class predictions are the arg-max over the per-class scores returned by
    ``model.predict``. The report for (X_test, y_test) is printed first,
    followed by the one for (X_train, y_train).

    Args:
        model: Object whose ``predict`` returns one row of class scores per sample.
        X_train: Features of the training set.
        y_train: True class indices of the training set.
        X_test: Features of the evaluation set.
        y_test: True class indices of the evaluation set.
    """
    def predict_labels(features):
        # Highest-scoring class per sample.
        return model.predict(features).argmax(axis=1)

    train_predictions = predict_labels(X_train)
    test_predictions = predict_labels(X_test)

    reports = (
        ("Test Set:", y_test, test_predictions),
        ("\nTrain Set:", y_train, train_predictions),
    )
    for heading, y_true, y_hat in reports:
        print(heading)
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

In [None]:
# Fully connected classifier trained on the SMOTE-resampled training set.
# Each hidden block is Dense(relu) followed by BatchNormalization.
hidden_units = [512, 512, 256, 256, 128, 64, 64, 32]

# Explicit Input layer (as in the second model of this notebook), so the
# network is built as soon as it is defined.
layers = [Input(shape=(X_train_resampled.shape[1],))]
for units in hidden_units:
    layers.append(Dense(units, activation='relu'))
    layers.append(BatchNormalization())

# Three output classes, one probability each.
layers.append(Dense(3, activation='softmax'))

model = Sequential(layers)


# Labels are integer class indices, hence the sparse loss.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


# Stop once validation accuracy has not improved for 35 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(monitor='val_accuracy',
                               patience=35,
                               restore_best_weights=True)


# Validation uses the untouched (not resampled) hold-out split.
# validation_split is intentionally not passed: Keras ignores it whenever
# validation_data is given, so it only suggested a second split that never
# took place.
history = model.fit(x=X_train_resampled,
                    y=y_train_resampled,
                    validation_data=(X_val, y_val_encoded),
                    batch_size=512,
                    epochs=600,
                    verbose=1,
                    callbacks=[early_stopping])

In [None]:
model.summary()

In [None]:
eval_metric(model, X_train_resampled,y_train_resampled, X_val, y_val_encoded)  

## Without Outliers 

In [None]:
num_cols = df.select_dtypes(exclude=['object']).columns.tolist()
num_cols.remove("is_train")

def outlier_bounds(dataframe, column):
    """Return the (lower, upper) IQR fences of `column`.

    The fences lie 1.5 interquartile ranges below the first quartile and
    above the third quartile, computed over all rows of `dataframe`.
    """
    first_quartile = dataframe[column].quantile(0.25)
    third_quartile = dataframe[column].quantile(0.75)
    margin = 1.5 * (third_quartile - first_quartile)
    return first_quartile - margin, third_quartile + margin


def _train_outlier_mask(dataframe, column):
    """Return (train_rows, train_outliers) boolean masks for `column`."""
    lower_bound, upper_bound = outlier_bounds(dataframe, column)
    train_rows = dataframe["is_train"] == True  # only training rows are considered
    values = dataframe[column]
    outside_fences = values.lt(lower_bound) | values.gt(upper_bound)
    return train_rows, train_rows & outside_fences


def outlier_ratio(dataframe, column):
    """Return the percentage of training rows whose `column` value is an outlier.

    Returns 0 when `dataframe` contains no training rows.
    """
    train_rows, train_outliers = _train_outlier_mask(dataframe, column)
    train_count = int(train_rows.sum())
    if train_count > 0:
        return (int(train_outliers.sum()) / train_count) * 100
    return 0


def remove_outliers(dataframe, column):
    """Drop, in place, the training rows whose `column` value is an outlier.

    Rows with ``is_train`` false are never removed. Rows are dropped by index
    label, and nothing is returned.
    """
    _, train_outliers = _train_outlier_mask(dataframe, column)
    outlier_labels = dataframe.index[train_outliers.to_numpy()]
    dataframe.drop(outlier_labels, inplace=True)


# For every numeric column: drop the outlying training rows from df (in
# place), then print the outlier percentage measured on the remaining
# training rows.
# NOTE(review): the ratio is computed after the removal and on the train-only
# subset, so its bounds are recomputed from a different frame than the one
# used for the removal - confirm whether the pre-removal ratio was intended.
for col in num_cols:
    remove_outliers(df, col)
    ratio = outlier_ratio(df[df["is_train"] == True], col)  
    print(f"{col} : %{ratio}")  

In [None]:
df.to_csv("without_outliers.csv", index=False)   

In [None]:
df = pd.read_csv("without_outliers.csv")

In [None]:
df[df['is_train'] == True]['Credit_Score'].value_counts()


In [None]:
# --- Split labelled / unlabelled rows ---------------------------------------
is_train_row = df['is_train'] == True
train_data = df[is_train_row].drop('is_train', axis=1)
test_data = df[df['is_train'] == False].drop('is_train', axis=1)

y = train_data['Credit_Score']
X = train_data.drop('Credit_Score', axis=1)
X_test = test_data.drop('Credit_Score', axis=1)

# 80/20 train/validation split that keeps the class proportions of y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Encode the target -------------------------------------------------------
# The encoder is fitted on the training labels only and reused for validation.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# --- Column groups -----------------------------------------------------------
num_cols = X.select_dtypes(exclude=['object']).columns.tolist()
one_hot_cols = ["Occupation", "Payment_of_Min_Amount"]

# Ordinal columns with their categories listed from lowest to highest.
ordinal_order = {
    "Credit_Mix": ['Bad', 'Standard', 'Good'],
    "Spending_Level": ['Low', 'High'],
    "Payment_Value": ['Small', 'Medium', 'Large'],
}
ordinal_cols = list(ordinal_order)
ordinal_categories = list(ordinal_order.values())

# --- Preprocessing pipeline --------------------------------------------------
# Numeric columns are scaled with MinMaxScaler; categories unseen during
# fitting are ignored (one-hot) or encoded as -1 (ordinal).
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_cols),
        ('one_hot_enc', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
        ('ordinal_enc', ordinal_encoder, ordinal_cols),
    ]
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training split only, then apply the fitted transform elsewhere.
X_train = pipeline.fit_transform(X_train)
X_val = pipeline.transform(X_val)
X_test = pipeline.transform(X_test)

In [None]:
# 'balanced' class weights computed from the training labels, used to
# re-weight the loss per class during fitting.
unique_classes = np.unique(y_train)
class_weight_array = compute_class_weight(class_weight='balanced', classes=unique_classes, y=y_train)
class_weight_dict = dict(zip(map(int, unique_classes), map(float, class_weight_array)))
print("Class weights:", class_weight_dict)


# Two hidden blocks of Dense(relu) -> BatchNormalization -> Dropout,
# followed by a three-class softmax output.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
for units, dropout_rate in ((512, 0.3), (256, 0.2)):
    model.add(Dense(units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
model.add(Dense(3, activation='softmax'))


# Labels are integer class indices, hence the sparse loss.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation accuracy has not improved for 30 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=30, restore_best_weights=True)


history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=256,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
    class_weight=class_weight_dict,
)

In [None]:
eval_metric(model, X_train,y_train, X_val,  y_val) 

In [None]:
pd.DataFrame(history.history).plot(figsize=(10,6))
plt.grid(True)
plt.gca().set_ylim(0, 1)
plt.show()