In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## <span style="color:#21996fd1;">**1- Initial Exploration & EDA**</span>
---

In [None]:
# Load the raw application data; the relative path is resolved against the
# notebook's working directory.
df=pd.read_csv("application_data.csv")

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Summary statistics as produced by DataFrame.describe().
df.describe()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Share of missing values per column (in %), most incomplete columns first.
# `null_percent` is reused by the column-dropping cell that follows.
null_percent = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)

# Print the ranking in pages of 25 columns.
for i in range(0, len(null_percent), 25):
    print(f"\n📊 Columns {i+1} to {min(i+25, len(null_percent))}:\n")
    print(null_percent.iloc[i:i+25])

## <span style="color:#21996fd1;"> **2- Data Cleaning**
---

In [None]:
# Drop the 25 columns that head the missing-value ranking in `null_percent`.
# The selection is positional: no missing-share threshold is checked here.
sparsest_columns = null_percent.index[:25]
df = df.drop(columns=sparsest_columns)

In [None]:
# Recompute the share of missing values per column (in %) on the reduced
# frame, most incomplete columns first. `null_percent` is reused by the
# column-dropping cell that follows.
null_percent = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)

# Print the ranking in pages of 25 columns.
for i in range(0, len(null_percent), 25):
    print(f"\n📊 Columns {i+1} to {min(i+25, len(null_percent))}:\n")
    print(null_percent.iloc[i:i+25])

##### <span style="color:#a8dbc9d1;">Remove the next 24 columns with more than 50% missing values</span>

In [None]:
# Drop the 24 columns that now head the refreshed `null_percent` ranking.
# The selection is positional: no missing-share threshold is checked here.
next_sparsest_columns = null_percent.index[:24]
df = df.drop(columns=next_sparsest_columns)

In [None]:
# Percentage share of each OCCUPATION_TYPE value among the rows where it is
# present (value_counts skips NaN by default).
df['OCCUPATION_TYPE'].value_counts(normalize=True) * 100

In [None]:
# Percentage share of each distinct EXT_SOURCE_3 value.
df['EXT_SOURCE_3'].value_counts(normalize=True) * 100

In [None]:
# Percentage share of each distinct EXT_SOURCE_2 value.
df['EXT_SOURCE_2'].value_counts(normalize=True) * 100

In [None]:
# Distribution of TARGET values as fractions (not multiplied by 100).
df['TARGET'].value_counts(normalize=True)

In [None]:
# The next six cells show the value distribution (%) of each
# AMT_REQ_CREDIT_BUREAU_* column, one column per cell.
df['AMT_REQ_CREDIT_BUREAU_YEAR'].value_counts(normalize=True) * 100

In [None]:
df['AMT_REQ_CREDIT_BUREAU_QRT'].value_counts(normalize=True) * 100

In [None]:
df['AMT_REQ_CREDIT_BUREAU_MON'].value_counts(normalize=True) * 100

In [None]:
df['AMT_REQ_CREDIT_BUREAU_WEEK'].value_counts(normalize=True) * 100

In [None]:
df['AMT_REQ_CREDIT_BUREAU_DAY'].value_counts(normalize=True)*100

In [None]:
df['AMT_REQ_CREDIT_BUREAU_HOUR'].value_counts(normalize=True)*100

##### <span style="color:#a8dbc9d1;">Drop columns that have more than 90% of the same value</span>

In [None]:

# Columns removed at this step. The five AMT_REQ_CREDIT_BUREAU_* columns were
# inspected in the cells above; NAME_TYPE_SUITE is dropped alongside them.
dominated_columns = [
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'NAME_TYPE_SUITE',
]
df.drop(columns=dominated_columns, inplace=True)

In [None]:
# Recompute the share of missing values per column (in %) on the current
# frame, most incomplete columns first.
null_percent = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)

# Print the ranking in pages of 25 columns.
for i in range(0, len(null_percent), 25):
    print(f"\n📊 Columns {i+1} to {min(i+25, len(null_percent))}:\n")
    print(null_percent.iloc[i:i+25])


In [None]:
# Value distribution (%) of CNT_FAM_MEMBERS.
df['CNT_FAM_MEMBERS'].value_counts(normalize=True)*100

In [None]:
# Value distribution (%) of DAYS_LAST_PHONE_CHANGE before it is binned.
df['DAYS_LAST_PHONE_CHANGE'].value_counts(normalize=True)*100

##### <span style="color:#a8dbc9d1;">Bin 'DAYS_LAST_PHONE_CHANGE' into 3 categories ('PHONE_CHANGE_STATUS')</span>

In [None]:
def phone_change_category(days):
    """Bucket a DAYS_LAST_PHONE_CHANGE value into one of three labels.

    Args:
        days: Numeric day offset to classify.

    Returns:
        'recent_change' when ``days >= -30``, 'within_1_year' when
        ``-365 <= days < -30``, and 'long_time_stable' otherwise. NaN fails
        both comparisons, so it also yields 'long_time_stable'.
    """
    # Lower bounds are checked from the most recent bucket to the oldest.
    for lower_bound, label in ((-30, 'recent_change'), (-365, 'within_1_year')):
        if days >= lower_bound:
            return label
    return 'long_time_stable'

# Label every row with its phone-change bucket.
df['PHONE_CHANGE_STATUS'] = df['DAYS_LAST_PHONE_CHANGE'].apply(phone_change_category)

In [None]:
# Share (%) of rows per bucket.
df['PHONE_CHANGE_STATUS'].value_counts(normalize=True) * 100

In [None]:
# PHONE_CHANGE_STATUS replaces the raw day count, so drop the latter.
df.drop(columns=['DAYS_LAST_PHONE_CHANGE'], inplace=True)

In [None]:
# Value distribution (%) of FLAG_DOCUMENT_3, the one FLAG_DOCUMENT_* column
# that the following cell does not drop.
df['FLAG_DOCUMENT_3'].value_counts(normalize=True) * 100

In [None]:
# FLAG_DOCUMENT_2 and FLAG_DOCUMENT_4 ... FLAG_DOCUMENT_21, in ascending
# order; FLAG_DOCUMENT_3 is deliberately left out and therefore kept.
docs_to_drop = [f'FLAG_DOCUMENT_{doc_no}' for doc_no in range(2, 22) if doc_no != 3]

df.drop(docs_to_drop, axis=1, inplace=True)

In [None]:
# Recompute the share of missing values per column (in %) on the current
# frame, most incomplete columns first.
null_percent = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)

# Print the ranking in pages of 25 columns.
for i in range(0, len(null_percent), 25):
    print(f"\n📊 Columns {i+1} to {min(i+25, len(null_percent))}:\n")
    print(null_percent.iloc[i:i+25])

In [None]:
# Fractional share of each ORGANIZATION_TYPE value.
df['ORGANIZATION_TYPE'].value_counts(normalize=True)

In [None]:
# Population of the region where the client lives (a relative figure, going by
# the column name).
df['REGION_POPULATION_RELATIVE'].value_counts(normalize=True)

In [None]:
# The lender's rating of the quality or "level" of the area where the client lives.
df['REGION_RATING_CLIENT'].value_counts(normalize=True)

In [None]:
# Presumably the same rating with the city taken into account - TODO confirm
# against the data dictionary.
df['REGION_RATING_CLIENT_W_CITY'].value_counts(normalize=True)

In [None]:
# Is the client officially registered in a region different from the one they actually live in?
df['REG_REGION_NOT_LIVE_REGION'].value_counts(normalize=True)

In [None]:
# Fractional share of each HOUR_APPR_PROCESS_START value.
df['HOUR_APPR_PROCESS_START'].value_counts(normalize=True)

In [None]:
# Does the client work in a region different from the one they are registered in?
# NOTE(review): by analogy with REG_REGION_NOT_LIVE_REGION the "REG" prefix looks
# like the registered region, not the region of residence - confirm.
df['REG_REGION_NOT_WORK_REGION'].value_counts(normalize=True)

In [None]:
# Compares the client's officially registered city with the city they actually live in.
df['REG_CITY_NOT_LIVE_CITY'].value_counts(normalize=True)

In [None]:
# Does the client live in a region different from the one they work in?
df['LIVE_REGION_NOT_WORK_REGION'].value_counts(normalize=True)

In [None]:
# Presumably the city-level counterpart of LIVE_REGION_NOT_WORK_REGION - TODO confirm.
df['LIVE_CITY_NOT_WORK_CITY'].value_counts(normalize=True)

In [None]:
# Columns discarded after the inspection above.
# NOTE(review): REG_CITY_NOT_WORK_CITY is dropped without having been inspected,
# while REGION_RATING_CLIENT_W_CITY was inspected but is kept - confirm intent.
columns_to_drop = [
    "REGION_POPULATION_RELATIVE", "REGION_RATING_CLIENT",
    "REG_REGION_NOT_LIVE_REGION", "HOUR_APPR_PROCESS_START",
    "REG_CITY_NOT_WORK_CITY", "REG_CITY_NOT_LIVE_CITY",
    "REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION",
    "LIVE_CITY_NOT_WORK_CITY",
]

# errors='ignore' skips labels that are not present instead of raising, so a
# misspelled column name would pass silently.
df = df.drop(columns_to_drop, axis=1, errors='ignore')
print(df.columns)

In [None]:
def wealth_category(row):
    """Derive a coarse wealth label from the car / realty ownership flags.

    Args:
        row: Mapping-like row exposing "FLAG_OWN_CAR" and "FLAG_OWN_REALTY".
            A flag counts as owned only when its value equals 'Y'.

    Returns:
        "High" when both flags are 'Y', "Medium" when exactly one is,
        "Low" when neither is.
    """
    owns_car = row["FLAG_OWN_CAR"] == 'Y'
    owns_realty = row["FLAG_OWN_REALTY"] == 'Y'
    labels_by_asset_count = {0: "Low", 1: "Medium", 2: "High"}
    return labels_by_asset_count[int(owns_car) + int(owns_realty)]

# Compute WEALTH row by row, then drop the two flags it was derived from.
df["WEALTH"] = df.apply(wealth_category, axis=1)
df.drop(["FLAG_OWN_CAR", "FLAG_OWN_REALTY"], axis=1, inplace=True)


In [None]:
# Fractional share of each WEALTH label.
df['WEALTH'].value_counts(normalize=True)

In [None]:
# Contact-channel flags that are folded into the single COMMUNICATION column.
contact_flags = [
    "FLAG_WORK_PHONE",
    "FLAG_EMP_PHONE",
    "FLAG_PHONE",
    "FLAG_CONT_MOBILE",
    "FLAG_EMAIL",
    "FLAG_MOBIL",
]

# Bitwise OR across the flags, accumulated left to right.
any_contact = df[contact_flags[0]]
for flag_name in contact_flags[1:]:
    any_contact = any_contact | df[flag_name]

df["COMMUNICATION"] = any_contact.astype(int)

# The individual flags are no longer needed once the combined column exists.
df.drop(columns=contact_flags, inplace=True)


In [None]:
# Fractional share of each NAME_HOUSING_TYPE value before it is collapsed to two levels.
df['NAME_HOUSING_TYPE'].value_counts(normalize=True)

In [None]:
# Collapse housing type to two levels: the exact raw label "House / apartment"
# becomes "House/apartment"; every other value (NaN included) becomes "Other".
is_house_or_apartment = df["NAME_HOUSING_TYPE"] == "House / apartment"
df["NAME_HOUSING_TYPE"] = is_house_or_apartment.map(
    {True: "House/apartment", False: "Other"}
)

In [None]:
# Fractional share of each DAYS_REGISTRATION value.
df['DAYS_REGISTRATION'].value_counts(normalize=True)

In [None]:
# Fractional share of each DAYS_ID_PUBLISH value.
df['DAYS_ID_PUBLISH'].value_counts(normalize=True)

In [None]:
# Negate and divide by 365 to express the day count in years (1 decimal), then
# drop the day column. Presumably the DAYS_* values are stored as negative
# offsets, hence the sign flip - TODO confirm.
df['YEARS_ID_PUBLISH'] = (-df['DAYS_ID_PUBLISH'] / 365).round(1)
df.drop(columns=['DAYS_ID_PUBLISH'], inplace=True)

In [None]:
# Fractional share of each YEARS_ID_PUBLISH value.
df['YEARS_ID_PUBLISH'].value_counts(normalize=True)

In [None]:
# Same days-to-years conversion for DAYS_REGISTRATION.
df['YEARS_REGISTRATION'] = (-df['DAYS_REGISTRATION'] / 365).round(1)
df.drop(columns=['DAYS_REGISTRATION'], inplace=True)

In [None]:
# Fractional share of each YEARS_REGISTRATION value.
df['YEARS_REGISTRATION'].value_counts(normalize=True)

##### <span style="color:#a8dbc9d1;">Create STABILITY_YEARS Column</span>

In [None]:
# STABILITY_YEARS = YEARS_REGISTRATION - YEARS_ID_PUBLISH, rounded to one
# decimal. pop() removes each source column from df while handing it back.
years_registration = df.pop('YEARS_REGISTRATION')
years_id_publish = df.pop('YEARS_ID_PUBLISH')
df['STABILITY_YEARS'] = years_registration.sub(years_id_publish).round(1)

In [None]:
def stability_category(x):
    """Turn a numeric STABILITY_YEARS value into a category label.

    Args:
        x: Numeric value to classify.

    Returns:
        "Risk" when ``x <= 0``, "Stable" when ``0 < x <= 10``, and
        "Deep_Stable" otherwise. NaN fails both comparisons, so it also
        yields "Deep_Stable".
    """
    return "Risk" if x <= 0 else ("Stable" if x <= 10 else "Deep_Stable")

# Overwrite the numeric difference with its category label; from here on
# STABILITY_YEARS holds strings, not years.
df["STABILITY_YEARS"] = df["STABILITY_YEARS"].apply(stability_category)

In [None]:
# Fractional share of each stability label.
df['STABILITY_YEARS'].value_counts(normalize=True)

In [None]:
# Fractional share of each DAYS_EMPLOYED value before conversion to years.
df['DAYS_EMPLOYED'].value_counts(normalize=True)

In [None]:
# 365243 is mapped to 0 before the conversion (it looks like a placeholder
# value in DAYS_EMPLOYED - TODO confirm). The remaining values are negated and
# expressed in years with one decimal. pop() removes the day column from df.
days_employed = df.pop('DAYS_EMPLOYED').replace(365243, 0)
df['YEARS_EMPLOYED'] = (-days_employed / 365).round(1)

In [None]:
# Fractional share of each AMT_INCOME_TOTAL value.
df['AMT_INCOME_TOTAL'].value_counts(normalize=True)

In [None]:
# Fractional share of each DAYS_BIRTH value before conversion to years.
df['DAYS_BIRTH'].value_counts(normalize=True)

In [None]:
# Negate and divide by 365 to get the age in years, rounded to one decimal.
df['AGE_YEARS'] = (-df['DAYS_BIRTH'] / 365).round(1)

In [None]:
# AGE_YEARS replaces the raw day count.
df.drop(columns=['DAYS_BIRTH'], inplace=True)

In [None]:
# Fractional share of each AGE_YEARS value.
df['AGE_YEARS'].value_counts(normalize=True)

In [None]:
# Fractional share of each CODE_GENDER value.
df['CODE_GENDER'].value_counts(normalize=True)

In [None]:
# Fractional share of each NAME_CONTRACT_TYPE value.
df['NAME_CONTRACT_TYPE'].value_counts(normalize=True)

In [None]:
# Fractional share of each NAME_INCOME_TYPE value before the rare levels are merged.
df['NAME_INCOME_TYPE'].value_counts(normalize=True)

In [None]:
# Keep the three income types listed below as they are; every other value
# (NaN included) is relabelled 'Others'.
main_income_types = ['Working', 'Commercial associate', 'Pensioner']
income_type = df['NAME_INCOME_TYPE']
df['NAME_INCOME_TYPE'] = income_type.where(income_type.isin(main_income_types), 'Others')

df['NAME_INCOME_TYPE'].value_counts(normalize=True)


In [None]:
# Fractional share of each CNT_CHILDREN value.
df['CNT_CHILDREN'].value_counts(normalize=True)

In [None]:
# Recompute the share of missing values per column (in %) on the current
# frame, most incomplete columns first.
null_percent = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)

# Print the ranking in pages of 25 columns.
for i in range(0, len(null_percent), 25):
    print(f"\n📊 Columns {i+1} to {min(i+25, len(null_percent))}:\n")
    print(null_percent.iloc[i:i+25])

In [None]:
# Fractional share of each raw OCCUPATION_TYPE value (NaN excluded) before grouping.
df['OCCUPATION_TYPE'].value_counts(normalize=True)

In [None]:
def group_occupation_types(df):
    """Replace raw OCCUPATION_TYPE labels with six coarser group labels.

    The frame is modified in place and also returned, so both
    ``group_occupation_types(df)`` and ``df = group_occupation_types(df)`` work.

    Args:
        df: DataFrame with an 'OCCUPATION_TYPE' column.

    Returns:
        The same DataFrame object, with 'OCCUPATION_TYPE' mapped to one of
        'Tech', 'Service', 'Labor', 'Admin', 'Medical' or 'Sales'. Missing
        values and labels not listed below become NaN.
    """
    occupation_groups = {
        'Tech': ['IT staff', 'High skill tech staff'],
        'Service': ['Cooking staff', 'Cleaning staff', 'Private service staff', 'Waiters/barmen staff'],
        'Labor': ['Laborers', 'Low-skill Laborers', 'Drivers'],
        'Admin': ['Core staff', 'Managers', 'Accountants', 'Secretaries', 'HR staff'],
        'Medical': ['Medicine staff', 'Security staff'],
        'Sales': ['Sales staff', 'Realty agents']
    }

    # Invert the grouping: raw label -> group label.
    occupation_map = {
        old_name: new_group
        for new_group, old_list in occupation_groups.items()
        for old_name in old_list
    }

    # Group labels map to themselves so that running the function again on an
    # already grouped column is a no-op instead of turning every value into NaN.
    occupation_map.update({new_group: new_group for new_group in occupation_groups})

    df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].map(occupation_map)

    return df

In [None]:
# Replace the raw occupation labels with their group labels; the helper
# modifies the frame it is given and returns it.
df = group_occupation_types(df)

In [None]:
# Fractional share of each occupation group (NaN excluded).
df['OCCUPATION_TYPE'].value_counts(normalize=True)

In [None]:
# Recompute the share of missing values per column (in %) on the current
# frame, most incomplete columns first.
null_percent = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)

# Print the ranking in pages of 25 columns.
for i in range(0, len(null_percent), 25):
    print(f"\n📊 Columns {i+1} to {min(i+25, len(null_percent))}:\n")
    print(null_percent.iloc[i:i+25])

In [None]:
# Fractional share of each DEF_30_CNT_SOCIAL_CIRCLE value.
df['DEF_30_CNT_SOCIAL_CIRCLE'].value_counts(normalize=True)

In [None]:
# Row-wise mean of the four *_CNT_SOCIAL_CIRCLE columns (DataFrame.mean skips
# NaN by default), after which the source columns are removed.
social_circle_cols = [
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
    'OBS_30_CNT_SOCIAL_CIRCLE',
]
df['SOCIAL_CIRCLE_AVG'] = df[social_circle_cols].mean(axis=1)

df.drop(social_circle_cols, axis=1, inplace=True)

In [None]:
# Preview the first values of the new SOCIAL_CIRCLE_AVG column.
df['SOCIAL_CIRCLE_AVG'].head()

In [None]:
# NOTE(review): exact duplicate of the previous cell; it can be deleted.
df['SOCIAL_CIRCLE_AVG'].head()

### <span style="color:#7dc7add1;">**=> Check Outliers**</span>
---

In [None]:
# Box plot of CNT_CHILDREN drawn on an explicitly created axis.
fig, ax = plt.subplots(figsize=(8, 4))
ax.boxplot(df['CNT_CHILDREN'])
ax.set_title("Boxplot for Outlier Detection")
plt.show()

In [None]:
# matplotlib.pyplot is already imported as `plt` in the notebook's first cell,
# so it is not imported again here.

# Box plots of every int64/float64 column side by side on one axis; because
# the y-axis is shared, columns with large magnitudes dominate the scale.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

fig, ax = plt.subplots(figsize=(12, 6))
df[numeric_cols].boxplot(rot=90, ax=ax)
ax.set_title("Boxplot for Outlier Detection")
plt.show()


### <span style="color:#7dc7add1;">**=> Handling Nulls**
---

In [None]:
# Fill missing SOCIAL_CIRCLE_AVG values with the column's mode (the first one
# returned by Series.mode() when there are ties).
df['SOCIAL_CIRCLE_AVG'] = df['SOCIAL_CIRCLE_AVG'].fillna(df['SOCIAL_CIRCLE_AVG'].mode()[0])

In [None]:
# Discard the rows that are missing any of these three columns.
df = df.dropna(subset=['AMT_ANNUITY', 'CNT_FAM_MEMBERS','AMT_GOODS_PRICE'])

In [None]:
# Fractional share of each NAME_EDUCATION_TYPE value.
df['NAME_EDUCATION_TYPE'].value_counts(normalize=True)

In [None]:
from scipy.stats import chi2_contingency
def cramers_v(x, y):
    """Cramér's V association between two categorical variables.

    The statistic is computed from the chi-squared value of the x-by-y
    contingency table (no continuity correction), with the small-sample
    correction applied to phi² and to the table dimensions.

    Args:
        x: Array-like / Series of category labels (rows of the table).
        y: Array-like / Series of category labels (columns of the table),
            aligned with ``x``.

    Returns:
        The corrected Cramér's V as a float, or NaN when the statistic is
        undefined: fewer than two observations, either variable showing fewer
        than two levels, or a corrected table dimension of zero.
    """
    contingency = pd.crosstab(x, y)
    n = contingency.sum().sum()
    r, k = contingency.shape

    # Degenerate tables would otherwise divide by zero below (or make
    # chi2_contingency raise on an empty table).
    if n <= 1 or min(r, k) < 2:
        return np.nan

    chi2 = chi2_contingency(contingency, correction=False)[0]
    phi2 = chi2 / n

    # Small-sample correction; phi² is clipped at zero.
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)

    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # Happens when one variable has as many levels as there are observations.
        return np.nan
    return np.sqrt(phi2corr / denominator)

# Cramér's V between OCCUPATION_TYPE and every other object-dtype column,
# strongest association first.
cat_cols = df.select_dtypes(include='object').columns

corrs = {
    col: cramers_v(df['OCCUPATION_TYPE'], df[col])
    for col in cat_cols
    if col != 'OCCUPATION_TYPE'
}

corr_series = pd.Series(corrs).sort_values(ascending=False)
print(corr_series)


##### <span style="color:#a8dbc9d1;">Handling OCCUPATION_TYPE column

In [None]:
def _fill_with_group_mode(occupations):
    """Fill the NaNs of one group with that group's first mode, if it has one."""
    group_modes = occupations.mode()
    replacement = group_modes[0] if not group_modes.empty else pd.NA
    return occupations.fillna(replacement)


# Impute a missing OCCUPATION_TYPE with the most frequent occupation among
# rows that share the same ORGANIZATION_TYPE.
occupation_by_org = df.groupby(['ORGANIZATION_TYPE'])['OCCUPATION_TYPE']
df['OCCUPATION_TYPE'] = occupation_by_org.transform(_fill_with_group_mode)

# Whatever is still missing (groups without any observed occupation) gets an
# explicit placeholder label.
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna('Unknown')

In [None]:
# Fill missing EXT_SOURCE_2 values with the column median.
df['EXT_SOURCE_2'] = df['EXT_SOURCE_2'].fillna(df['EXT_SOURCE_2'].median())

In [None]:
# Recompute the share of missing values per column (in %) on the current
# frame, most incomplete columns first.
null_percent = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)

# Print the ranking in pages of 25 columns.
for i in range(0, len(null_percent), 25):
    print(f"\n📊 Columns {i+1} to {min(i+25, len(null_percent))}:\n")
    print(null_percent.iloc[i:i+25])

In [None]:
# Fractional share of each AMT_REQ_CREDIT_BUREAU_YEAR value (NaN excluded).
df['AMT_REQ_CREDIT_BUREAU_YEAR'].value_counts(normalize=True)

In [None]:
# Correlation (DataFrame.corr default method) of every numeric column with
# EXT_SOURCE_3, sorted from the largest coefficient to the smallest.
# Note: `corrs` is a Series here, whereas the Cramér's V cell uses the same
# name for a dict.
num_cols = df.select_dtypes(include=[np.number]).columns

corrs = df[num_cols].corr()['EXT_SOURCE_3'].sort_values(ascending=False)

print(corrs)


##### <span style="color:#a8dbc9d1;">Handling EXT_SOURCE_3 column

In [None]:
# Impute a missing EXT_SOURCE_3 with the median of the rows that share the
# same AGE_YEARS value. transform('median') returns each row's group median
# aligned to df's index, so fillna only touches the missing entries and the
# observed values are never rewritten.
age_group_median = df.groupby('AGE_YEARS')['EXT_SOURCE_3'].transform('median')
df['EXT_SOURCE_3'] = df['EXT_SOURCE_3'].fillna(age_group_median)

# Fallback for rows whose age group has no observed EXT_SOURCE_3 at all.
df['EXT_SOURCE_3'] = df['EXT_SOURCE_3'].fillna(df['EXT_SOURCE_3'].median())


In [None]:
# Cramér's V between AMT_REQ_CREDIT_BUREAU_YEAR (its distinct values treated
# as categories) and every object-dtype column, strongest association first.
# The cramers_v helper defined earlier in the notebook is reused here rather
# than being imported and defined a second time.
cat_cols = df.select_dtypes(include='object').columns

corrs = {
    col: cramers_v(df['AMT_REQ_CREDIT_BUREAU_YEAR'], df[col])
    for col in cat_cols
    if col != 'AMT_REQ_CREDIT_BUREAU_YEAR'
}

corr_series = pd.Series(corrs).sort_values(ascending=False)
print(corr_series)
