In [None]:
## Data download link: https://drive.google.com/drive/folders/16RQztUqCfJOlbooHqYlJrp6Q7iL65uZB

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the application records from CSV into the working DataFrame.
application_csv = '/content/application_data.csv'
data = pd.read_csv(application_csv)

In [None]:
# Read the column-description CSV, decoding it as Latin-1.
data_description = pd.read_csv(
    filepath_or_buffer='/content/columns_description.csv',
    encoding='latin1',
)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Count the rows flagged by duplicated(), i.e. rows that repeat an earlier row.
data.duplicated().sum()

In [None]:
# Print pandas' concise summary of the frame.
data.info()

In [None]:
# Build one "name: null count: dtype" line per column, keep the lines in `columns`
# for display, and persist them to column_names.txt.
data_columns = data.columns.tolist()

# Null counts are computed once for the whole frame instead of once per column.
null_counts = data.isnull().sum()

columns = [
    f'{column}: {null_counts[column]}: {data[column].dtype}'
    for column in data_columns
]

with open('column_names.txt', 'w') as f:
    # The file used to be opened but never written to, which left it empty.
    f.write('\n'.join(columns) + '\n')

In [None]:
# Display the contents of `columns`.
columns

In [None]:
# Correlation of every int64/float64 column with TARGET, stored as "name: value" strings.
# No file is opened here: the previous open('column_names.txt', 'w') wrote nothing and
# only truncated that file.
correlation = [
    f"{column}: {data[column].corr(data['TARGET'])}"
    for column in data_columns
    if data[column].dtype == 'int64' or data[column].dtype == 'float64'
]

In [None]:
# Display the contents of `correlation`.
correlation

In [None]:
# Bar chart of how often each TARGET value occurs.
target_counts = data['TARGET'].value_counts()
target_counts.plot(kind='bar')

In [None]:
# Frequency of each TARGET value.
data['TARGET'].value_counts()

In [None]:
# TARGET aggregated by barplot's default estimator for each NAME_CONTRACT_TYPE value.
sns.barplot(data=data, x='NAME_CONTRACT_TYPE', y='TARGET')

In [None]:
# Bar chart of how often each CODE_GENDER value occurs.
gender_counts = data['CODE_GENDER'].value_counts()
gender_counts.plot(kind='bar')

In [None]:
# TARGET aggregated by barplot's default estimator for each CODE_GENDER value.
sns.barplot(data=data, x='CODE_GENDER', y='TARGET')

In [None]:
# Bar chart of how often each FLAG_OWN_CAR value occurs.
own_car_counts = data['FLAG_OWN_CAR'].value_counts()
own_car_counts.plot(kind='bar')

In [None]:
# TARGET aggregated by barplot's default estimator for each FLAG_OWN_CAR value.
sns.barplot(data=data, x='FLAG_OWN_CAR', y='TARGET')

In [None]:
# Bar chart of how often each FLAG_OWN_REALTY value occurs.
own_realty_counts = data['FLAG_OWN_REALTY'].value_counts()
own_realty_counts.plot(kind='bar')

In [None]:
# TARGET aggregated by barplot's default estimator for each FLAG_OWN_REALTY value.
sns.barplot(data=data, x='FLAG_OWN_REALTY', y='TARGET')

In [None]:
# Scatter plot of AMT_INCOME_TOTAL.
# NOTE(review): the Series is passed positionally; whether seaborn reads it as `data`
# or as `x` depends on the installed seaborn version — confirm and prefer an explicit keyword.
sns.scatterplot(data['AMT_INCOME_TOTAL'])

In [None]:
# Scatter plot of AMT_CREDIT.
# NOTE(review): the Series is passed positionally; whether seaborn reads it as `data`
# or as `x` depends on the installed seaborn version — confirm and prefer an explicit keyword.
sns.scatterplot(data['AMT_CREDIT'])

In [None]:
# Distribution of AMT_INCOME_TOTAL as a density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is the
# documented replacement for its default output. distplot capped its automatic bin
# count at 50, so bins=50 keeps the binning bounded on long-tailed data.
sns.histplot(x=data['AMT_INCOME_TOTAL'], kde=True, stat='density', bins=50)

In [None]:
# Mean of AMT_INCOME_TOTAL.
data['AMT_INCOME_TOTAL'].mean()

In [None]:
# Mean of AMT_CREDIT.
data['AMT_CREDIT'].mean()

In [None]:
# Scatter plot of CNT_CHILDREN.
# NOTE(review): the Series is passed positionally; whether seaborn reads it as `data`
# or as `x` depends on the installed seaborn version — confirm and prefer an explicit keyword.
sns.scatterplot(data['CNT_CHILDREN'])

In [None]:
# Largest value found in CNT_CHILDREN.
data['CNT_CHILDREN'].max()

In [None]:
# Box plot of CNT_CHILDREN.
# NOTE(review): the Series is passed positionally; its interpretation (and therefore the
# box orientation) may differ between seaborn versions — confirm and prefer an explicit keyword.
sns.boxplot(data['CNT_CHILDREN'])

In [None]:
# Box plot of AMT_ANNUITY.
# NOTE(review): the Series is passed positionally; its interpretation (and therefore the
# box orientation) may differ between seaborn versions — confirm and prefer an explicit keyword.
sns.boxplot(data['AMT_ANNUITY'])

In [None]:
# Bar chart of how often each NAME_EDUCATION_TYPE value occurs.
education_counts = data['NAME_EDUCATION_TYPE'].value_counts()
education_counts.plot(kind='bar')

In [None]:
# TARGET aggregated per NAME_EDUCATION_TYPE value, with the x tick labels drawn vertically.
sns.barplot(data=data, x='NAME_EDUCATION_TYPE', y='TARGET')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Frequency of each FLAG_MOBIL value.
data['FLAG_MOBIL'].value_counts()

In [None]:
# Frequency of each FLAG_EMAIL value.
data['FLAG_EMAIL'].value_counts()

In [None]:
# TARGET aggregated by barplot's default estimator for each FLAG_EMAIL value.
sns.barplot(data=data, x='FLAG_EMAIL', y='TARGET')

In [None]:
# Frequency of each OCCUPATION_TYPE value.
data['OCCUPATION_TYPE'].value_counts()

In [None]:
# TARGET aggregated per OCCUPATION_TYPE value, with the x tick labels drawn vertically.
sns.barplot(data=data, x='OCCUPATION_TYPE', y='TARGET')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Bar chart of how often each DEF_30_CNT_SOCIAL_CIRCLE value occurs.
def_30_counts = data['DEF_30_CNT_SOCIAL_CIRCLE'].value_counts()
def_30_counts.plot(kind='bar')

In [None]:
# Bar chart of how often each REGION_RATING_CLIENT value occurs.
region_rating_counts = data['REGION_RATING_CLIENT'].value_counts()
region_rating_counts.plot(kind='bar')

In [None]:
# TARGET aggregated per REGION_RATING_CLIENT value, with the x tick labels drawn vertically.
sns.barplot(data=data, x='REGION_RATING_CLIENT', y='TARGET')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Frequency of each REG_REGION_NOT_WORK_REGION value.
data['REG_REGION_NOT_WORK_REGION'].value_counts()

In [None]:
# TARGET aggregated per REG_REGION_NOT_WORK_REGION value, with the x tick labels drawn vertically.
sns.barplot(data=data, x='REG_REGION_NOT_WORK_REGION', y='TARGET')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Frequency of each DAYS_LAST_PHONE_CHANGE value.
data['DAYS_LAST_PHONE_CHANGE'].value_counts()

In [None]:
# Correlation between DAYS_LAST_PHONE_CHANGE and TARGET.
data['DAYS_LAST_PHONE_CHANGE'].corr(data['TARGET'])

In [None]:
# Restrict the table to TARGET plus the columns chosen for modelling.
# .copy() yields an independent frame, so in-place cleaning applied to this subset
# acts on the subset itself and pandas does not raise SettingWithCopyWarning.
selected_columns = [
    'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'AMT_INCOME_TOTAL',
    'AMT_CREDIT', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT', 'REG_REGION_NOT_WORK_REGION', 'DAYS_LAST_PHONE_CHANGE',
]
data = data[selected_columns].copy()

In [None]:
# Display the current contents of `data`.
data

In [None]:
# Number of missing values in each remaining column.
data.isna().sum()

In [None]:
# Drop every row that contains a missing value.
# Rebinding the result (instead of inplace=True) avoids mutating a frame that was derived
# by column selection, which can raise SettingWithCopyWarning, and keeps the cell chainable.
data = data.dropna()

In [None]:
# (rows, columns) of the current `data` frame.
data.shape

In [None]:
# Print pandas' concise summary of the current `data` frame.
data.info()

In [None]:
# Heat map of the pairwise correlations between the numeric columns.
numeric_frame = data.select_dtypes(include='number')
numerical_columns = numeric_frame.columns
sns.heatmap(numeric_frame.corr())

In [None]:
# Pairwise correlation matrix of the numeric columns.
data.loc[:, numerical_columns].corr()

In [None]:
# Separate the label column from the feature columns.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
split_options = dict(test_size=0.2, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

In [None]:
# Shapes of the test features and test labels.
X_test.shape, y_test.shape

In [None]:
# Display the training feature frame.
X_train

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Columns that are one-hot encoded.
categorical_ohe = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR']

# Single-step pipeline: the first category of each column is dropped and categories
# unseen during fit are ignored at transform time.
one_hot_step = ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'))
categorical_ohe_transformer = Pipeline(steps=[one_hot_step])

In [None]:
# Columns that are encoded as integer codes.
categorical_oe = ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE']

categorical_oe_transformer = Pipeline(steps=[
    # The step keeps its original name 'ohe' (although it wraps an OrdinalEncoder) so
    # parameter paths built from that name stay valid.
    # Categories absent from the training data are mapped to -1 instead of raising at
    # transform time, in line with the one-hot encoder's handle_unknown='ignore'.
    # NOTE(review): the integer codes follow the encoder's automatically derived category
    # order, which may not reflect a meaningful ranking for these columns — confirm.
    ('ohe', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [None]:
# Numeric columns that are passed through StandardScaler.
numerical = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'DAYS_LAST_PHONE_CHANGE']

standard_scaler_step = ('ss', StandardScaler())
numerical_ss = Pipeline(steps=[standard_scaler_step])

In [None]:
# Route each column group to its transformer.
# NOTE(review): ColumnTransformer drops columns that are not listed in any transformer
# unless `remainder` is set. The feature frame also carries CNT_FAM_MEMBERS,
# REGION_RATING_CLIENT and REG_REGION_NOT_WORK_REGION, which appear in none of the
# lists below — confirm whether leaving them out of the model is intended.
column_routes = [
    ('categorical_ohe_transformer', categorical_ohe_transformer, categorical_ohe),
    ('categorical_oe_transformer', categorical_oe_transformer, categorical_oe),
    ('numerical_ss', numerical_ss, numerical),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Preprocessing followed by a logistic-regression classifier with default settings.
logreg_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=logreg_steps)

In [None]:
# Fit the preprocessing + classifier pipeline on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print accuracy, recall and precision.
y_pred_test = pipeline.predict(X_test)

for label, metric in (
    ("Accuracy score for test data is ", accuracy_score),
    ("Recall score of our Model is ", recall_score),
    ("Precision Score of our Model is ", precision_score),
):
    print(label, metric(y_test, y_pred_test))

In [None]:
# Same preprocessing, now feeding a random-forest classifier.
# random_state pins the forest's internal randomness so the reported metrics are
# reproducible across kernel restarts; 2 matches the seed used for train_test_split.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=2))
])

pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print accuracy, recall and precision.
y_pred_test = pipeline.predict(X_test)

for label, metric in (
    ("Accuracy score for test data is ", accuracy_score),
    ("Recall score of our Model is ", recall_score),
    ("Precision Score of our Model is ", precision_score),
):
    print(label, metric(y_test, y_pred_test))

In [None]:
# Same preprocessing, now feeding an XGBoost classifier with default settings.
xgb_steps = [
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
]
pipeline = Pipeline(steps=xgb_steps)

pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print accuracy, recall and precision.
y_pred_test = pipeline.predict(X_test)

for label, metric in (
    ("Accuracy score for test data is ", accuracy_score),
    ("Recall score of our Model is ", recall_score),
    ("Precision Score of our Model is ", precision_score),
):
    print(label, metric(y_test, y_pred_test))