# Importing 

In [None]:
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,confusion_matrix


In [None]:
# Application table, read from a path relative to the working directory.
app = pd.read_csv(r'application_record.csv')

# Credit-record table (later cells use its ID, STATUS and MONTHS_BALANCE columns).
rec = pd.read_csv(r'credit_record.csv')

In [None]:
def data_info(data):
    """Summarise every column of ``data`` in one DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the column name ('Col'), the
        number of distinct values ('n_uniques'), the distinct values
        themselves ('unique'), the dtype ('dtypes') and the count of
        missing entries ('NULLS').
    """
    names = list(data.columns)
    summary = {
        'Col': names,
        'n_uniques': [data[name].nunique() for name in names],
        'unique': [data[name].unique() for name in names],
        'dtypes': [data[name].dtype for name in names],
        'NULLS': [data[name].isna().sum() for name in names],
    }
    return pd.DataFrame(summary)

In [None]:
data_info(app)

In [None]:
data_info(rec)

In [None]:
app.info()

In [None]:
app.head()

In [None]:
rec.head()

# 2.Data Preprocessing 

#### Check how many times each customer ID is flagged 1 or 0

In [None]:
rec['STATUS'].unique()

In [None]:
# Collapse the raw STATUS codes into a binary flag: 'C', 'X' and '0' become 1,
# the codes '1' to '5' become 0.
# The result is assigned back to the column: calling
# ``replace(..., inplace=True)`` on ``rec['STATUS']`` acts on a temporary
# Series and leaves ``rec`` untouched when pandas copy-on-write is active.
# ``astype(int)`` makes the integer dtype explicit instead of relying on
# implicit downcasting.
status_to_flag = {'C': 1, 'X': 1, '0': 1, '1': 0, '2': 0, '3': 0, '4': 0, '5': 0}
rec['STATUS'] = rec['STATUS'].replace(status_to_flag).astype(int)

# One row per ID and one column per flag value, holding how many records of
# that ID carry the flag; combinations that never occur are counted as 0.
final_result = rec.value_counts(subset=['ID' , 'STATUS']).unstack(fill_value=0)
final_result


In [None]:
# Label each ID: 0 when at least one of its records carries flag 0, else 1.
# The column is built in one vectorised assignment; a chained assignment of
# the form ``final_result['target'][mask] = 0`` writes to a temporary Series
# and is silently ignored when pandas copy-on-write is active.
final_result['target'] = np.where(final_result[0] > 0, 0, 1)

final_result['target'].unique()

In [None]:
final_result['target'].unique()

In [None]:
# Any ID still without a label gets 1. The result is assigned back because
# ``fillna(..., inplace=True)`` on a selected column may act on a temporary
# copy and leave ``final_result`` unchanged.
final_result['target'] = final_result['target'].fillna(1)

In [None]:
final_result

In [None]:
# Integer-typed 'target' column as a one-column frame (index: customer ID).
new_target = final_result['target'].astype(int).to_frame()

In [None]:
data = app.merge(new_target , how = 'inner' , on = 'ID')

In [None]:
data

### Check Missing Values


In [None]:
data.isna().sum()

In [None]:
# Fill gaps with the 'other_type' placeholder in non-numeric columns only.
# A frame-wide fill would also write the string into any numeric column that
# has a missing value and turn that column into mixed object dtype.
non_numeric_cols = data.select_dtypes(exclude='number').columns
data[non_numeric_cols] = data[non_numeric_cols].fillna('other_type')

In [None]:
data.set_index('ID' , inplace=True)

In [None]:
data.duplicated().sum()

In [None]:
data.drop_duplicates(inplace=True)

In [None]:
data.info()

In [None]:
data.reset_index('ID' , inplace=True)

### 2.1Feature Generation  


###### 2.1.1.1 Total income for Person 

In [None]:

# Income per family member: each row's AMT_INCOME_TOTAL divided by that same
# row's CNT_FAM_MEMBERS.
# A nested loop that breaks after its first inner iteration would divide every
# income by the family size of the first row only, so the division is done
# element-wise instead. The per-row debug printing is dropped as well.
income_person = (data['AMT_INCOME_TOTAL'] / data['CNT_FAM_MEMBERS']).tolist()


In [None]:
len(income_person)

In [None]:

# One-column frame of the per-person income values, indexed by customer ID.
income_per = pd.DataFrame({'Person_income': income_person}).set_index(data['ID'])

income_per

###### 2.1.1.2 How many years and months has each customer worked?


In [None]:
# Employment duration per customer, treating a month as 30 days and a year as
# 12 such months; the year figure is rounded to two decimals.
month_employe = [days / 30 for days in data['DAYS_EMPLOYED']]
year_employe = [round(months / 12, 2) for months in month_employe]
len(year_employe)

In [None]:
# One-column frames for the employment features: absolute values, indexed by
# customer ID.
employee_year = pd.DataFrame({'employee_year': year_employe}).abs().set_index(data['ID'])
employee_month = pd.DataFrame({'employee_Month': month_employe}).abs().set_index(data['ID'])

In [None]:
employee_month , employee_year

###### 2.1.1.3 Age of the customer 


In [None]:
# Age per customer from DAYS_BIRTH, using 30-day months and 12-month years,
# rounded to three decimals.
age = [round((days / 30) / 12, 3) for days in data['DAYS_BIRTH']]

len(age)

In [None]:
# One-column 'Age' frame indexed by customer ID, converted to absolute values.
age_ = pd.DataFrame({'Age': age}).set_index(data['ID']).abs()
age_

###### Merging new Features in application csv file 


In [None]:
# Attach each engineered feature frame to the application data by customer ID,
# in the same order as they were built.
for feature_frame in (income_per, employee_month, employee_year, age_):
    data = data.merge(feature_frame, how='inner', on='ID')
data.head()

Some customers have an employment length greater than 60 years, so those rows are handled separately below.

In [None]:
# Index labels of the rows whose employment length exceeds 60 years.
# The rows are selected with a boolean mask. Passing the filtered year values
# to ``iloc`` would treat those floats as row positions, which is not a valid
# positional lookup.
sel = data[data['employee_year'] > 60].index

In [None]:
data.drop(sel , axis = 0 , inplace=True)

In [None]:
# Removes the row whose index label is 3.
# NOTE(review): nothing visible explains why label 3 is singled out, and
# ``drop`` raises KeyError when the label is no longer present — confirm intent.
data.drop(3 , axis = 0 , inplace=True)

In [None]:
data.head()

### 2.1.2 Credit Record Csv file

###### 2.1.2.1 Account Length

In [None]:
# Per-ID maximum of MONTHS_BALANCE, stored as an absolute value in a
# one-column frame named 'max'.
account_len = rec.groupby('ID')['MONTHS_BALANCE'].agg(['max']).abs()

account_len

###### 2.1.2.2 Starting Month

In [None]:
# Per-ID minimum of MONTHS_BALANCE, stored as an absolute value in a
# one-column frame named 'min'.
account_start = rec.groupby('ID')['MONTHS_BALANCE'].agg(['min']).abs()
account_start

###### 2.1.2.5 For how many months did each customer pay, and not pay, the loan?


In [None]:
rec.value_counts(subset=['ID' , 'MONTHS_BALANCE']).unstack(fill_value=0)

In [None]:
# Number of months each customer paid the loan: the per-ID sum of the binary
# STATUS flag. (The months not paid are derived from it in a later cell.)

# ``.sum()`` is used instead of ``.agg(sum)``: handing the Python builtin to
# ``agg`` is deprecated in recent pandas releases.
fea_new = rec.groupby('ID').sum()
pay = fea_new[['STATUS']]


In [None]:
pay

In [None]:
# Number of MONTHS_BALANCE records per customer, with ID as a regular column.
all_months = rec.groupby('ID')['MONTHS_BALANCE'].count().reset_index()

In [None]:
# Months not paid per customer: total recorded months minus paid months.
# Both sides are matched on customer ID before subtracting. A nested loop that
# breaks after its first inner iteration would subtract the first customer's
# paid-month count from every total.
months_by_id = all_months.set_index('ID')['MONTHS_BALANCE']
not_pay = (months_by_id - pay['STATUS'].reindex(months_by_id.index)).tolist()


In [None]:
# One-column frame of the unpaid-month counts, indexed by customer ID.
not_pay_ = pd.DataFrame({'Notpaying_loan': not_pay}).set_index(all_months['ID'])
not_pay_

In [None]:
# Attach the credit-record features to the application data by customer ID.
for credit_frame in (not_pay_, pay, account_len, account_start):
    data = data.merge(credit_frame, how='inner', on='ID')

In [None]:
data.rename(columns={'STATUS' : 'pay_loan' , 'max' : 'account_len' , 'min':'account_start' } , inplace=True)

In [None]:
data.head()

# Renames a column labelled with the integer 0 to 'year_employee'.
# NOTE(review): no visible step creates a column labelled 0; ``rename`` ignores
# missing labels by default, so this is presumably a no-op — confirm or remove.
data.rename(columns = {0:'year_employee'}, inplace = True)


### Check Outliers 

In [None]:
# Histogram of total income with a KDE overlay. ``histplot`` replaces the
# deprecated ``distplot``; ``stat='density'`` keeps the density-scaled y-axis.
sns.histplot(x=data['AMT_INCOME_TOTAL'], kde=True, stat='density');

In [None]:
# Horizontal box plot of total income. The variable is passed as ``x=``
# because the meaning of the first positional argument differs between
# seaborn releases.
sns.boxplot(x=data['AMT_INCOME_TOTAL']);

In [None]:
# Outlier bounds from the IQR rule, used because the distribution is skewed.

income = data['AMT_INCOME_TOTAL']
q1 = income.quantile(0.25)
q3 = income.quantile(0.75)
iqr = q3 - q1

# Whiskers sit 1.5 IQRs beyond the quartiles; the lower one is floored at 0.
lower_whisker = max(q1 - 1.5 * iqr, 0)
upper_whisker = q3 + 1.5 * iqr
upper_whisker , lower_whisker

In [None]:
# Rows whose total income lies outside the IQR whiskers.
filt2 = data['AMT_INCOME_TOTAL'] >  upper_whisker
filt3 = data['AMT_INCOME_TOTAL'] <  lower_whisker

out2 = data[filt2].index
out3 = data[filt3].index

# Drop outliers on both sides. Dropping only ``out3`` removes nothing in
# practice, because the lower whisker is floored at 0 while the upper
# outliers in ``out2`` stay in the data.
data.drop(out2.union(out3) , axis = 0 , inplace =True)

In [None]:
data.info()

In [None]:
data.set_index('ID' , inplace=True)

In [None]:
data['target'].value_counts()

## Feature Scaling  

In [None]:
# Standardise each numeric column on its own; the single scaler is re-fitted
# for every column, in the order listed.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split performed further down, so test rows influence the scaling.
scl = StandardScaler()
scaled_columns = [
    'AMT_INCOME_TOTAL',
    'CNT_CHILDREN',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
    'Person_income',
    'employee_Month',
    'employee_year',
    'Age',
    'Notpaying_loan',
    'pay_loan',
    'account_len',
]
for column_name in scaled_columns:
    column_values = np.array(data[column_name]).reshape(-1, 1)
    data[column_name] = scl.fit_transform(column_values)



## Encoding 

#### One Hot Encoding

In [None]:
# Replace the listed columns with one indicator column per category.
one_hot_columns = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
data = pd.get_dummies(data, columns=one_hot_columns)

#### Label Encoding

In [None]:

# Integer-encode the remaining categorical columns. The encoder is re-fitted
# for each column, so only the mapping of the last column stays on ``lb``.
lb = LabelEncoder()
col = [
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
]
for feature_name in col:
    data[feature_name] = lb.fit_transform(data[feature_name])

# Splitting Data 

In [None]:

# Target vector and feature matrix (every column except 'target').
y = data['target']
x = data.drop(columns='target')

In [None]:
x.info()

## Feature Selection 

For numerical data we use the ANOVA F-test for feature selection.

In [None]:
# Keep the 15 features with the highest ANOVA F-score against the target.
select = SelectKBest(score_func=f_classif, k=15)
select.fit(x, y)
select_up = select.transform(x)

# F-statistics and p-values for every candidate feature, rounded to 4 places.
f_value = np.round(select.scores_, 4)
p_value = np.round(select.pvalues_, 4)

# Boolean mask and positional indices of the retained features.
select_feat = select.get_support()
select_inde = select.get_support(indices=True)
select_inde


In [None]:
# ``select_inde`` holds column positions within the frame the selector was
# fitted on (``data`` without 'target'), so the lookup is done on that frame.
# Indexing ``data`` directly would shift every position after 'target' by one
# and could pull the label itself into ``x``.
features = data.drop('target' , axis =1)
x = features.iloc[:, select_inde]
y = data['target']


print('Selected Features : \n\n' , x.columns)

# ``x`` stays the selected subset: re-assigning it to all columns at this
# point would discard the feature selection before the train/test split.

In [None]:
from imblearn.combine import SMOTETomek

# Stratified 70/30 split with a fixed seed.
split_options = {
    'test_size': 0.3,
    'random_state': 42,
    'shuffle': True,
    'stratify': data['target'],
}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# Resample the training split only, with combined SMOTE + Tomek links.
smt = SMOTETomek(random_state=42)
X_res, y_res = smt.fit_resample(x_train, y_train)

In [None]:
from imblearn.under_sampling import TomekLinks

# Oversample the training split with SMOTE. The fixed seed makes the
# synthetic samples reproducible from run to run.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(x_train, y_train)

# Tomek-link cleaning of the oversampled data, kept under separate names.
# ``tl`` now refers to the TomekLinks sampler only, not to SMOTE first.
tl = TomekLinks()
X_res_2, y_res_2 = tl.fit_resample(X_res, y_res)

In [None]:
# Accumulators for model names and their train / test scores.
model = []
pre_train, rec_train, f1_train, spe_train = [], [], [], []
pre_test, rec_test, f1_test, spe_test = [], [], [], []


# KNN

In [None]:
# k-nearest-neighbours classifier fitted on the resampled training data.
# NOTE(review): the variable is named ``knn5`` but ``n_neighbors`` is 6 —
# confirm which neighbourhood size is intended.
knn5 = KNeighborsClassifier(n_neighbors = 6)
knn5.fit(X_res, y_res)

In [None]:

# Score the model on the resampled training data.
y_pred = knn5.predict(X_res)

# scikit-learn metrics take ``(y_true, y_pred)`` in that order. With the
# arguments swapped, precision and recall are reported under each other's
# name and the printed confusion matrix is transposed.
train_cm = confusion_matrix(y_res, y_pred)
print(train_cm)

tn, fp, fn, tp = train_cm.ravel()

# Specificity = true negatives / all actual negatives.
specificity_tra = round(tn / (tn+fp) , 4)
acc_tra = round(accuracy_score(y_res, y_pred), 4)
rec_tra = round(recall_score(y_res, y_pred), 4)
pre_tra = round(precision_score(y_res, y_pred), 4)
f1_tra = round(f1_score(y_res, y_pred), 4)
print("accuracy_score : " , acc_tra)
print("recall_score : " ,rec_tra)
print("precision_score : ",pre_tra)
print("f1_score : " ,f1_tra)
print("specificity :", specificity_tra)

# Record the training scores for the final comparison table.
model.append('KNN')
pre_train.append(pre_tra)
rec_train.append(rec_tra)
f1_train.append(f1_tra)
spe_train.append(specificity_tra)



In [None]:

# Score the model on the held-out test split.
y_pred = knn5.predict(x_test)

# scikit-learn metrics take ``(y_true, y_pred)`` in that order. With the
# arguments swapped, precision and recall are reported under each other's
# name and the printed confusion matrix is transposed.
test_cm = confusion_matrix(y_test, y_pred)
print(test_cm)

tn, fp, fn, tp = test_cm.ravel()

# Specificity = true negatives / all actual negatives.
specificity_tes = round(tn / (tn+fp) , 4)
acc_tes = round(accuracy_score(y_test, y_pred), 4)
rec_tes = round(recall_score(y_test, y_pred), 4)
pre_tes = round(precision_score(y_test, y_pred), 4)
f1_tes = round(f1_score(y_test, y_pred), 4)
print("accuracy_score : " , acc_tes)
print("recall_score : " ,rec_tes)
print("precision_score : ",pre_tes)
print("f1_score : " ,f1_tes)
print("specificity :", specificity_tes)

# Record the test scores for the final comparison table.
pre_test.append(pre_tes)
rec_test.append(rec_tes)
f1_test.append(f1_tes)
spe_test.append(specificity_tes)

In [None]:
# Collect the train / test score lists per metric into one table with a
# 'Train' row and a 'Test' row.
metric_names = ('precision_score', 'recall_score', 'f1_score', 'specificity')
train_scores = (pre_train, rec_train, f1_train, spe_train)
test_scores = (pre_test, rec_test, f1_test, spe_test)

history = {
    metric: [train_part, test_part]
    for metric, train_part, test_part in zip(metric_names, train_scores, test_scores)
}

classification_report = pd.DataFrame(history , index=['Train' , 'Test'])

In [None]:
classification_report