## Importing Packages ##

In [31]:
import openpyxl
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns


## Importing Data ##

In [32]:
application_details = pd.read_csv('application_record.csv')
credit_record = pd.read_csv('credit_record.csv')

In [33]:
application_details.columns

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],
      dtype='object')

In [35]:
def occupation_by_education_by_gender(occupations, educations, genders, length):
    Gender = []
    Education = []
    Occupation = []
    Percentage = []
    for gender in genders:
        for education in educations:
            for occupation in occupations:
                Gender.append(gender)
                Education.append(education)
                Occupation.append(occupation)
                temp_df_1=application_details.loc[application_details['OCCUPATION_TYPE'] == occupation]
                temp_df_2=temp_df_1.loc[temp_df_1['CODE_GENDER'] == gender]
                temp_df_3=temp_df_2.loc[temp_df_2['NAME_EDUCATION_TYPE'] == education]
                length_filter = len(temp_df_3)
                Percentage.append(length_filter/length)
    percent_dict={
        'Gender': Gender,
        'Education': Education,
        'Occupation': Occupation,
        'Popularity Percentage': Percentage
    }
    df = pd.DataFrame(percent_dict)
    df.to_excel('Occupation_popularity_index.xlsx', index=False)
                
                
        

In [36]:
occupation_by_education_by_gender(list(application_details['OCCUPATION_TYPE'].unique()), list(application_details['NAME_EDUCATION_TYPE'].unique()), list(application_details['CODE_GENDER'].unique()), len(application_details))

In [37]:
occupation_popularity = pd.read_excel('Occupation_popularity_index.xlsx')

### Replacing blank 'OCCUPATION_TYPE' with most popular occupation ###

In [38]:
occupation_popularity['Tag'] = occupation_popularity['Gender']+ '_' + occupation_popularity['Education']

### Creating 'Tag' vs. popularity dictionary

In [39]:
def Dict(data):
    tag_list=list(data['Tag'].unique())
    popular_occupation = []
    data_grouped = data.groupby('Tag')
    for tag in tag_list:
        grouped_data = data_grouped.get_group(tag)
        popular_occupation.append(grouped_data.loc[grouped_data['Popularity Percentage'] == max(list(grouped_data['Popularity Percentage']))]['Occupation'].tolist()[0])
    Tag_dict={
        'Tag': tag_list,
        'Occupation': popular_occupation
    }
    return Tag_dict

In [40]:
dict_df = Dict(occupation_popularity)

In [41]:
pd.DataFrame(dict_df).to_excel('Tag_vs_Occupation.xlsx', index=False)

In [42]:
tag_data = pd.read_excel('Tag_vs_Occupation.xlsx')

In [11]:
tag_data

Unnamed: 0,Tag,Occupation
0,M_Higher education,Managers
1,M_Secondary / secondary special,Laborers
2,M_Incomplete higher,Laborers
3,M_Lower secondary,Laborers
4,M_Academic degree,Core staff
5,F_Higher education,Core staff
6,F_Secondary / secondary special,Laborers
7,F_Incomplete higher,Core staff
8,F_Lower secondary,Laborers
9,F_Academic degree,Managers


In [43]:
tag_g = list(tag_data['Tag'])
occu_g = list(tag_data['Occupation'])
tag_dict = {}
for index in range(len(tag_g)):
    tag_dict[tag_g[index]] = occu_g[index]

In [44]:
tag_dict

{'M_Higher education': 'Managers',
 'M_Secondary / secondary special': 'Laborers',
 'M_Incomplete higher': 'Laborers',
 'M_Lower secondary': 'Laborers',
 'M_Academic degree': 'Core staff',
 'F_Higher education': 'Core staff',
 'F_Secondary / secondary special': 'Laborers',
 'F_Incomplete higher': 'Core staff',
 'F_Lower secondary': 'Laborers',
 'F_Academic degree': 'Managers'}

In [45]:
application_details['Tag'] = application_details['CODE_GENDER']+ '_' + application_details['NAME_EDUCATION_TYPE']

In [46]:
def Replace_nan_occupation(df):
    for key in tag_dict:
        if df['Tag'] == key and pd.isnull(df['OCCUPATION_TYPE']):
            df['OCCUPATION_TYPE'] = tag_dict[key]
    return df

In [47]:
len(application_details)

438557

In [48]:
application_details = application_details.apply(Replace_nan_occupation, axis=1)

In [49]:
application_details.columns

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'Tag'],
      dtype='object')

## Converting Days format features to readable format ##

In [50]:
application_details['AGE'] = np.ceil(pd.to_timedelta(application_details['DAYS_BIRTH'], unit='D').dt.days / -365.25)

In [51]:
application_details.drop('DAYS_BIRTH', axis=1, inplace=True)

In [52]:
application_details.loc[(application_details['DAYS_EMPLOYED'] > 0), 'DAYS_EMPLOYED'] = 0
application_details['YEARS_EMPLOYED'] = np.ceil(pd.to_timedelta(application_details['DAYS_EMPLOYED'], unit='D').dt.days / -365.25)

In [53]:
application_details.drop(['DAYS_EMPLOYED', 'Tag'], axis=1, inplace=True)

### Final Columns ###

In [54]:
application_details.columns

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'AGE', 'YEARS_EMPLOYED'],
      dtype='object')

## Encoding Categorical Features to Numerical Values ##

#### Apllication_details data ###

In [55]:
def Cat_to_Num(features):
    for feature in features:
        feature_list = list(np.unique(application_details[feature]))
        feature_dict = {}
        for i in range(len(feature_list)):
                       feature_dict[feature_list[i]] = i
        application_details.replace({feature : feature_dict}, inplace=True)
        print(feature, '-->', feature_dict)

In [56]:
categorical_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS', 'OCCUPATION_TYPE', 'NAME_HOUSING_TYPE', 'NAME_EDUCATION_TYPE']
Cat_to_Num(categorical_features)

CODE_GENDER --> {'F': 0, 'M': 1}
FLAG_OWN_CAR --> {'N': 0, 'Y': 1}
FLAG_OWN_REALTY --> {'N': 0, 'Y': 1}
NAME_INCOME_TYPE --> {'Commercial associate': 0, 'Pensioner': 1, 'State servant': 2, 'Student': 3, 'Working': 4}
NAME_FAMILY_STATUS --> {'Civil marriage': 0, 'Married': 1, 'Separated': 2, 'Single / not married': 3, 'Widow': 4}
OCCUPATION_TYPE --> {'Accountants': 0, 'Cleaning staff': 1, 'Cooking staff': 2, 'Core staff': 3, 'Drivers': 4, 'HR staff': 5, 'High skill tech staff': 6, 'IT staff': 7, 'Laborers': 8, 'Low-skill Laborers': 9, 'Managers': 10, 'Medicine staff': 11, 'Private service staff': 12, 'Realty agents': 13, 'Sales staff': 14, 'Secretaries': 15, 'Security staff': 16, 'Waiters/barmen staff': 17}
NAME_HOUSING_TYPE --> {'Co-op apartment': 0, 'House / apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Rented apartment': 4, 'With parents': 5}
NAME_EDUCATION_TYPE --> {'Academic degree': 0, 'Higher education': 1, 'Incomplete higher': 2, 'Lower secondary': 3, 'Seconda

In [28]:
application_details.head(10)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED
0,5008804,1,1,1,0,427500.0,4,1,0,4,1,1,0,0,10,2.0,33.0,13.0
1,5008805,1,1,1,0,427500.0,4,1,0,4,1,1,0,0,10,2.0,33.0,13.0
2,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0
3,5008808,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
4,5008809,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
5,5008810,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
6,5008811,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
7,5008812,0,0,1,0,283500.0,1,1,2,1,1,0,0,0,3,1.0,62.0,-0.0
8,5008813,0,0,1,0,283500.0,1,1,2,1,1,0,0,0,3,1.0,62.0,-0.0
9,5008814,0,0,1,0,283500.0,1,1,2,1,1,0,0,0,3,1.0,62.0,-0.0


### Credit_record data ###

In [57]:
convert_to = {'C' : 'Good_Debt', 'X' : 'Good_Debt', '0' : 'Good_Debt', '1' : 'Neutral_Debt', '2' : 'Neutral_Debt', '3' : 'Bad_Debt', '4' : 'Bad_Debt', '5' : 'Bad_Debt'}
credit_record.replace({'STATUS' : convert_to}, inplace=True)

In [27]:
credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,Good_Debt
1,5001711,-1,Good_Debt
2,5001711,-2,Good_Debt
3,5001711,-3,Good_Debt
4,5001712,0,Good_Debt
...,...,...,...
1048570,5150487,-25,Good_Debt
1048571,5150487,-26,Good_Debt
1048572,5150487,-27,Good_Debt
1048573,5150487,-28,Good_Debt


### Counting the number of debts ###

In [58]:
credit_record = credit_record.value_counts(subset=['ID', 'STATUS']).unstack(fill_value=0)

In [41]:
credit_record

STATUS,Bad_Debt,Good_Debt,Neutral_Debt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5001711,0,4,0
5001712,0,19,0
5001713,0,22,0
5001714,0,15,0
5001715,0,60,0
...,...,...,...
5150482,0,18,0
5150483,0,18,0
5150484,0,13,0
5150485,0,2,0


## Creating Credit Approval Status ##

In [59]:
credit_record.loc[(credit_record['Good_Debt'] > credit_record['Neutral_Debt']), 'CREDIT_APPROVAL_STATUS'] = 1
credit_record.loc[(credit_record['Good_Debt'] > credit_record['Bad_Debt']), 'CREDIT_APPROVAL_STATUS'] = 1
credit_record.loc[(credit_record['Neutral_Debt'] > credit_record['Good_Debt']), 'CREDIT_APPROVAL_STATUS'] = 0
credit_record.loc[(credit_record['Neutral_Debt'] > credit_record['Bad_Debt']), 'CREDIT_APPROVAL_STATUS'] = 1
credit_record.loc[(credit_record['Bad_Debt'] > credit_record['Good_Debt']), 'CREDIT_APPROVAL_STATUS'] = 0
credit_record.loc[(credit_record['Bad_Debt'] > credit_record['Neutral_Debt']), 'CREDIT_APPROVAL_STATUS'] = 0

In [32]:
credit_record

STATUS,Bad_Debt,Good_Debt,Neutral_Debt,CREDIT_APPROVAL_STATUS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5001711,0,4,0,1.0
5001712,0,19,0,1.0
5001713,0,22,0,1.0
5001714,0,15,0,1.0
5001715,0,60,0,1.0
...,...,...,...,...
5150482,0,18,0,1.0
5150483,0,18,0,1.0
5150484,0,13,0,1.0
5150485,0,2,0,1.0


In [60]:
np.unique(credit_record['CREDIT_APPROVAL_STATUS'])

array([0., 1.])

In [61]:
credit_record['CREDIT_APPROVAL_STATUS'] = credit_record['CREDIT_APPROVAL_STATUS'].astype('int')

In [35]:
credit_record

STATUS,Bad_Debt,Good_Debt,Neutral_Debt,CREDIT_APPROVAL_STATUS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5001711,0,4,0,1
5001712,0,19,0,1
5001713,0,22,0,1
5001714,0,15,0,1
5001715,0,60,0,1
...,...,...,...,...
5150482,0,18,0,1
5150483,0,18,0,1
5150484,0,13,0,1
5150485,0,2,0,1


In [62]:
credit_record.drop(['Bad_Debt', 'Good_Debt', 'Neutral_Debt'], axis=1, inplace=True)

In [48]:
credit_record

STATUS,CREDIT_APPROVAL_STATUS
ID,Unnamed: 1_level_1
5001711,1
5001712,1
5001713,1
5001714,1
5001715,1
...,...
5150482,1
5150483,1
5150484,1
5150485,1


# Merging Both application_details and credit_record data #

In [63]:
Final_Credit_data = application_details.merge(credit_record, how='inner', on=['ID'])

In [64]:
Final_Credit_data.head(10)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED,CREDIT_APPROVAL_STATUS
0,5008804,1,1,1,0,427500.0,4,1,0,4,1,1,0,0,10,2.0,33.0,13.0,1
1,5008805,1,1,1,0,427500.0,4,1,0,4,1,1,0,0,10,2.0,33.0,13.0,1
2,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,1
3,5008808,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0,1
4,5008809,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0,1
5,5008810,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0,1
6,5008811,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0,1
7,5008812,0,0,1,0,283500.0,1,1,2,1,1,0,0,0,3,1.0,62.0,-0.0,1
8,5008813,0,0,1,0,283500.0,1,1,2,1,1,0,0,0,3,1.0,62.0,-0.0,1
9,5008814,0,0,1,0,283500.0,1,1,2,1,1,0,0,0,3,1.0,62.0,-0.0,1


In [65]:
Final_Credit_data = Final_Credit_data[Final_Credit_data.CNT_CHILDREN != 19]
Final_Credit_data = Final_Credit_data[Final_Credit_data.CNT_CHILDREN != 7]
Final_Credit_data = Final_Credit_data[Final_Credit_data.CNT_CHILDREN != 14]
credit_approval_data = Final_Credit_data
credit_approval_data.columns

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'AGE', 'YEARS_EMPLOYED', 'CREDIT_APPROVAL_STATUS'],
      dtype='object')

In [66]:
import random 

credit_approval_data['AMT_INCOME_TOTAL'] = ((credit_approval_data['AMT_INCOME_TOTAL']*440)/12)/15
credit_approval_data['PREDICTED_TARGET_MONTH'] = credit_approval_data['CREDIT_APPROVAL_STATUS']*((credit_approval_data['AMT_INCOME_TOTAL']-(60000+30000*credit_approval_data['CNT_CHILDREN']))*0.4)
credit_approval_data = credit_approval_data.astype({'PREDICTED_TARGET_MONTH': int})

credit_approval_data = credit_approval_data.astype({'CNT_CHILDREN': str})
credit_approval_data = pd.get_dummies(credit_approval_data)
#placing the age of oldest child randomly from 0 to 17
credit_approval_data['CNT_CHILDREN_1'] = [x*random.randint(0,17) for x in credit_approval_data['CNT_CHILDREN_1']]
credit_approval_data['CNT_CHILDREN_2'] = [x*random.randint(0,17) for x in credit_approval_data['CNT_CHILDREN_2']]
credit_approval_data['CNT_CHILDREN_3'] = [x*random.randint(0,17) for x in credit_approval_data['CNT_CHILDREN_3']]
credit_approval_data['CNT_CHILDREN_4'] = [x*random.randint(0,17) for x in credit_approval_data['CNT_CHILDREN_4']]
credit_approval_data['CNT_CHILDREN_5'] = [x*random.randint(0,17) for x in credit_approval_data['CNT_CHILDREN_5']]
credit_approval_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,...,AGE,YEARS_EMPLOYED,CREDIT_APPROVAL_STATUS,PREDICTED_TARGET_MONTH,CNT_CHILDREN_0,CNT_CHILDREN_1,CNT_CHILDREN_2,CNT_CHILDREN_3,CNT_CHILDREN_4,CNT_CHILDREN_5
0,5008804,1,1,1,1045000.0,4,1,0,4,1,...,33.0,13.0,1,394000,1,0,0,0,0,0
1,5008805,1,1,1,1045000.0,4,1,0,4,1,...,33.0,13.0,1,394000,1,0,0,0,0,0
2,5008806,1,1,1,275000.0,4,4,1,1,1,...,59.0,4.0,1,86000,1,0,0,0,0,0
3,5008808,0,0,1,660000.0,0,4,3,1,1,...,53.0,9.0,1,240000,1,0,0,0,0,0
4,5008809,0,0,1,660000.0,0,4,3,1,1,...,53.0,9.0,1,240000,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,1,1,1,770000.0,4,4,1,1,1,...,48.0,7.0,0,0,1,0,0,0,0,0
36453,5149834,0,0,1,385000.0,0,1,1,1,1,...,34.0,4.0,0,0,1,0,0,0,0,0
36454,5149838,0,0,1,385000.0,1,1,1,1,1,...,34.0,4.0,1,130000,1,0,0,0,0,0
36455,5150049,0,0,1,693000.0,4,4,1,1,1,...,50.0,2.0,1,253200,1,0,0,0,0,0


# Splitting the credit_approval_data into training and testing sets #

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
X = credit_approval_data.drop(['CREDIT_APPROVAL_STATUS', 'ID', 'PREDICTED_TARGET_MONTH'], axis=1)
y = credit_approval_data['CREDIT_APPROVAL_STATUS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Training Model ##

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics 
from sklearn.metrics import accuracy_score

In [None]:
k_range = list(range(1, 40))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
plt.figure(figsize=(10,6))
plt.plot(k_range,k_scores,color = 'blue',linestyle='dashed', marker='o',markerfacecolor='red', markersize=10)
plt.title('Cross-Validated accuracy vs. K Value')
plt.xlabel('K value for kNN')
plt.ylabel('Cross-Validated Accuracy')
print("Maximum accuracy:-",max(k_scores),"at K =",k_scores.index(max(k_scores)))

In [None]:
knn = KNeighborsClassifier(n_neighbors=31, metric='minkowski', p=2)
score_knn = cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean()

In [None]:

DT_model = DecisionTreeClassifier()
DT_model.fit(X_train, y_train)
score_dt = cross_val_score(logmodel, X, y, cv=10, scoring='accuracy').mean()


L_model = LogisticRegression()
L_model.fit(X_train, y_train)
score_log = cross_val_score(logmodel, X, y, cv=10, scoring='accuracy').mean()


nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
score_nb = cross_val_score(logmodel, X, y, cv=10, scoring='accuracy').mean()


rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
score_rf = cross_val_score(logmodel, X, y, cv=10, scoring='accuracy').mean()


svm_mod = SVC(kernel='linear', probability=True)
svm_mod.fit(X_train, y_train)
score_svm = cross_val_score(logmodel, X, y, cv=10, scoring='accuracy').mean()


pred_svm = svm_mod.predict(X_test)
pred_log = L_model.predict(X_test)
pred_nb = nb_model.predict(X_test)
tree_pred = DT_model.predict(X_test)
pred_knn = knn.predict(X_test)
pred_rf = rf_model.predict(X_test)


## Accuracy Score ##

In [None]:
#Precision 
logm_precision = metrics.precision_score(y_test, pred_log, average='micro')
nb_precision = metrics.precision_score(y_test, pred_nb, average='micro')
svc_precision = metrics.precision_score(y_test, pred_svm, average='micro')
knn_precision = metrics.precision_score(y_test, pred_knn, average='micro')
dt_precision = metrics.precision_score(y_test, tree_pred, average='micro')
rf_precision = metrics.precision_score(y_test, pred_rf, average='micro')

#Recall 
logm_recall = metrics.recall_score(y_test, pred_log, average='micro')
nb_recall = metrics.recall_score(y_test, pred_nb, average='micro')
svc_recall = metrics.recall_score(y_test, pred_svm, average='micro')
knn_recall = metrics.recall_score(y_test, pred_knn, average='micro')
dt_recall = metrics.recall_score(y_test, tree_pred, average='micro')
rf_recall = metrics.recall_score(y_test, pred_rf, average='micro')

#F1-measure
logm_f1 = metrics.fbeta_score(y_test, pred_log, beta=1.0, average='micro')
nb_f1 = metrics.fbeta_score(y_test, pred_nb, beta=1.0, average='micro')
svc_f1 = metrics.fbeta_score(y_test, pred_svm, beta=1.0, average='micro')
knn_f1 = metrics.fbeta_score(y_test, pred_knn, beta=1.0, average='micro')
dt_f1 = metrics.fbeta_score(y_test, tree_pred, beta=1.0, average='micro')
rf_f1 = metrics.fbeta_score(y_test, pred_rf, beta=1.0, average='micro')

#compare models accross classification metrics
Model_Comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Naive Bayes', 'Support Vector Machine', 'K-Nearest Neighbor', 
              'Decision Tree', 'Random Forest'],
    'Accuracy': [score_log, score_nb, score_svm, score_knn, score_dt, score_rf],
    'Precision': [logm_precision,nb_precision, svc_precision, knn_precision, dt_precision, rf_precision],
    'Recall': [logm_recall, nb_recall, svc_recall, knn_recall, dt_recall, rf_recall],
    'F1-meas.': [logm_f1, nb_f1, svc_f1, knn_f1, dt_f1, rf_f1]})
Model_Comparison.round(4)

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

conf_mat = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_mat/np.sum(conf_mat), annot=True, fmt='.2%', cmap='Blues')
ax.xaxis.set_ticklabels(['bad', 'good'])
ax.yaxis.set_ticklabels(['bad', 'good'])
plt.show()

## Saving Model ##

In [181]:
import pickle

In [475]:
with open('danetka.pckl', 'wb') as f:
    pickle.dump(DT_model, f)

## Linear Regression

In [477]:
X = credit_approval_data.drop(['PREDICTED_TARGET_MONTH', 'ID', 'CREDIT_APPROVAL_STATUS'], axis=1)
y = credit_approval_data['PREDICTED_TARGET_MONTH']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [479]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lin_reg = LinearRegression()
lin_reg.fit(X_test, y_test)

y_pred_lin = lin_reg.predict(X_test)
lin_reg.score(X_test, y_test)

0.986512033119876

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
r2_score(y_test, y_pred_lin)

In [480]:
file_reg = open('lin_reg.pckl', 'wb')
pickle.dump(lin_reg, file_reg)
file_reg.close()

In [482]:
credit_approval_data = credit_approval_data.astype({'ID': str})
credit_approval_data['ID'] = [str(random.randint(10000, 99999)) + x for x in credit_approval_data['ID']]
to_save = credit_approval_data.drop(['CREDIT_APPROVAL_STATUS', 'PREDICTED_TARGET_MONTH'], axis=1)
to_save.to_csv(r'hackathon_db.csv', index = False)