## Init Libraries


In [None]:
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Init Dataset

In [None]:
# Load the German credit dataset; the path is relative to the notebook's
# working directory.
# NOTE(review): later cells select columns by name (e.g. 'credit_status'), so
# the file is assumed to start with a header row — confirm.
dataset_path = '../data/german-credit.data'
data = pd.read_csv(dataset_path)

## Visualizations

### lower credit amounts have better credit results

In [None]:
import matplotlib.pyplot as plt

# Split the rows by outcome class: credit_status 1 is plotted as "good",
# credit_status 2 as "bad".
credit_status_good = data.loc[data['credit_status'] == 1]
credit_status_bad = data.loc[data['credit_status'] == 2]

# Mean credit amount of each class, rounded to two decimals for display.
values = [
    round(subset['credit_amount'].mean(), 2)
    for subset in (credit_status_good, credit_status_bad)
]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(['Credit Status Good', 'Credit Status Bad'], values)

# Print each bar's value just above its top-left corner.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x() * 1.005, bar_height * 1.005))

plt.ylabel('Credit Amount')
plt.title('credit status by credit amount')
plt.show()

### What men and women spend their money on

In [None]:
import matplotlib.pyplot as plt

# Rows whose personal_status code is A92 or A95 — the codes this notebook
# treats as female applicants.
female_codes = ['A92', 'A95']
female_data = data.loc[data['personal_status'].isin(female_codes)]

# Number of credits per purpose code, most frequent first.
female_purpose_counts = female_data.purpose.value_counts()

plt.figure(figsize=(9, 9))
plt.title('what women buy')
plt.pie(
    female_purpose_counts.tolist(),
    labels=female_purpose_counts.index.tolist(),
    autopct='%1.2f%%',
    textprops={'fontsize': 8},
)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Rows whose personal_status code is A91, A93 or A94 — the codes this notebook
# treats as male applicants.
male_codes = ['A91', 'A93', 'A94']
male_data = data.loc[data['personal_status'].isin(male_codes)]

# Number of credits per purpose code, most frequent first.
male_purpose_counts = male_data.purpose.value_counts()

plt.figure(figsize=(9, 9))
plt.title('what men buy')
plt.pie(
    male_purpose_counts.tolist(),
    labels=male_purpose_counts.index.tolist(),
    autopct='%1.2f%%',
    textprops={'fontsize': 8},
)
plt.show()

### Effect of Employment Time on Credit Status

In [None]:
import matplotlib.pyplot as plt

# employment_years codes in plotting order, each paired with its bar label
# (durations are in years).
employment_groups = [
    ('A71', 'unemployed'),
    ('A72', '<1'),
    ('A73', '1<y<4'),
    ('A74', '4<y<7'),
    ('A75', '>7'),
]

# Percentage of rows with credit_status == 1 inside each employment group.
values = []
for employment_code, _ in employment_groups:
    in_group = data['employment_years'] == employment_code
    group_good = data.loc[in_group & (data['credit_status'] == 1)].shape[0]
    group_total = data.loc[in_group].shape[0]
    values.append(round(group_good / group_total * 100, 2))

labels = [group_label for _, group_label in employment_groups]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
plt.ylim(0, 100)  # fixed percentage scale
ax.bar(labels, values)

# Print each bar's percentage just above its top-left corner.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height) + "%", (bar.get_x() * 1.005, bar_height * 1.005))

plt.title('credit score by employment time')
plt.show()


### Divorcing has a significant impact on credit status

In [None]:
import matplotlib.pyplot as plt

# Share of "good" outcomes (credit_status == 1) among male applicants, split
# by marital status. Each personal_status code is stored next to its bar
# label so the plotted values and labels cannot drift out of order (the values
# used to be appended as divorced/married/single while the labels read
# divorced/single/married, which swapped two bars).
male_status_groups = [
    ('A91', 'divorced male'),
    ('A93', 'single male'),
    ('A94', 'married male'),
]

personal_status_values = []
labels = []
for status_code, status_label in male_status_groups:
    in_group = data['personal_status'] == status_code
    group_good = data.loc[in_group & (data['credit_status'] == 1)].shape[0]
    group_total = data.loc[in_group].shape[0]
    personal_status_values.append(round(group_good / group_total * 100, 2))
    labels.append(status_label)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
plt.ylim(0, 100)  # fixed percentage scale
ax.bar(labels, personal_status_values)

# Print each bar's percentage just above its top-left corner.
for p in ax.patches:
    ax.annotate(str(p.get_height()) + "%", (p.get_x() * 1.005, p.get_height() * 1.005))

plt.title('credit score percentage by personal status')
plt.show()

### credit status for used cars is better

In [None]:
import matplotlib.pyplot as plt

# purpose codes compared in this chart, each paired with its bar label.
car_groups = [
    ('A40', 'new_car'),
    ('A41', 'used_car'),
]

# Percentage of rows with credit_status == 1 for each car purpose.
values = []
for purpose_code, _ in car_groups:
    has_purpose = data['purpose'] == purpose_code
    purpose_good = data.loc[has_purpose & (data['credit_status'] == 1)].shape[0]
    purpose_total = data.loc[has_purpose].shape[0]
    values.append(round(purpose_good / purpose_total * 100, 2))

labels = [car_label for _, car_label in car_groups]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
plt.ylim(0, 100)  # fixed percentage scale
ax.bar(labels, values)

# Print each bar's percentage just above its top-left corner.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height) + "%", (bar.get_x() * 1.005, bar_height * 1.005))

plt.title('credit score percentage by car type')
plt.show()

## Prediction

### get dummies

In [None]:
# The estimators used below need numeric inputs, so every categorical column
# is expanded into one indicator column per category value.
categorical_columns = [
    'checking_account_status',
    'credit_history',
    'purpose',
    'savings_account_status',
    'employment_years',
    'personal_status',
    'guarantors',
    'property',
    'installment_plans',
    'housing',
    'job',
    'telephone',
    'foreign_worker',
]

categorized = pd.get_dummies(data, columns=categorical_columns)

### split test and train data

In [None]:
# Hold out 20% of the rows for evaluation; the split is random on every run.
train, test = train_test_split(categorized, test_size=0.2)

# Features are every column except the target. Each frame drops the column
# from itself instead of reusing a mask built from the other frame's columns.
attributes = train.drop(columns='credit_status')
attributes_test = test.drop(columns='credit_status')

# Keep the target as a 1-D Series: scikit-learn estimators expect a 1-D y and
# emit a DataConversionWarning for a single-column DataFrame, and a regressor
# fitted on a column vector also returns column-vector predictions.
credit_status = train['credit_status']
credit_status_test = test['credit_status']

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier for credit_status on the training
# split, then measure its accuracy on the held-out split.
# max_iter is set very high, presumably so the solver has room to converge on
# this feature set; n_jobs=-1 requests all available cores.
logistic_regression = LogisticRegression(max_iter=20000, n_jobs=-1)
logistic_regression.fit(attributes, credit_status)

pred_logistic = logistic_regression.predict(attributes_test)
log_score = accuracy_score(pred_logistic.round(), credit_status_test)

log_score_text = "{:.2f}%".format(log_score * 100)
print("Credit Status Prediction Using Logistic Regression: ", log_score_text)

### linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the class labels. Its predictions are
# continuous, so they are rounded to the nearest integer before being compared
# with the true labels for an accuracy figure.
linear_regression = LinearRegression()
linear_regression.fit(attributes, credit_status)

pred_linear = linear_regression.predict(attributes_test)
lin_score = accuracy_score(pred_linear.round(), credit_status_test)

lin_score_text = "{:.2f}%".format(lin_score * 100)
print("Credit Status Prediction Using Linear Regression: ", lin_score_text)

### GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier: fit on the training split, then measure
# accuracy on the held-out split. fit() returns the fitted estimator itself.
NBC = GaussianNB().fit(attributes, credit_status)

pred_gauss = NBC.predict(attributes_test)
gaus_score = accuracy_score(pred_gauss.round(), credit_status_test)

gaus_score_text = "{:.2f}%".format(gaus_score * 100)
print("Credit Status Prediction Using GaussianNB: ", gaus_score_text)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that only splits nodes holding at least 3 samples: fit on the
# training split, then measure accuracy on the held-out split.
dt3 = DecisionTreeClassifier(min_samples_split=3).fit(attributes, credit_status)

pred_desicion_tree = dt3.predict(attributes_test)
des_score = accuracy_score(pred_desicion_tree.round(), credit_status_test)

des_score_text = "{:.2f}%".format(des_score * 100)
print("Credit Status Prediction Using Desicion Tree Classifier: ", des_score_text)

### MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier: fit on the training split, then measure
# accuracy on the held-out split.
MNB = MultinomialNB()
MNB.fit(attributes, credit_status)
pred_MNB = MNB.predict(attributes_test)

# Score this model's own predictions; scoring pred_gauss here would simply
# repeat the GaussianNB accuracy.
mult_score = accuracy_score(pred_MNB.round(), credit_status_test)
print("Credit Status Prediction Using Multinominal: ", "{:.2f}%".format(mult_score * 100))

### combined

In [None]:
# Weighted majority vote across the individual models.
#
# A model takes part only if its held-out accuracy exceeds `threshold`. Each
# participating model adds (accuracy * 10) ** 2 to the side it predicts for a
# sample, so more accurate models carry more weight.
# NOTE(review): the decision tree (des_score / pred_desicion_tree) is not part
# of the vote — confirm whether that is intentional.
threshold = 0.7

# Linear regression emits continuous values (as a column vector when it was
# fitted on a single-column target), so round and flatten them into class
# labels before comparing with 1. Comparing the raw floats would make this
# model vote "bad" for practically every sample.
linear_labels = pred_linear.round().ravel()

voters = [
    (gaus_score, pred_gauss),
    (lin_score, linear_labels),
    (log_score, pred_logistic),
    (mult_score, pred_MNB),
]
# Keep only the models above the threshold, each with its vote weight computed
# once. Every model uses the same weight for "good" and "bad" votes.
weighted_voters = [
    ((score * 10) ** 2, predictions)
    for score, predictions in voters
    if score > threshold
]

pred_combined = []
for i, logistic_label in enumerate(pred_logistic):
    credit_good = 0
    credit_bad = 0
    for weight, predictions in weighted_voters:
        # Any label other than 1 counts as "bad"; rounded regression output
        # can fall outside {1, 2}.
        if predictions[i] == 1:
            credit_good += weight
        else:
            credit_bad += weight

    if credit_good == credit_bad:
        # Tie, or no model cleared the threshold: defer to logistic regression.
        pred_combined.append(logistic_label)
    elif credit_good > credit_bad:
        pred_combined.append(1)
    else:
        pred_combined.append(2)

combined_score = accuracy_score(pred_combined, credit_status_test)
print("Credit Status Prediction Using Combined Vote: ", "{:.2f}%".format(combined_score * 100))

In [None]:
import matplotlib.pyplot as plt

# Accuracy of every model (as a fraction), keyed by the label shown on the
# chart; insertion order is the order the bars are passed to barh.
score_by_algorithm = {
    'Logistic Regression': log_score,
    'linear regression': lin_score,
    'GaussianNB': gaus_score,
    'Desicion Tree Classifier': des_score,
    'MultinominalNB': mult_score,
    'combined score': combined_score,
}
Product = list(score_by_algorithm)
Quantity = [score * 100 for score in score_by_algorithm.values()]

plt.xlim(0, 100)  # fixed percentage scale
plt.barh(Product, Quantity)
plt.xlabel('score')
plt.ylabel('Algorithms')
plt.title('Score of Algorithms')

plt.show()