In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
!pip install plotly
import plotly.offline as py 
import plotly.graph_objs as go
import plotly.express as px
from collections import Counter  
from subprocess import call
from IPython.display import Image
############################################################################################
%matplotlib inline 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# Load the raw German credit data set and report how many rows were read.
credit = pd.read_csv('german_credit_data.csv')
n_records = len(credit)
print(f"The dataset is {n_records} credit record")

The dataset is 1000 credit record


## Check data structure

In [4]:
# Preview the first two rows to see the column names and sample values.
credit.head(2)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad


In [5]:
# Drop the index-like column that read_csv labelled "Unnamed: ...".
# Selecting it by name (instead of slicing off "whatever is first") keeps this
# cell idempotent: re-running it no longer removes a real feature column.
index_like_cols = [col for col in credit.columns if str(col).startswith('Unnamed')]
credit = credit.drop(columns=index_like_cols)

In [6]:
# Print column dtypes and non-null counts to spot columns with missing values.
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.2+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
credit.describe()

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


## Descriptive analysis

In [26]:
# Number of records per value of the Sex column.
credit['Sex'].value_counts()

male      690
female    310
Name: Sex, dtype: int64

In [27]:
# Box plot of Age for each Sex, with every individual record drawn as a point.
SA = credit[['Sex', 'Age']]

sex_age_title = dict(
    text="Sex Vs Age Cross tabulation",
    x=.5,
    y=.95,
    xanchor='center',
    yanchor='top',
)

fig = px.box(SA, x="Sex", y="Age", color="Sex", points="all")
fig.update_layout(title=sex_age_title, xaxis_title="Sex", yaxis_title="Age")
fig.show()
# Optional export of the figure to a standalone HTML file (left disabled):
#   import plotly.io as pio
#   pio.write_html(fig, file='SA.html', auto_open=True)

In [28]:
# Box plot of Credit amount for each Sex, with every record drawn as a point.
SC = credit.loc[:, ['Sex', 'Credit amount']]
fig = px.box(SC, x="Sex", y="Credit amount", points="all", color="Sex")
fig.update_traces(quartilemethod="exclusive")  # or "inclusive", or "linear" by default
fig.update_layout(
    title={
        'text': "Sex Vs Credit Amount Cross tabulation",
        'y': .95,
        'x': .5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis_title="Sex",
    # The y axis shows the credit amount; it was previously mislabelled "Age".
    yaxis_title="Credit amount",
)
fig.show()

In [29]:
# Count of records per loan purpose, one coloured bar per purpose.
# `Purpose` is kept as a notebook-level name; the histogram itself reads the
# column straight from `credit`.
Purpose = credit['Purpose']

purpose_title = dict(
    text="Purpose breakdown",
    x=.5,
    y=.95,
    xanchor='center',
    yanchor='top',
)

fig = px.histogram(credit, x="Purpose", color="Purpose")
fig.update_layout(title=purpose_title)
fig.show()

In [30]:
# Box plot of Credit amount for each loan Purpose.
SC = credit[['Purpose', 'Credit amount']]

purpose_amount_title = dict(
    text="Purpose Vs Credit Amount Cross tabulation",
    x=.5,
    y=.95,
    xanchor='center',
    yanchor='top',
)

fig = px.box(SC, x="Purpose", y="Credit amount", color="Purpose")
fig.update_traces(quartilemethod="exclusive")  # alternatives: "inclusive", or "linear" (the default)
fig.update_layout(
    title=purpose_amount_title,
    xaxis_title="Purpose",
    yaxis_title="Credit amount",
)
fig.show()

In [31]:
import ipywidgets as widgets

# Parallel-categories diagram linking the categorical columns to the Risk label.
# One dimension per column; 'Saving accounts' is intentionally left out.
gender_dim = go.parcats.Dimension(label="Sex", values=credit["Sex"])
Housing_dim = go.parcats.Dimension(label="Housing", values=credit["Housing"])
Checking_account_dim = go.parcats.Dimension(label="Checking account", values=credit["Checking account"])
Purpose_dim = go.parcats.Dimension(label="Purpose", values=credit["Purpose"])
Risk_dim = go.parcats.Dimension(label="Risk", values=credit["Risk"])

# NOTE(review): `color` is built here but never handed to the trace below.
color = np.zeros(len(credit), dtype='uint8')

axis_font = {'size': 18, 'family': 'Times'}
category_font = {'size': 16, 'family': 'Times'}
parcats_trace = go.Parcats(
    dimensions=[gender_dim, Housing_dim, Checking_account_dim, Purpose_dim, Risk_dim],
    hoveron='color',
    hoverinfo='count+probability',
    labelfont=axis_font,
    tickfont=category_font,
    arrangement='freeform',
)

fig = go.Figure(data=[parcats_trace])
fig.show()

In [32]:
# Split violin plot: Credit amount per Purpose, 'good' risk on the left half
# and 'bad' risk on the right half of each violin.
# The masks compare Risk with the string labels, so the column must still hold
# 'good'/'bad' when this cell runs.
PC = credit[['Purpose', 'Credit amount', 'Risk']]
fig = go.Figure()

# (risk label, violin side, line colour) for each half
violin_halves = [
    ('good', 'negative', 'blue'),
    ('bad', 'positive', 'orange'),
]
for risk_label, half, colour in violin_halves:
    in_class = PC['Risk'] == risk_label
    fig.add_trace(
        go.Violin(
            x=PC.loc[in_class, 'Purpose'],
            y=PC.loc[in_class, 'Credit amount'],
            legendgroup=risk_label,
            scalegroup=risk_label,
            name=risk_label,
            side=half,
            line_color=colour,
        )
    )

fig.update_traces(meanline_visible=True)
fig.update_layout(
    violingap=0,
    violinmode='overlay',
    title=dict(
        text="Purpose Vs Credit Amount Cross tabulation",
        x=.5,
        y=.95,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title="Purpose",
    yaxis_title="Credit amount",
)
fig.show()

## Transformation of the data

In [8]:
# Encode the target as integers: 1 = 'bad' risk, 0 = 'good' risk.
# Series.map returns NaN for values missing from the mapping, so mapping an
# already-encoded column would wipe it out. The dtype guard makes this cell
# safe to re-run.
risk_codes = {'bad': 1, 'good': 0}
if not pd.api.types.is_numeric_dtype(credit['Risk']):
    credit['Risk'] = credit['Risk'].map(risk_codes)

In [9]:
# Missing account information becomes its own explicit category, 'Others'.
for account_col in ('Saving accounts', 'Checking account'):
    credit[account_col] = credit[account_col].fillna('Others')

In [10]:
# Independent copy of the cleaned frame; the encoding step reads from credit_clean.
credit_clean=credit.copy()

In [11]:
# Columns to one-hot encode, and columns kept as-is (including the Risk target).
cat_features = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
num_features = ['Age', 'Job', 'Credit amount', 'Duration', 'Risk']

# get_dummies encodes every listed column in a single call, so no per-column
# loop is needed (the previous loop rebuilt the same frames on each pass).
dummies = pd.get_dummies(credit_clean[cat_features])
df1 = pd.concat([credit_clean[num_features], dummies], axis=1)

# Separate the target (Risk) from the feature matrix (df2).
Risk = df1['Risk']
df2 = df1.drop(['Risk'], axis=1)

In [12]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on Risk — consider stratify=Risk if
# the class balance should match between the two splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    df2,
    Risk,
    test_size=0.1,
    random_state=7,
)

In [13]:
# Share of each Risk class in the training split.
Y_train.value_counts()/len(Y_train)

0    0.692222
1    0.307778
Name: Risk, dtype: float64

In [14]:
# Share of each Risk class in the test split.
Y_test.value_counts()/len(Y_test)

0    0.77
1    0.23
Name: Risk, dtype: float64

In [15]:
# Fit the scaler on the training features only.
transformer = StandardScaler().fit(X_train)

In [16]:
# Apply the same fitted scaler to both the training and the test features.
X_train_prepared = transformer.transform(X_train)
X_test_prepared = transformer.transform(X_test)

In [17]:
# (rows, columns) of the training feature frame.
X_train.shape

(900, 26)

In [140]:
# Display the scaled training matrix.
X_train_prepared

array([[ 0.20518622,  0.12712321, -1.01647163, ...,  1.59477493,
        -0.15829386, -0.11624764],
       [-0.75164072, -2.92383381, -0.89802239, ..., -0.62704773,
        -0.15829386, -0.11624764],
       [ 0.5531233 ,  1.65260172,  1.66444108, ..., -0.62704773,
        -0.15829386, -0.11624764],
       ...,
       [ 0.11820196,  0.12712321,  0.11035926, ..., -0.62704773,
        -0.15829386, -0.11624764],
       [-0.66465646,  0.12712321, -0.66425014, ...,  1.59477493,
        -0.15829386, -0.11624764],
       [ 1.50995024,  1.65260172,  1.45567864, ..., -0.62704773,
        -0.15829386, -0.11624764]])

In [141]:
# Write the scaled features and the targets of both splits to comma-separated files.
exports = [
    ('german_inputs_train.csv', X_train_prepared),
    ('german_outputs_train.csv', Y_train),
    ('german_inputs_test.csv', X_test_prepared),
    ('german_outputs_test.csv', Y_test),
]
for csv_name, values in exports:
    np.savetxt(csv_name, values, delimiter=",")