# Module 2 Classification

In [6]:
import sklearn as sk
import pandas as pd
from sklearn import datasets

## Step 1: Load Data

In [7]:
# Load Iris dataset
iris = datasets.load_iris()

In [8]:
# iris

In [9]:
X,y = iris.data,iris.target

In [10]:
X.shape

(150, 4)

In [11]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Step 2 Split and Randomize Data

In [12]:
# Split and Randomize the data
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 test_size=0.25,
                                                 random_state=33)

In [13]:
X_train.shape

(112, 4)

In [14]:
y_train.shape

(112,)

In [15]:
len(y_train), len(y_test)

(112, 38)

In [16]:
X_train.shape, X_test.shape

((112, 4), (38, 4))

## Step 3 Define the Classifier/Model

In [17]:
# KNN Classifier

# from sklearn import neighbors
# clf = neighbors.KNeighborsClassifier()

# More optimized because only get what you need from the package
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()

## Step 4 Train the Model

In [18]:
clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

## Step 5 Evaluate the Model

In [19]:
clf.score(X_test,y_test)

0.9473684210526315

## Step 6: Save the Model

In [20]:
from sklearn.externals import joblib
joblib.dump(clf, 'mymodel.pkl') 

['mymodel.pkl']

## Step 7: Load the Model & Prediction

In [21]:
clf = joblib.load('mymodel.pkl')
clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [22]:
clf.predict(X_test)

array([1, 1, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 1, 1, 0, 1, 2, 0, 0,
       2, 0, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1])

In [23]:
import numpy as np

X_new = np.array([[6.7,3.1,4.7,1.5],
                  [1.0,1.1,1.7,0.5]])
clf.predict(X_new)

array([1, 0])

## Ex: Classifiers

In [24]:
credit = pd.read_csv('data/credit.csv')

In [25]:
credit.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,1,1
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,1,1
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,1,1
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,1,2
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,1,2


In [26]:
credit.describe()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.7,2.577,20.903,2.545,2.828,3271.248,2.105,3.384,2.973,2.682,...,2.845,2.358,35.542,2.675,1.928,1.407,2.904,1.155,1.404,1.037
std,0.458487,1.257638,12.058814,1.08312,2.744439,2822.75176,1.580023,1.208306,1.118715,0.70808,...,1.103718,1.050209,11.35267,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856
min,0.0,1.0,4.0,0.0,0.0,250.0,1.0,1.0,1.0,1.0,...,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,1.0,12.0,2.0,1.0,1365.5,1.0,3.0,2.0,2.0,...,2.0,1.0,27.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
50%,1.0,2.0,18.0,2.0,2.0,2319.5,1.0,3.0,3.0,3.0,...,3.0,2.0,33.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
75%,1.0,4.0,24.0,4.0,3.0,3972.25,3.0,5.0,4.0,3.0,...,4.0,3.0,42.0,3.0,2.0,2.0,3.0,1.0,2.0,1.0
max,1.0,4.0,72.0,4.0,10.0,18424.0,5.0,5.0,4.0,4.0,...,4.0,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0


In [27]:
credit['Creditability'].unique()

array([1, 0])

In [28]:
credit['Creditability'].value_counts()

1    700
0    300
Name: Creditability, dtype: int64

In [29]:
y = credit['Creditability']
X = credit.drop(columns = ['Creditability'])

In [30]:
X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 test_size=0.3,
                                                 random_state=100)

In [31]:
# KNN Classifier

from sklearn import neighbors 
# clf = neighbors.KNeighborsClassifier()
clf = neighbors.KNeighborsClassifier(n_neighbors = 50)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.7333333333333333

In [32]:
# SVM Classifier

from sklearn import svm 
clf = svm.SVC()
# WARNING: Very slow
# clf = svm.SVC(kernel = 'rbf')
# clf = svm.SVC(kernel = 'linear')
# clf = svm.SVC(kernel = 'poly')
# clf = svm.SVC(kernel = 'sigmoid')
clf.fit(X_train,y_train)
clf.score(X_test,y_test)



0.7266666666666667

In [33]:
# Guassian Navie Bayes Classifer

from sklearn import naive_bayes
clf = naive_bayes.GaussianNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.74

In [34]:
# SGD Classifer

from sklearn import linear_model
clf = linear_model.SGDClassifier()
# clf = linear_model.SGDClassifier(penalty = 'l2')
# clf = linear_model.SGDClassifier(penalty = 'l1')
clf.fit(X_train,y_train)
clf.score(X_test,y_test)



0.7266666666666667

In [35]:
# Decision Tree Classifer

from sklearn import tree
# clf = tree.DecisionTreeClassifier()
clf = tree.DecisionTreeClassifier(min_samples_split = 100)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.75

In [36]:
# clf.predict_proba(X_test)

In [45]:
from sklearn.model_selection import GridSearchCV
parameters = {'min_samples_split':[8, 10], 
              'max_depth':[6,8, 10]}
gridsearcher = GridSearchCV(estimator = tree.DecisionTreeClassifier(), param_grid = parameters, cv=5)
gridsearcher.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_samples_split': [8, 10], 'max_depth': [6, 8, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
gridsearcher.best_params_

{'max_depth': 6, 'min_samples_split': 8}

In [47]:
tree_clf = gridsearcher.best_estimator_

In [48]:
tree_clf.predict(X_test)

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1])

In [49]:
gridsearcher.score(X_test,y_test)

0.66

In [50]:
!pip install graphviz



In [51]:
from sklearn import tree
# clf = tree.DecisionTreeClassifier()
tree_clf = tree.DecisionTreeClassifier(min_samples_split = 100)
tree_clf.fit(X_train,y_train)
tree_clf.score(X_test,y_test)

0.75

In [52]:
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(tree_clf, out_file=None,
                                feature_names = X_train.columns,
                                filled = False, rounded = True)
graph = graphviz.Source(dot_data, format="png")
graph.render('tree')

'tree.png'

## Ex: Ensemble Method

In [60]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# clf = RandomForestClassifier(random_state = 2019)
clf = RandomForestClassifier(n_estimators = 80, random_state = 2019)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.78

In [91]:
clf.feature_importances_

array([0.11100245, 0.09930628, 0.05347698, 0.0614115 , 0.13631374,
       0.04928339, 0.04922782, 0.04261773, 0.03658993, 0.01897287,
       0.03798085, 0.05109467, 0.11055488, 0.02489272, 0.02507968,
       0.02198642, 0.02915012, 0.01349232, 0.02242882, 0.00513683])

In [96]:
importances = pd.DataFrame(data = {'column_name': X_train.columns,
                    'feature_importance': clf.feature_importances_})
importances.sort_values(['feature_importance'], ascending = False)

Unnamed: 0,column_name,feature_importance
4,Credit Amount,0.136314
0,Account Balance,0.111002
12,Age (years),0.110555
1,Duration of Credit (month),0.099306
3,Purpose,0.061412
2,Payment Status of Previous Credit,0.053477
11,Most valuable available asset,0.051095
5,Value Savings/Stocks,0.049283
6,Length of current employment,0.049228
7,Instalment per cent,0.042618


In [97]:
importances['feature_importance'].sum()

0.9999999999999999

In [61]:
# Gradient Boosting Tree Classifier

from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.7666666666666667

## Ex: Save the Model

In [28]:
from sklearn.externals import joblib
joblib.dump(clf, 'credit.pkl') 

['credit.pkl']

## Ex: Load the Model

In [29]:
from sklearn.externals import joblib
clf2 = joblib.load('credit.pkl')

In [30]:
import numpy as np
y_pred = clf2.predict(X_test)

## Ex: Customer Transaction Prediction

In [38]:
transaction = pd.read_csv('data/customer_transaction.csv')

In [39]:
transaction.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_73297,0,7.4516,-1.071,7.6644,6.6644,10.6747,-13.2679,5.9301,21.5816,...,7.0792,6.0545,2.4383,0.3335,15.5773,0.4564,1.6129,7.8048,15.8583,-3.773
1,train_79756,1,14.6469,0.2512,8.5101,6.6862,12.0408,4.6785,6.968,19.3274,...,4.0484,6.4801,3.5978,3.1407,23.9872,1.5292,-6.405,8.117,12.6881,-7.7544
2,train_138043,0,11.9599,-5.8784,7.9067,6.5769,7.4192,-5.8442,4.8585,14.3521,...,-5.3537,7.5983,3.9085,6.3675,21.7338,-0.5874,-6.097,8.5625,20.0341,3.3469
3,train_33678,0,9.0334,0.4932,11.5369,5.6356,10.802,-11.692,4.8708,13.2391,...,5.1401,9.1957,2.434,5.5633,16.7777,1.9633,-3.2447,8.7895,14.6535,2.2478
4,train_133321,0,8.6398,-0.4785,16.9614,2.6484,9.9465,-3.0443,6.1818,12.0435,...,1.6368,4.9795,0.3358,1.0997,21.3309,0.7617,3.2567,8.1391,8.5642,-14.1064


In [40]:
transaction.shape

(2000, 202)

In [41]:
y = transaction['target']
X = transaction.drop(columns = ['ID_code', 'target'])

In [42]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=100)

In [44]:
# KNN Classifier

from sklearn import neighbors
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.9083333333333333

In [45]:
# # SVM Classifier

# from sklearn import svm 
# clf = svm.SVC()
# clf.fit(X_train,y_train)
# clf.score(X_test,y_test)



0.91

In [46]:
# # Guassian Navie Bayes Classifer

# from sklearn import naive_bayes
# clf = naive_bayes.GaussianNB()
# clf.fit(X_train,y_train)
# clf.score(X_test,y_test)

0.9066666666666666

In [47]:
# SGD Classifer

from sklearn import linear_model
clf = linear_model.SGDClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)



0.9083333333333333

In [48]:
# Decision Tree Classifer

from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.8266666666666667

In [49]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)



0.91

In [50]:
# Gradient Boosting Tree Classifier

from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.9066666666666666

## Confusion Matrix

In [51]:
from sklearn.metrics import confusion_matrix
y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])


array([[2, 0, 0],
       [0, 0, 1],
       [1, 0, 2]])

## Ex: Confusion Matrix

In [59]:
# SVM Classifier

from sklearn import svm 
svm_clf = svm.SVC()
svm_clf.fit(X_train,y_train)
svm_clf.score(X_test,y_test)



0.91

In [98]:
from sklearn.metrics import confusion_matrix

y_pred = svm_clf.predict(X_test)
confusion_matrix(y_test,y_pred)

NameError: name 'svm_clf' is not defined

In [61]:
# KNN Classifier

from sklearn import neighbors
knn_clf = neighbors.KNeighborsClassifier()
knn_clf.fit(X_train,y_train)
knn_clf.score(X_test,y_test)

0.9083333333333333

In [99]:
from sklearn.metrics import confusion_matrix

y_pred = knn_clf.predict(X_test)
confusion_matrix(y_test,y_pred)

NameError: name 'knn_clf' is not defined

In [100]:
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[ 35,  47],
       [ 19, 199]])

## Classification Report

In [63]:
from sklearn.metrics import classification_report
y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
print(classification_report(y_true, y_pred, labels=["ant", "bird", "cat"]))

              precision    recall  f1-score   support

         ant       0.67      1.00      0.80         2
        bird       0.00      0.00      0.00         1
         cat       0.67      0.67      0.67         3

   micro avg       0.67      0.67      0.67         6
   macro avg       0.44      0.56      0.49         6
weighted avg       0.56      0.67      0.60         6



  'precision', 'predicted', average, warn_for)


## Ex: Classification Report

In [103]:
from sklearn.metrics import classification_report
y_pred = svm_clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.65      0.43      0.51        82
           1       0.81      0.91      0.86       218

   micro avg       0.78      0.78      0.78       300
   macro avg       0.73      0.67      0.69       300
weighted avg       0.76      0.78      0.76       300



In [105]:
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.65      0.43      0.51        82
           1       0.81      0.91      0.86       218

   micro avg       0.78      0.78      0.78       300
   macro avg       0.73      0.67      0.69       300
weighted avg       0.76      0.78      0.76       300



In [107]:
classification_report(y_test,y_pred).split('\n')

['              precision    recall  f1-score   support',
 '',
 '           0       0.65      0.43      0.51        82',
 '           1       0.81      0.91      0.86       218',
 '',
 '   micro avg       0.78      0.78      0.78       300',
 '   macro avg       0.73      0.67      0.69       300',
 'weighted avg       0.76      0.78      0.76       300',
 '']

In [109]:
y_pred = knn_clf.predict(X_test)
print(classification_report(y_test,y_pred))

NameError: name 'knn_clf' is not defined

In [140]:
credit['Credit Amount Binned'] = pd.cut(credit['Credit Amount'], bins = 10, labels = range(0,10))
# credit[['Credit Amount', 'Credit Amount Binned']]

In [141]:
credit.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker,Credit Amount binned,Credit Amount Binned
0,1,1,18,4,2,1049,1,2,4,2,...,21,3,1,1,3,1,1,1,"(231.826, 9337.0]",0
1,1,1,9,4,0,2799,1,3,2,3,...,36,3,1,2,3,2,1,1,"(231.826, 9337.0]",1
2,1,2,12,2,9,841,2,4,2,2,...,23,3,1,1,2,1,1,1,"(231.826, 9337.0]",0
3,1,1,12,4,0,2122,1,3,3,3,...,39,3,1,2,2,2,1,2,"(231.826, 9337.0]",1
4,1,1,12,4,0,2171,1,3,4,3,...,38,1,2,2,2,1,1,2,"(231.826, 9337.0]",1


In [156]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
credit_t = encoder.fit_transform(credit.values)
credit_t

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


<1000x1079 sparse matrix of type '<class 'numpy.float64'>'
	with 21000 stored elements in Compressed Sparse Row format>

In [135]:
pd.get_dummies(credit, columns = ['Account Balance'])

Unnamed: 0,Creditability,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,Guarantors,...,Occupation,No of dependents,Telephone,Foreign Worker,Credit Amount binned,Credit Amount Binned,Account Balance_1,Account Balance_2,Account Balance_3,Account Balance_4
0,1,18,4,2,1049,1,2,4,2,1,...,3,1,1,1,"(231.826, 9337.0]",0,1,0,0,0
1,1,9,4,0,2799,1,3,2,3,1,...,3,2,1,1,"(231.826, 9337.0]",1,1,0,0,0
2,1,12,2,9,841,2,4,2,2,1,...,2,1,1,1,"(231.826, 9337.0]",0,0,1,0,0
3,1,12,4,0,2122,1,3,3,3,1,...,2,2,1,2,"(231.826, 9337.0]",1,1,0,0,0
4,1,12,4,0,2171,1,3,4,3,1,...,2,1,1,2,"(231.826, 9337.0]",1,1,0,0,0
5,1,10,4,0,2241,1,2,1,3,1,...,2,2,1,2,"(231.826, 9337.0]",1,1,0,0,0
6,1,8,4,0,3398,1,4,1,3,1,...,2,1,1,2,"(231.826, 9337.0]",1,1,0,0,0
7,1,6,4,0,1361,1,2,2,3,1,...,2,2,1,2,"(231.826, 9337.0]",0,1,0,0,0
8,1,18,4,3,1098,1,1,4,2,1,...,1,1,1,1,"(231.826, 9337.0]",0,0,0,0,1
9,1,24,2,3,3758,3,1,1,2,1,...,1,1,1,1,"(231.826, 9337.0]",1,0,1,0,0


In [115]:
credit['Credit Amount'].describe()

count     1000.00000
mean      3271.24800
std       2822.75176
min        250.00000
25%       1365.50000
50%       2319.50000
75%       3972.25000
max      18424.00000
Name: Credit Amount, dtype: float64

In [129]:
credit.nunique()

Creditability                          2
Account Balance                        4
Duration of Credit (month)            33
Payment Status of Previous Credit      5
Purpose                               10
Credit Amount                        923
Value Savings/Stocks                   5
Length of current employment           5
Instalment per cent                    4
Sex & Marital Status                   4
Guarantors                             3
Duration in Current address            4
Most valuable available asset          4
Age (years)                           53
Concurrent Credits                     3
Type of apartment                      3
No of Credits at this Bank             4
Occupation                             4
No of dependents                       2
Telephone                              2
Foreign Worker                         2
Credit Amount binned                   2
Credit Amount Binned                  10
dtype: int64

## Markdown