In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split

In [2]:
# Load the stored preprocessing object used to encode raw applicant rows.
# NOTE(review): joblib.load unpickles its input — only point it at trusted files.
transformer = joblib.load(filename="../models/pipelines/pipeline1.joblib")

In [3]:
# Smoke-test the loaded transformer on a single hand-written row
# (presumably ordered as Sex, Age, Credit amount, Housing — confirm against the pipeline).
sample_applicant = [['male', 46, 2000, 'own']]
transformer.transform(sample_applicant)



array([[0.48214286, 0.09629141, 0.        , 1.        , 0.        ,
        1.        , 0.        ]])

In [4]:
# Read the raw credit dataset from the project's data directory.
df = pd.read_csv(filepath_or_buffer="../data/german_credit_data.csv")

In [5]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the partition stable.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Columns fed to the models.
features = ['Sex', 'Age', 'Credit amount', 'Housing']

# Raw feature frame, then its encoded form produced by the loaded transformer.
X_train = train.loc[:, features]
X_train_transformed = transformer.transform(X_train)

# Binary target: 1 where Risk equals 'bad', 0 otherwise.
y_train = train['Risk'].eq('bad').astype('int')

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

In [8]:
# 5-fold cross-validation of a default logistic regression; accuracy is
# recorded on both the training folds and the held-out folds.
results = cross_validate(
    estimator=LogisticRegression(),
    X=X_train_transformed,
    y=y_train,
    scoring=["accuracy"],
    cv=5,
    return_train_score=True,
)

In [9]:
# Fit a default logistic regression on the full training matrix.
# fit() returns the estimator itself, so it can be chained into the assignment.
clf = LogisticRegression().fit(X_train_transformed, y_train)
clf

LogisticRegression()

In [10]:
# Encode one hand-written applicant and ask the fitted classifier for its label.
encoded_sample = transformer.transform([['male', 46, 2000, 'own']])
clf.predict(encoded_sample)



array([0])

In [11]:
# Same hand-written applicant, but show the per-class probabilities instead of the label.
encoded_sample_for_proba = transformer.transform([['male', 46, 2000, 'own']])
clf.predict_proba(encoded_sample_for_proba)



array([[0.83685017, 0.16314983]])

In [12]:
# Per-class probabilities for every training row; keep only column 1,
# i.e. the probability assigned to the positive label (y == 1).
train_class_probabilities = clf.predict_proba(X_train_transformed)
predict = train_class_probabilities[:, 1]

In [13]:
# Display the training labels as a plain NumPy array.
y_train.to_numpy()

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [14]:
# Print each predicted positive-class probability next to its true label.
# Iterating the two arrays together avoids hard-coding the training-set size
# (previously range(700)), which would raise IndexError on a smaller split
# and silently skip rows on a larger one.
for probability, label in zip(predict, y_train.values):
    print(probability, label)

0.2648646716263571 0
0.1913494540720529 0
0.259021264764865 0
0.14767456793029066 0
0.25243051620987617 0
0.2572357247078429 1
0.39904072980149397 1
0.3511452833000528 0
0.349820488684946 0
0.22502384442628115 0
0.1801655774326885 1
0.23068052172309803 0
0.16433156845002528 0
0.45611400924154094 1
0.1740897942093792 0
0.22134183097438076 0
0.2726083993221728 1
0.21459707016857893 0
0.2950723883251273 1
0.18688311237193087 1
0.31418949280078795 0
0.30495115825770885 0
0.19322952968931947 1
0.23687251769003173 0
0.3760284398996411 0
0.2119189309683069 0
0.3672200338159065 0
0.15039016494141985 1
0.5060401999801325 1
0.3437101455657978 0
0.1905206083319019 0
0.326374932418314 0
0.3380605261026293 1
0.11896739426593635 0
0.08975009831590873 0
0.18056780233329042 0
0.35296116314196846 1
0.21994579440296816 0
0.2486238560991323 0
0.23949846284879175 0
0.3196776181820414 0
0.18678749509255058 0
0.29080133828362514 1
0.23880056804588984 0
0.23793406500615977 0
0.4969873160947823 0
0.2974515165

In [15]:
# Mean and standard deviation of the held-out-fold accuracy currently stored in
# `results` (the logistic-regression run, if the cells were executed in order).
validation_accuracy = results['test_accuracy']
print(f"{validation_accuracy.mean()} +- {validation_accuracy.std()}")

0.7014285714285714 +- 0.014568627181693668


In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# 5-fold cross-validation of a 20-nearest-neighbours classifier.
results = cross_validate(
    estimator=KNeighborsClassifier(n_neighbors=20),
    X=X_train_transformed,
    y=y_train,
    scoring=["accuracy"],
    cv=5,
    return_train_score=True,
)

# Report mean and spread of the held-out-fold accuracy.
knn_validation_accuracy = results['test_accuracy']
print(f"{knn_validation_accuracy.mean()} +- {knn_validation_accuracy.std()}")
                         

0.6928571428571428 +- 0.011952286093343947


In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
# 5-fold cross-validation of a decision tree limited to depth 3.
results = cross_validate(
    estimator=DecisionTreeClassifier(max_depth=3),
    X=X_train_transformed,
    y=y_train,
    scoring=["accuracy"],
    cv=5,
    return_train_score=True,
)

# Compare accuracy on the training folds with accuracy on the held-out folds.
tree_train_accuracy = results['train_accuracy']
tree_validation_accuracy = results['test_accuracy']
print("train: ", f"{tree_train_accuracy.mean()} +- {tree_train_accuracy.std()}")
print("validation: ", f"{tree_validation_accuracy.mean()} +- {tree_validation_accuracy.std()}")
                         

train:  0.7278571428571429 +- 0.005486532677049025
validation:  0.7014285714285714 +- 0.016536909861128886


In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# 5-fold cross-validation of a shallow random forest (100 trees, depth <= 3).
# random_state pins the forest's bootstrap and feature sampling so that
# re-running this cell reproduces the same scores.
results = cross_validate(RandomForestClassifier(n_estimators=100,
                                                max_depth=3,
                                                random_state=42),
                         X_train_transformed,
                         y_train, cv=5,
                         return_train_score=True,
                         scoring=["accuracy"])

# Compare accuracy on the training folds with accuracy on the held-out folds.
print("train: ", "{} +- {}".format(results['train_accuracy'].mean(),
                                   results['train_accuracy'].std()))
print("validation: ", "{} +- {}".format(results['test_accuracy'].mean(),
                                        results['test_accuracy'].std()))
                         

train:  0.7232142857142858 +- 0.0077426726388138654
validation:  0.6985714285714286 +- 0.010497813183356493


In [22]:
from sklearn.svm import SVC

In [24]:
# 5-fold cross-validation of a linear-kernel SVM with C=0.5.
results = cross_validate(
    estimator=SVC(kernel='linear', C=0.5),
    X=X_train_transformed,
    y=y_train,
    scoring=["accuracy"],
    cv=5,
    return_train_score=True,
)

# Compare accuracy on the training folds with accuracy on the held-out folds.
svc_train_accuracy = results['train_accuracy']
svc_validation_accuracy = results['test_accuracy']
print("train: ", f"{svc_train_accuracy.mean()} +- {svc_train_accuracy.std()}")
print("validation: ", f"{svc_validation_accuracy.mean()} +- {svc_validation_accuracy.std()}")
                         

train:  0.7014285714285714 +- 0.0007142857142857119
validation:  0.7014285714285714 +- 0.002857142857142891


In [26]:
# Estimator that gets fitted and persisted below. The seed makes the saved
# artifact reproducible across kernel restarts.
# NOTE(review): this uses the default (unbounded) tree depth, whereas the
# cross-validated forest above used max_depth=3 — confirm which configuration
# is meant to be shipped.
model = RandomForestClassifier(random_state=42)

In [27]:
# Train the final estimator on the whole encoded training set.
model.fit(X=X_train_transformed, y=y_train)

RandomForestClassifier()

In [28]:
import joblib

In [31]:
# Persist the fitted estimator; joblib.dump returns the list of files it wrote.
joblib.dump(value=model, filename="../models/estimators/model01.joblib")

['../models/estimators/model01.joblib']