In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
import math
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from statistics import mean 

from sklearn.svm import SVC # "Support vector classifier"  
from sklearn.feature_selection import SelectKBest, chi2

## Reading the data

In [2]:
df = pd.read_csv('data.txt', delimiter = ' ')
df.head(10)

Unnamed: 0,1,6,4,12,5,5.1,3,4.1,1.1,67,...,0,0.1,1.4,0.2,0.3,1.5,0.4,0.5,1.6,1.7
0,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2
1,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1
2,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,0,1,1
3,1,24,3,49,1,3,3,4,4,53,...,1,0,1,0,0,0,0,0,1,2
4,4,36,2,91,5,3,3,4,4,35,...,0,0,1,0,0,0,0,1,0,1
5,4,24,2,28,3,5,3,4,2,53,...,0,0,1,0,0,1,0,0,1,1
6,2,36,2,69,1,3,3,2,3,35,...,0,1,1,0,1,0,0,0,0,1
7,4,12,2,31,4,4,1,4,1,61,...,0,0,1,0,0,1,0,1,0,1
8,2,30,4,52,1,1,4,2,3,28,...,1,0,1,0,0,1,0,0,0,2
9,2,12,2,13,1,2,2,1,3,25,...,1,0,1,0,1,0,0,0,1,2


## Description of the German credit dataset.

### 1. Title: German Credit data

### 2. Source Information

Professor Dr. Hans Hofmann  
Institut f"ur Statistik und "Okonometrie  
Universit"at Hamburg  
FB Wirtschaftswissenschaften  
Von-Melle-Park 5    
2000 Hamburg 13 

### 3. Number of Attributes german.numer: 24 (24 numerical)

### 4. Attributes include information about
*	1) Status of existing checking account
*	2) Duration in month
*	3) Credit history
*	4) Purpose
*	5) Credit amount
*	6) Savings account/bonds
*	7) Present employment
*	8) Installment rate in percentage of disposable income
*	9) Personal status 
*	10)Other debtors / guarantors
*	11)Present residence
*	12)Property
*	13)Age
*	14)Other installment plans 
*	15)Housing
*   16)Number of existing credits at this bank
*	17)Job
*	18)Telephone
*	19)Foreign worker


In [3]:
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [4]:
X.shape

(799, 24)

In [5]:
y.shape

(799,)

#### training the classifier for ten times and printing the classification averaged accuracy of SVM on the test data over these ten trials averaged by ten trials

In [6]:
accuracy_scores = []
for trial in range(10):
    X_train, X_test, y_train, y_test = tts(X, y)
    clf = SVC(kernel='linear')  
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

print(mean(accuracy_scores))

0.7635000000000001


#### we then try to preprocess the data and restart the same experiment

In [7]:
df.isnull().sum()

1      0
6      0
4      0
12     0
5      0
5.1    0
3      0
4.1    0
1.1    0
67     0
3.1    0
2      0
1.2    0
2.1    0
1.3    0
0      0
0.1    0
1.4    0
0.2    0
0.3    0
1.5    0
0.4    0
0.5    0
1.6    0
1.7    0
dtype: int64

No NaNs exist

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   1       799 non-null    int64
 1   6       799 non-null    int64
 2   4       799 non-null    int64
 3   12      799 non-null    int64
 4   5       799 non-null    int64
 5   5.1     799 non-null    int64
 6   3       799 non-null    int64
 7   4.1     799 non-null    int64
 8   1.1     799 non-null    int64
 9   67      799 non-null    int64
 10  3.1     799 non-null    int64
 11  2       799 non-null    int64
 12  1.2     799 non-null    int64
 13  2.1     799 non-null    int64
 14  1.3     799 non-null    int64
 15  0       799 non-null    int64
 16  0.1     799 non-null    int64
 17  1.4     799 non-null    int64
 18  0.2     799 non-null    int64
 19  0.3     799 non-null    int64
 20  1.5     799 non-null    int64
 21  0.4     799 non-null    int64
 22  0.5     799 non-null    int64
 23  1.6     799 non

all the data are in ```int64``` datatype

In [9]:
scaler = StandardScaler()    
X_scaled = scaler.fit_transform(X)

scaled the values of the features to a standard scaler

> thus the only difference is the **standard scaling**

#### restarting the same experiment

In [10]:
accuracy_scores = []
for trial in range(10):
    X_train, X_test, y_train, y_test = tts(X_scaled, y)
    clf = SVC(kernel='linear')  
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

print(mean(accuracy_scores))

0.763


> the accuracy didn't change that much