In [1]:
!pip install qpsolvers



In [2]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from qpsolvers import solve_qp
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid 
from imblearn.under_sampling import RandomUnderSampler



In [3]:
# Συνάρτηση με την οποία μετατρέπουμε τα δεδομένα από αλφαριθμητικά σε αριθμητικά
# δέχεται ως όρισμα ένα dataframe με τα δεδομένα
# επιστρέφει το dataframe τροποποιημένο κατάλληλα
def transform_data(data):
  male = ['a','b']
  married = ['u','y','l','t']
  bankCustomer = ['g','p','gg']
  educationLevel = ['c','d','cc','i','j','k','m','r','q','w','x','e','aa','ff']
  ethnicity = ['v','h','bb','j','n','z','dd','ff','o']
  priorDefault = ['t','f']
  employed = ['t','f']
  driverLicense = ['t','f']
  citizen = ['g','p','s']
  approved = ['-','+']
  # για κάθε αλγαριθμητικό θα δώσουμε μια τιμή
  for i,x in data.iterrows():
    data.loc[i,'Male'] = male.index(x['Male'])
    data.loc[i,'Married'] = married.index(x['Married'])
    data.loc[i,'BankCustomer'] = bankCustomer.index(x['BankCustomer'])
    data.loc[i,'EducationLevel'] = educationLevel.index(x['EducationLevel'])
    data.loc[i,'Ethnicity'] = ethnicity.index(x['Ethnicity'])
    data.loc[i,'PriorDefault'] = priorDefault.index(x['PriorDefault'])
    data.loc[i,'Employed'] = employed.index(x['Employed'])
    data.loc[i,'DriversLicense'] = driverLicense.index(x['DriversLicense'])
    data.loc[i,'Citizen'] = citizen.index(x['Citizen'])
    data.loc[i,'Approved'] = 2*(approved.index(x['Approved'])) -1 # κάνουμε αυτό τον διαχωρισμό για να έχουμε τιμές -1 και +1
  data = data.drop(columns=['Key', 'ZipCode']) #διαγράφουμε τις στήλες που δεν χρειαζόμαστε
  return data

In [4]:
# φόρτωμα δεδομένων
data = pd.read_csv('data.csv',delimiter=',')

columns_data = data.columns # στήλες των δεδομένων

# "γέμισμα" άγνωστων τιμών με τις πιο συχνές τιμές που συναντάμε στο αντίστοιχο πεδίο
imp =SimpleImputer(missing_values='?', strategy='most_frequent')
data = pd.DataFrame(imp.fit_transform(data),columns=columns_data)
# μετατροπή δεδομένων σε αριθμητικά
data = transform_data(data)

# διαχωρισμός σε feutures και σε labels
X = data.iloc[:,0:14]
y = data.iloc[:,14]


y=y.astype('int')
X = X.astype('float')

# χωρισμός dataset σε δεδομένα εκαπίδευσης και δεδομένα ελέγχου
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.4)


train_X = np.array(train_X)
test_X = np.array(test_X)
train_y = np.array(train_y)
test_y = np.array(test_y)

In [65]:
# Συνάρτηση που υπολογίζει τον πυρήνα δύο διανυσμάτων
# x,y είναι διανύσματα, kernel_type είναι ο τύπος του πυρήνα 
# και gamma το γ που χρησιμοποιείται για το πυρήνα gauss (μπορεί να παραλειφθεί στους άλλους πυρήνες)
# η συνάρτηση επιστρέφει το αποτέλεσμα  
def kernel(x,y,kernel_type,gamma=None):
  if (kernel_type == 'linear'):
    kernel = np.dot(x.T,y)
  elif (kernel_type == 'poly'):
    kernel = np.square((np.dot(x.T,y)+1))
  elif (kernel_type == 'gaussian'):
    temp = np.square(x - y)
    temp = np.sum(temp)
    kernel = np.exp((-gamma)*temp)
  return kernel

In [66]:
n_samples = train_X.shape[0] # πλήθος δεδομένων εκπαίδευσης
P = np.zeros((n_samples,n_samples)) # ΝΧΝ πίνακας 
kernel_type = 'gaussian' # τύπος πυρήνα
gamma = 10 # γ για τον πυρήνα gauss

# δημιουργία πίνακα P για την λύση του προβλήματος quadratic programming (άνω τριγωνικό)
for i in range(n_samples):
  for j in range(i,n_samples):
   P[i][j] = train_y[i]*train_y[j]*kernel(train_X[i],train_X[j],kernel_type,gamma)

# δημιουργία συμμετρικού πίνακα
P = P + P.T - np.diag(np.diag(P))
# πρόσθεση μίας μικρής τιμής στην διαγώνιο για να εξασφαλίσουμε ότι ο πίνακας θα είναι positive defitive
min = np.amin(np.abs(P))
P = P + np.eye(n_samples)*(min*0.01)

In [67]:
# δημιουργία και των υπόλοιπων πινάκων για την λύση του προβήματος του quadratic programming
c=10
q = np.ones(n_samples)*(-1)
G = np.eye(n_samples) *(-1)
h = np.zeros(n_samples)
A = train_y.reshape((1,n_samples))
b = np.zeros(1)
lb = np.zeros(n_samples)
ub = np.ones(n_samples)*c

# λύση προβλήματος quadratic programming
a = np.array(solve_qp(P,q,G,h,A,b,lb,ub),dtype='float64')

# εύρεση πολλαπλασιαστών Lagrange που αντιστοιχούν σε διανύσματα υποστήριξης
index_a = np.where((a>1e-6) & (a<c))

# εύρεση διανυσματών υποστήριξης
sv_X = train_X[index_a]
sv_y = train_y[index_a]
sv_a = a[index_a]
# πλήθος διανυσμάτων υποστήριξης
nsv = sv_X.shape[0]

print("Number of support vectors: %d" %nsv)

Number of support vectors: 354


In [68]:
# υπολογισμός το bias
b =np.sum(sv_y - np.dot(np.multiply(sv_a,sv_y),kernel(sv_X.T,sv_X.T,kernel_type,gamma)))
b = b / nsv

In [69]:
# προβλεψη τιμών για τα δεδομένα εκπαίδευσης
predicted_train = []
for x in train_X:
  k = []
  for sv in sv_X:
    k.append(kernel(sv,x,kernel_type,gamma))
  w = np.dot(np.multiply(sv_a,sv_y).T,k)
  y = w + b
  predicted_train.append(np.sign(y))
# εμφάνιση απόδοσης
print("Training Accuracy: %f" %(accuracy_score(train_y,predicted_train)))

# προβλεψη τιμών για τα δεδομένα ελέγχου
predicted_test = []
for x in test_X:
  k = []
  for sv in sv_X:
    k.append(kernel(sv,x,kernel_type,gamma))
  w = np.dot(np.multiply(sv_a,sv_y).T,k)
  y = w + b
  predicted_test.append(np.sign(y))
# εμφάνιση απόδοσης
print("Testing Accuracy: %f" %(accuracy_score(test_y,predicted_test)))

Training Accuracy: 1.000000
Testing Accuracy: 0.572034


SVM χρησιμοποιώντας έτοιμες συναρτήσεις

In [5]:
# Γραμμικός πυρήνας
clf = svm.SVC(kernel='linear',C=1)
clf.fit(train_X,train_y)
predicted_test = clf.predict(test_X)
predicted_train = clf.predict(train_X)
print("Linear Kernel - Training Accuracy %f" %(accuracy_score(train_y,predicted_train)))
print("Linear Kernel - Testing Accuracy %f" %(accuracy_score(test_y,predicted_test)))

Linear Kernel - Training Accuracy 0.898305
Linear Kernel - Testing Accuracy 0.822034


In [6]:
# Πολυωνυμικός πυρήνας
clf = svm.SVC(kernel='poly',degree=2,C=1)
clf.fit(train_X,train_y)
predicted_test = clf.predict(test_X)
predicted_train = clf.predict(train_X)
print("Poly Kernel - Training Accuracy %f" %(accuracy_score(train_y,predicted_train)))
print("Poly Kernel - Testing Accuracy %f" %(accuracy_score(test_y,predicted_test)))

Poly Kernel - Training Accuracy 0.646893
Poly Kernel - Testing Accuracy 0.622881


In [7]:
#Πυρήνας Gauss
clf = svm.SVC(kernel='rbf',gamma=10,C=1)
clf.fit(train_X,train_y)
predicted_test = clf.predict(test_X)
predicted_train = clf.predict(train_X)
print("Gauss Kernel - Training Accuracy %f" %(accuracy_score(train_y,predicted_train)))
print("Gauss Kernel - Testing Accuracy %f" %(accuracy_score(test_y,predicted_test)))

Gauss Kernel - Training Accuracy 1.000000
Gauss Kernel - Testing Accuracy 0.588983


Κατηγοριοποιητές Κ - πλησιέστερων γειτόνων και πλησιέστερου κέντρου

In [8]:
model = KNeighborsClassifier(n_neighbors=1,p=2) # 1-ΝΝ
model.fit(train_X, train_y)

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
#εμφάνιση αποτελεσμάτων
print("1 - NEAREST NEIGHBOR")
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y))) 

1 - NEAREST NEIGHBOR
Test time: 0.250891 sec
Accuracy: 0.669492


In [9]:
model = KNeighborsClassifier(n_neighbors=3,p=2) # 3-ΝΝ
model.fit(train_X, train_y)

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
#εμφάνιση αποτελεσμάτων
print("3 - NEAREST NEIGHBORS")
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y))) 

3 - NEAREST NEIGHBORS
Test time: 0.167686 sec
Accuracy: 0.648305


In [10]:
print("NEAREST CENTROID") 
model = NearestCentroid(metric='euclidean')
start = time.time()
model.fit(train_X,train_y)
end = time.time()
print("Training time: %f sec" %(end-start))

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
#εμφάνιση αποτελεσμάτων
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y)))

NEAREST CENTROID
Training time: 0.001156 sec
Test time: 0.058834 sec
Accuracy: 0.673729


Επεξεργασία μεγαλύτερου συνόλου δεδομένων

In [11]:
# Συνάρτηση με την οποία μετατρέπουμε τα δεδομένα από αλφαριθμητικά σε αριθμητικά
# δέχεται ως όρισμα ένα dataframe με τα δεδομένα
# επιστρέφει το dataframe τροποποιημένο κατάλληλα
def transform_data_big(data):
  job = ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed']
  marital_status = ['divorced', 'married', 'single']
  education = ['Primary_Education', 'Professional_Education', 'Secondary_Education', 'Tertiary_Education']
  ethnicity = ['v','h','bb','j','n','z','dd','ff','o','?']
  default = ['no','yes']
  housing_loan = ['no','yes']
  personal_loan = ['no','yes']
  subscribed = ['no','yes']
  # για κάθε αλγαριθμητικό θα δώσουμε μια τιμή
  for i,x in data.iterrows():
    data.loc[i,'Job'] = job.index(x['Job'])
    data.loc[i,'Marital_Status'] = marital_status.index(x['Marital_Status'])
    data.loc[i,'Education'] = education.index(x['Education'])
    data.loc[i,'Default_Credit'] = default.index(x['Default_Credit'])
    data.loc[i,'Housing_Loan'] = housing_loan.index(x['Housing_Loan'])
    data.loc[i,'Personal_Loan'] = personal_loan.index(x['Personal_Loan'])
    data.loc[i,'Subscribed'] = 2*(subscribed.index(x['Subscribed'])) -1 # κάνουμε αυτό τον διαχωρισμό για να έχουμε τιμές -1 και +1
  return data

In [12]:
# φόρτωμα δεδομένων
big_data = pd.read_csv('Alpha_bank.csv',delimiter=',')

# μετατροπή δεδομένων σε αριθμητικά
big_data = transform_data_big(big_data)

# διαχωρισμός σε feutures και σε labels
X = big_data.iloc[:,0:7]
y = big_data.iloc[:,7]

X = X.astype('float')
y = y.astype('int')


# χωρισμός dataset σε δεδομένα εκαπίδευσης και δεδομένα ελέγχου
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.4)

#κανονικοποίηση δεδομένων
reg = StandardScaler()
reg.fit(train_X)
train_X = reg.transform(train_X)
test_X = reg.transform(test_X)

train_X = np.array(train_X)
test_X = np.array(test_X)
train_y = np.array(train_y)
test_y = np.array(test_y)

SVM χρησιμοποιώντας έτοιμες συναρτήσεις

In [13]:
# Γραμμικός πυρήνας
clf = svm.SVC(kernel='linear',C=10)
train_time = time.time()
clf.fit(train_X,train_y)
train_time = time.time() - train_time

pred_time_train = time.time()
predicted_train = clf.predict(train_X)
pred_time_train = time.time() - pred_time_train

pred_time_test = time.time()
predicted_test = clf.predict(test_X)
pred_time_test = time.time() - pred_time_test

print("Linear Kernel")
print("Training Time: %f sec" %(train_time))
print("Training Accuracy %f predicted in %f sec" %(accuracy_score(train_y,predicted_train),pred_time_train))
print("Testing Accuracy %f predicted in %f sec" %(accuracy_score(test_y,predicted_test),pred_time_test))
print("----------------------------------")

Linear Kernel
Training Time: 3.915600 sec
Training Accuracy 0.874166 predicted in 0.871198 sec
Testing Accuracy 0.872447 predicted in 0.591184 sec
----------------------------------


In [14]:
# Πολυωνυμικός πυρήνας
clf = svm.SVC(kernel='poly',C=10)
train_time = time.time()
clf.fit(train_X,train_y)
train_time = time.time() - train_time

pred_time_train = time.time()
predicted_train = clf.predict(train_X) 
pred_time_train = time.time() - pred_time_train

pred_time_test = time.time()
predicted_test = clf.predict(test_X)
pred_time_test = time.time() - pred_time_test

print("Poly Kernel")
print("Training Time: %f sec" %(train_time))
print("Training Accuracy %f predicted in %f sec" %(accuracy_score(train_y,predicted_train),pred_time_train))
print("Testing Accuracy %f predicted in %f sec" %(accuracy_score(test_y,predicted_test),pred_time_test))
print("----------------------------------")

Poly Kernel
Training Time: 111.899092 sec
Training Accuracy 0.874166 predicted in 1.562600 sec
Testing Accuracy 0.872447 predicted in 1.060674 sec
----------------------------------


In [15]:
clf = svm.SVC(kernel='rbf',C=1,gamma=1)
train_time = time.time()
clf.fit(train_X,train_y)
train_time = time.time() - train_time

pred_time_train = time.time()
predicted_train = clf.predict(train_X)
pred_time_train = time.time() - pred_time_train

pred_time_test = time.time()
predicted_test = clf.predict(test_X)
pred_time_test = time.time() - pred_time_test

print("Gauss Kernel")
print("Training Time: %f sec" %(train_time))
print("Training Accuracy %f predicted in %f sec" %(accuracy_score(train_y,predicted_train),pred_time_train))
print("Testing Accuracy %f predicted in %f sec" %(accuracy_score(test_y,predicted_test),pred_time_test))
print("----------------------------------")

Gauss Kernel
Training Time: 11.282272 sec
Training Accuracy 0.876189 predicted in 3.493668 sec
Testing Accuracy 0.871627 predicted in 2.362088 sec
----------------------------------


Κατηγοριοποιητές Κ - πλησιέστερων γειτόνων και πλησιέστερου κέντρου

In [16]:
model = KNeighborsClassifier(n_neighbors=1,p=2) # 1 - NN
model.fit(train_X, train_y)

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
#εμφάνιση αποτελεσμάτων
print("1 - NEAREST NEIGHBOR")
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y))) 

1 - NEAREST NEIGHBOR
Test time: 10.467595 sec
Accuracy: 0.797227


In [17]:
model = KNeighborsClassifier(n_neighbors=3,p=2) # 3 - ΝΝ
model.fit(train_X, train_y)

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
#εμφάνιση αποτελεσμάτων
print("3 - NEAREST NEIGHBORS")
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y))) 

3 - NEAREST NEIGHBORS
Test time: 9.499875 sec
Accuracy: 0.842999


In [18]:
print("NEAREST CENTROID")
model = NearestCentroid(metric='euclidean')
start = time.time()
model.fit(train_X,train_y)
end = time.time()
print("Training time: %f sec" %(end-start))

predicted_y = [] #outputs
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y))) #print accuracy

NEAREST CENTROID
Training time: 0.004254 sec
Test time: 2.248772 sec
Accuracy: 0.543270


Επεξεργασία δεδομένων με undersampling

In [19]:
# φόρτωμα δεδομένων
big_data = pd.read_csv('Alpha_bank.csv',delimiter=',')

# μετατροπή δεδομένων σε αριθμητικά
big_data = transform_data_big(big_data)

# διαχωρισμός σε feutures και σε labels
X = big_data.iloc[:,0:7]
y = big_data.iloc[:,7]

X = X.astype('float')
y = y.astype('int')

# udersampling δειγμάτων
rus = RandomUnderSampler()
X,y = rus.fit_resample(X,y)

# χωρισμός dataset σε δεδομένα εκαπίδευσης και δεδομένα ελέγχου
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.4)

# κανονικοποίηση δεδομένων
reg = StandardScaler()
reg.fit(train_X)
train_X = reg.transform(train_X)
test_X = reg.transform(test_X)

train_X = np.array(train_X)
test_X = np.array(test_X)
train_y = np.array(train_y)
test_y = np.array(test_y)



In [20]:
clf = svm.SVC(kernel='linear',C=1) # γραμμικός πυρήνας
train_time = time.time()
clf.fit(train_X,train_y)
train_time = time.time() - train_time

pred_time_train = time.time()
predicted_train = clf.predict(train_X)
pred_time_train = time.time() - pred_time_train

pred_time_test = time.time()
predicted_test = clf.predict(test_X)
pred_time_test = time.time() - pred_time_test

print("Linear Kernel")
print("Training Time: %f sec" %(train_time))
print("Training Accuracy %f predicted in %f sec" %(accuracy_score(train_y,predicted_train),pred_time_train))
print("Testing Accuracy %f predicted in %f sec" %(accuracy_score(test_y,predicted_test),pred_time_test))
print("----------------------------------")

Linear Kernel
Training Time: 0.843998 sec
Training Accuracy 0.552410 predicted in 0.203805 sec
Testing Accuracy 0.547488 predicted in 0.131043 sec
----------------------------------


In [21]:
clf = svm.SVC(kernel='poly',C=1) # πολωνυμικός πυρήνας
train_time = time.time()
clf.fit(train_X,train_y)
train_time = time.time() - train_time

pred_time_train = time.time()
predicted_train = clf.predict(train_X) 
pred_time_train = time.time() - pred_time_train

pred_time_test = time.time()
predicted_test = clf.predict(test_X)
pred_time_test = time.time() - pred_time_test

print("Poly Kernel")
print("Training Time: %f sec" %(train_time))
print("Training Accuracy %f predicted in %f sec" %(accuracy_score(train_y,predicted_train),pred_time_train))
print("Testing Accuracy %f predicted in %f sec" %(accuracy_score(test_y,predicted_test),pred_time_test))
print("----------------------------------")

Poly Kernel
Training Time: 0.710261 sec
Training Accuracy 0.574887 predicted in 0.245987 sec
Testing Accuracy 0.558185 predicted in 0.161396 sec
----------------------------------


In [22]:
clf = svm.SVC(kernel='rbf',C=1,gamma=10) #πυρήνας Gauss
train_time = time.time()
clf.fit(train_X,train_y)
train_time = time.time() - train_time

pred_time_train = time.time()
predicted_train = clf.predict(train_X) 
pred_time_train = time.time() - pred_time_train

pred_time_test = time.time()
predicted_test = clf.predict(test_X)
pred_time_test = time.time() - pred_time_test

print("Gauss Kernel")
print("Training Time: %f sec" %(train_time))
print("Training Accuracy %f predicted in %f sec" %(accuracy_score(train_y,predicted_train),pred_time_train))
print("Testing Accuracy %f predicted in %f sec" %(accuracy_score(test_y,predicted_test),pred_time_test))
print("----------------------------------")

Gauss Kernel
Training Time: 1.109299 sec
Training Accuracy 0.725092 predicted in 0.594000 sec
Testing Accuracy 0.554295 predicted in 0.395113 sec
----------------------------------


Κατηγοριοποιητές Κ - πλησιέστερων γειτόνων και πλησιέστερου κέντρου

In [23]:
model = KNeighborsClassifier(n_neighbors=1,p=2) # 1- NN
model.fit(train_X, train_y)

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
#εμφάνιση αποτελεσμάτων
print("1 - NEAREST NEIGHBOR")
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y))) 

1 - NEAREST NEIGHBOR
Test time: 2.142921 sec
Accuracy: 0.525446


In [24]:
model = KNeighborsClassifier(n_neighbors=3,p=2) # 3-ΝΝ
model.fit(train_X, train_y)

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
# εμφάνιση αποτελεσμάτων
print("3 - NEAREST NEIGHBORS")
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y))) 

3 - NEAREST NEIGHBORS
Test time: 2.133913 sec
Accuracy: 0.533549


In [25]:
print("NEAREST CENTROID")
model = NearestCentroid(metric='euclidean')
start = time.time()
model.fit(train_X,train_y)
end = time.time()
print("Training time: %f sec" %(end-start))

predicted_y = [] 
start = time.time()
for i in range(0,len(test_X)):
  y_i = model.predict([test_X[i]])
  predicted_y = np.append(predicted_y,y_i)
end = time.time()
#εμφάνιση αποτελεσμάτων
print("Test time: %f sec" %(end-start))
print("Accuracy: %f" %(accuracy_score(test_y,predicted_y)))

NEAREST CENTROID
Training time: 0.001134 sec
Test time: 0.603789 sec
Accuracy: 0.533874
