In [36]:
import pandas as pd
import numpy as np
import os
import sklearn
import math
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler

In [58]:
files_path = os.path.join(os.getcwd(), 'datasets')

dataset_files = [os.path.join(files_path, file) for file in os.listdir(files_path)]

features = pd.read_csv(dataset_files[0])
features.head(5)

(6512, 167)


In [4]:
X = features.iloc[:, :-1].values
y = features.iloc[:, -1].values

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [7]:
model = RandomForestClassifier(n_estimators=300, max_features="auto")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

model2 = svm.SVC(gamma='auto', C=100)
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

In [8]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred2))

Accuracy: 0.8042977743668457
Accuracy: 0.7735993860322333


In [66]:
model.predict_proba(X_test)

array([[0.96666667, 0.03333333],
       [0.41872222, 0.58127778],
       [0.33777778, 0.66222222],
       ...,
       [0.74666667, 0.25333333],
       [0.47666667, 0.52333333],
       [0.85666667, 0.14333333]])

In [13]:
class AbstractClassifier:
    def __init__(self):
        self.model = None
    def train_model(X_train, y_train):
        pass
    
class RandomForest(AbstractClassifier):
    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=300, max_features="auto")
    def train_model(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        
class SVM(AbstractClassifier):
    def __init__(self):
        self.model = svm.SVC(gamma='auto', C=100)
    def train_model(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        


In [20]:
class AbstractAppDomain:
    def __init__(self, AbstractClassifier):
        pass
    
    def get_distance(self, test, train, sort=True):
        distance = cdist(test, train, metric='euclidean')
        return distance

In [15]:
estimator = model.estimators_[5]

In [16]:
print(estimator)

DecisionTreeClassifier(max_features='auto', random_state=399976727)


In [73]:
ad = AbstractAppDomain(RandomForest())

scaler = StandardScaler()
train_normalized = scaler.fit_transform(X_train)
test_normalized = scaler.transform(X_test)
distance = ad.get_distance(train_normalized, train_normalized)
    
a_file.close()

distance.sort()
    
a_file.close()


kNN_distance = np.mean(distance[:, :6], axis = 1)
applicable = kNN_distance <= 1

print(applicable)
print(distance[:, :6])

[ True  True  True ... False  True False]
[[ 0.          0.          0.          0.4304106   1.06479812  1.77240324]
 [ 0.          0.53239906  0.53239906  0.86911622  1.3581071   1.60403405]
 [ 0.          0.          0.          0.          1.86074775  1.97324516]
 ...
 [ 0.          8.07760549 12.15063773 12.57191592 12.59539397 12.66153783]
 [ 0.          0.65674519  1.03919034  1.03919034  1.16763236  1.16763236]
 [ 0.          2.98971835  3.69625216  4.22664359  4.30341742  4.40525079]]


In [49]:
model.predict_prob

In [76]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import pandas as pd
from applicability_domain import ApplicabilityDomainDetector

ad = ApplicabilityDomainDetector()
ad.fit(X_train)
print('fit is OK.')

del ad
ad = ApplicabilityDomainDetector()
ad.fit_transform(X_train)

ad.transform(X_test)

support_train = ad.get_support(X_train)
support_test = ad.get_support(X_test)

X_train[support_train]
X_test[support_test]

print('success!')

y_pred3 = model.predict(X_test)

y_pred3[~support_test] = 0

print("Accuracy:",metrics.accuracy_score(y_test, y_pred3))

fit is OK.
[0.54460199 0.81600925 0.63899882 ... 9.67618182 0.8450651  3.27021372]
[ 2.67313122  0.30264438  8.2227843  ... 10.45146336  7.92043524
  4.21304547]
[0.54460199 0.81600925 0.63899882 ... 9.67618182 0.8450651  3.27021372]
[ 2.67313122  0.30264438  8.2227843  ... 10.45146336  7.92043524
  4.21304547]
success!
Accuracy: 0.7881811204911742


In [64]:
print(y_pred3)

[0. 1. 1. ... 0. 1. 0.]


In [67]:
print(support_test)

[ True  True  True ... False  True  True]


In [72]:
print(ad.kNN_train_test_distance_)

[ 2.67313122  0.30264438  8.2227843  ... 10.45146336  7.92043524
  4.21304547]


In [71]:
print(ad.train_normalized_)
print(ad.test_normalized_)

[[ 0.          0.         -0.02263176 ... -0.5510542   1.63592526
   0.        ]
 [ 0.          0.         -0.02263176 ... -0.93634834  0.81445375
   0.        ]
 [ 0.          0.         -0.02263176 ... -0.5510542   1.49901335
   0.        ]
 ...
 [ 0.          0.         -0.02263176 ...  0.21953408  1.49901335
   0.        ]
 [ 0.          0.         -0.02263176 ... -0.93634834  0.403718
   0.        ]
 [ 0.          0.         -0.02263176 ...  0.21953408 -1.37613694
   0.        ]]
[[ 0.          0.         -0.02263176 ... -0.93634834 -0.55466543
   0.        ]
 [ 0.          0.         -0.02263176 ... -0.93634834  1.63592526
   0.        ]
 [ 0.          0.         -0.02263176 ...  0.60482821  0.54062992
   0.        ]
 ...
 [ 0.          0.         -0.02263176 ... -0.93634834 -1.37613694
   0.        ]
 [ 0.          0.         -0.02263176 ...  0.99012235  0.67754184
   0.        ]
 [ 0.          0.         -0.02263176 ... -0.5510542  -1.37613694
   0.        ]]


In [77]:
print(ad.threshold_)

10.066294767550437
