In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

In [158]:

###################################

#creating instance of imported StandardScaler
scaler = StandardScaler()  

def SDS(myArrayX, threshold, maxLenOfArrayX, numIterations, numAgents):
        '''
            This function starts the following phases: 
                Initialisation, Test and Diffusion
            Returns: 
                myArrayX -- of type array
                model_ids -- of type array
            
        '''
        #transforming the data
        scaler.fit_transform(myArrayX) 
        
        #for storing IDs
        models=[]
        
        agent_counts=[] #to calculate the number of agents on a record
        
        while len(myArrayX) > maxLenOfArrayX:
            #################### INITIALISATION PHASE #######################

            #generating random id for picking a model
            id = np.random.randint(len(myArrayX)-1, size = 1)
            

            #taking a model from the search space
            model = myArrayX[id,:].copy()
            
            #deleting the model from search space
            myArrayX_copy= np.delete(myArrayX, id[0], axis = 0)
            
            #generatings ids for agents
            idx = np.random.randint(len(myArrayX)-1, size = numAgents)
            
            #assigning agents from search space
            agents =  myArrayX_copy[idx, :].copy()
            
            #setting inital status for all agents as 'active' 
            agents_status = np.array(['active']).repeat(len(agents))
    
            ################## END OF INITIALISATION PHASE #########################
            
            for numIterations in range(0, numIterations):   
                ################### TEST PHASE ################################
                
                for i in range(len(agents)):
                    
                    #generating random index for comparing dimesions
                    j = np.random.randint(myArrayX.shape[1], size = 1)
                    
                    if agents[i][j] - model[0][j] > threshold: #checking if agent's jth dimension is within threshold of model's jth
                        
                        #resetting status as 'inactive' if it is greater than threshold
                        agents_status[i] = 'inactive' 
                
                ################# END OF TEST PHASE ##################################
                 
                
            ############### DIFFUSION PHASE ###################
                for i in range(len(agents_status)):
                    
                    if agents_status[i] == 'inactive':
                       
                        #generating a random number for inactive agent to pick another agent
                        random_pick = np.random.randint(len(agents), size = 1) 
                        
                        if agents_status[random_pick] == 'active':
                           
                            #if the picked agent is active, the inactive agent moves to active agent
                            agents[i] = agents[random_pick] 
                        
                        else:
                           
                            random_pick2=np.random.randint(len(myArrayX), size = 1)
                            
                            #if picked agent is inactive too, the inactive agent picks a random record from search space
                            agents[i] = myArrayX[random_pick2].copy()
            
            agent,counts = np.unique(agents, return_counts = True,axis=0)
            max_count_id=np.argmax(counts) #agent that has maximum counts
            if max(counts)>2:
                agent_counts.append(max(counts)) #appending the count to a the list
                models.append(model)
                remove_id=np.where(np.equal(myArrayX,agent[max_count_id]).all(axis=1)==True)[0][0]
                myArrayX=np.delete(myArrayX,remove_id, axis=0)#deleting the record on where there are many agents from search space
                myArrayX=np.delete(myArrayX,id,axis=0)
                        
    
        return myArrayX,models,agent_counts

In [4]:
from sklearn.datasets import load_digits

In [270]:
logReg=LogisticRegression(C=1)
svc=SVC(kernel='linear',C=0.4,gamma=0.4)

In [252]:
X,y=load_digits(return_X_y=True)

In [7]:
X.shape

(1797, 64)

In [8]:
y.shape

(1797,)

In [9]:
np.bincount(y)

array([178, 182, 177, 183, 181, 182, 181, 179, 174, 180], dtype=int64)

In [257]:
y=np.where(y==1,1,0) #converting dataset to imbalanced dataset

In [258]:
np.bincount(y) #now we have an imbalanced dataset with 1615 instances of class 0 and 182 instances of class 1

array([1615,  182], dtype=int64)

In [265]:
X_train,X_test,y_train,y_test=train_test_split(X,y, random_state=0) #splitting the dataset into training and test data

In [266]:
svc.fit(X_train,y_train)
svc_score=svc.score(X_test,y_test)
print(svc_score)

0.904444444444


In [267]:
y_pred=svc.predict(X_test)

In [268]:
y_pred #though the classifier achieved 90.44 percent accuracy, it predicted every record in test data as 0

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [269]:
print(classification_report(y_pred,y_test)) #precision and recall for class one is 0. You should not always rely on accuracy

             precision    recall  f1-score   support

          0       1.00      0.90      0.95       450
          1       0.00      0.00      0.00         0

avg / total       1.00      0.90      0.95       450



  'recall', 'true', average, warn_for)


In [17]:
y_pred_prob=svc.decision_function(X_test)

In [18]:
tpr,tnr,thresholds=roc_curve(y_score=y_pred_prob,y_true=y_test)

In [19]:
auc(tpr,tnr) #area under curve is 0.5 which shows our classifier is performing bad

0.5

Using SDS we will try to improve the classifier performance

In [20]:
X_maxclass=X[np.where(y==0)[0]] #minority class

In [21]:
X_maxclass.shape

(1615, 64)

In [22]:
X_minclass=X[np.where(y==1)[0]] #majority class

In [23]:
X_minclass.shape

(182, 64)

In [159]:
X_SDS, models,agent_counts= SDS(X_maxclass, threshold = 0.4, maxLenOfArrayX = 400, numIterations = 50, numAgents = 100)

In [160]:
X_SDS.shape #we have reduced majority class to 400 samples using SDS through partial evaluation

(399, 64)

In [161]:
len(models)

608

In [174]:
model_array=np.array(models).squeeze()

In [272]:
X_maxclass=np.concatenate([X_SDS,model_array],axis=0) #ignoring the similar records for the model_ids

In [273]:
X=np.concatenate([X_maxclass,X_minclass],axis=0)

In [274]:
X.shape

(1189, 64)

In [275]:
y_0=np.zeros(X_maxclass.shape[0],dtype=np.int)

In [276]:
y_1=np.ones(X_minclass.shape[0],dtype=np.int)

In [277]:
y=np.concatenate([y_0,y_1],axis=0)

In [278]:
smote=SMOTE(random_state=0)
X,y=smote.fit_sample(X,y) #oversampling the majority class

In [279]:
np.bincount(y)

array([1007, 1007], dtype=int64)

In [280]:
X_train,X_test,y_train,y_test=train_test_split(X,y, random_state=0) 

In [281]:
svc.fit(X_train,y_train)

SVC(C=0.4, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.4, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [282]:
svc.score(X_train,y_train)

1.0

In [284]:
svc_score2=svc.score(X_test,y_test)
print(svc_score2)

0.984126984127


In [286]:
y_pred=svc.predict(X_test)

In [287]:
y_pred

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0,

In [290]:
print(classification_report(y_pred,y_test)) #it performs considerably better than previous outcomes

             precision    recall  f1-score   support

          0       0.97      1.00      0.98       251
          1       1.00      0.97      0.98       253

avg / total       0.98      0.98      0.98       504

