# CANDIDATE ELIMINATION ALGORITHM


In [1]:
import numpy as np
import pandas as pd

In [2]:
data_link = './titanic.csv'
raw_tdf   = pd.read_csv(data_link)
print(raw_tdf.head())

   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  


In [3]:
print(raw_tdf.columns)

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')


In [4]:
#drop name column
raw_tdf=raw_tdf.drop(['Name', 'Age','Fare'] , axis=1)
raw_tdf

Unnamed: 0,Survived,Pclass,Sex,Siblings/Spouses Aboard,Parents/Children Aboard
0,0,3,male,1,0
1,1,1,female,1,0
2,1,3,female,0,0
3,1,1,female,1,0
4,0,3,male,0,0
...,...,...,...,...,...
882,0,2,male,0,0
883,1,1,female,0,0
884,0,3,female,1,2
885,1,1,male,0,0


In [5]:
#mapping sex column 
''' male->0 female->1'''
mapping_df_sex = {'male':0 , 'female':1}
raw_tdf['Sex']= raw_tdf['Sex'].map(mapping_df_sex)
print(raw_tdf)

     Survived  Pclass  Sex  Siblings/Spouses Aboard  Parents/Children Aboard
0           0       3    0                        1                        0
1           1       1    1                        1                        0
2           1       3    1                        0                        0
3           1       1    1                        1                        0
4           0       3    0                        0                        0
..        ...     ...  ...                      ...                      ...
882         0       2    0                        0                        0
883         1       1    1                        0                        0
884         0       3    1                        1                        2
885         1       1    0                        0                        0
886         0       3    0                        0                        0

[887 rows x 5 columns]


In [6]:
training_data = raw_tdf.iloc[30:35 ,:]
print(training_data)

    Survived  Pclass  Sex  Siblings/Spouses Aboard  Parents/Children Aboard
30         0       1    0                        0                        0
31         1       1    1                        1                        0
32         1       3    1                        0                        0
33         0       2    0                        0                        0
34         0       1    0                        1                        0


In [7]:
testing_data = raw_tdf.iloc[5:10 , :]
testing_data

Unnamed: 0,Survived,Pclass,Sex,Siblings/Spouses Aboard,Parents/Children Aboard
5,0,3,0,0,0
6,0,1,0,0,0
7,0,3,0,3,1
8,1,3,1,0,2
9,1,2,1,1,0


In [8]:
testing_true_values = testing_data['Survived']
testing_true_values

5    0
6    0
7    0
8    1
9    1
Name: Survived, dtype: int64

In [9]:
testing_xis = testing_data.drop(['Survived'] , axis=True)
testing_xis

Unnamed: 0,Pclass,Sex,Siblings/Spouses Aboard,Parents/Children Aboard
5,3,0,0,0
6,1,0,0,0
7,3,0,3,1
8,3,1,0,2
9,2,1,1,0


In [10]:
training_features = training_data.iloc[:,1:]
print(training_features)

    Pclass  Sex  Siblings/Spouses Aboard  Parents/Children Aboard
30       1    0                        0                        0
31       1    1                        1                        0
32       3    1                        0                        0
33       2    0                        0                        0
34       1    0                        1                        0


In [11]:
training_target = training_data['Survived']
print(training_target)

30    0
31    1
32    1
33    0
34    0
Name: Survived, dtype: int64


In [12]:
def candidate_elimination(features , targets):
    specific_h =None
    for idx,val in enumerate(targets):
        if val==1:
            specific_h = features[idx]
            break
    general_h = [ ['?' for i in range(len(specific_h)) ] for j in range(len(specific_h)) ]
    print('Specific_hypothesis',specific_h , end="\n\n")
    print('General_hypothesis',general_h , end="\n\n")
    
    #its training time
    for idx , val in enumerate(features):
        if targets[idx]==1:
            if i in range(len(specific_h)):
                if specific_h[i]==val[i]:
                    #do nothing 
                    pass
                else :
                    #generalize 
                    #find-s algo basically
                    specific_h[i]='?'
                    general_h[i][i]='?'
        if targets[idx]==0 : #negative example found
            for i in range(len(specific_h)):
                if val[i]==specific_h[i]:
                    #generalize
                    general_h[i][i]='?'
                else :
                    #specific update in general hypothesis
                    general_h[i][i]=specific_h[i]
            
    return specific_h , general_h

In [13]:
def train(x,y):
    features = np.array(x)
    targets = np.array(y)
    specific_h , general_h = candidate_elimination(features,targets)
    
    quest_list = ['?' for i in range(len(general_h))]
    indx = [i for i ,val in enumerate(general_h) if val==quest_list ]
    for i in indx :
        general_h.remove(quest_list)
    return specific_h , general_h

In [14]:
specific_h , general_h = train(training_features,training_target)
print('After training \n\n\n')
print('Specific Hypothesis :\t',specific_h)
print('General Hypothesis :\t',general_h)

Specific_hypothesis [1 1 1 0]

General_hypothesis [['?', '?', '?', '?'], ['?', '?', '?', '?'], ['?', '?', '?', '?'], ['?', '?', '?', '?']]

After training 



Specific Hypothesis :	 [1 1 1 0]
General Hypothesis :	 [['?', 1, '?', '?']]


# Tesing time

In [15]:
def cealgo_match(xi ,hypothesis) :
    count=0
    lhypo = len(hypothesis)
    for i in range(lhypo):
        if xi[i]==hypothesis[i]:
            count+=1
    return (count/lhypo)

In [16]:
def predict(testing_xi , true_labels , s_hypothesis):
    score = 0
    xlen = len(testing_xi)
    testing_xi = np.array(testing_xi)
    true_lables = np.array(true_labels)
    for i in range(xlen):
        score+= cealgo_match(testing_xi[i] , s_hypothesis )
    return score/xlen

In [17]:
accuracy = predict(testing_xis,testing_true_values , specific_h)

In [18]:
print('Accuracy = {}%'.format(accuracy*100))

Accuracy = 35.0%
