# CANDIDATE ELIMINATION ALGORITHM


In [1]:
import numpy as np
import pandas as pd

In [2]:
data_link = './titanic.csv'
raw_tdf   = pd.read_csv(data_link)
print(raw_tdf.head())

   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  


In [3]:
print(raw_tdf.columns)

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')


In [4]:
def encode_age(agedf):
    new_age=[]
    for i in agedf :
        if i<=18 :
            new_age.append(0)
            continue
        elif i>18 and i<=50 :
            new_age.append(1)
            continue
        else :
            new_age.append(2)
    return new_age

In [5]:
raw_tdf['Age']=pd.Series(encode_age(raw_tdf['Age']))
print(raw_tdf['Age'])

0      1
1      1
2      1
3      1
4      1
      ..
882    1
883    1
884    0
885    1
886    1
Name: Age, Length: 887, dtype: int64


In [6]:
#drop  column
raw_tdf=raw_tdf.drop(['Name','Fare'] , axis=1)
raw_tdf

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard
0,0,3,male,1,1,0
1,1,1,female,1,1,0
2,1,3,female,1,0,0
3,1,1,female,1,1,0
4,0,3,male,1,0,0
...,...,...,...,...,...,...
882,0,2,male,1,0,0
883,1,1,female,1,0,0
884,0,3,female,0,1,2
885,1,1,male,1,0,0


In [7]:
#mapping sex column 
''' male->0 female->1'''
mapping_df_sex = {'male':0 , 'female':1}
raw_tdf['Sex']= raw_tdf['Sex'].map(mapping_df_sex)
print(raw_tdf)

     Survived  Pclass  Sex  Age  Siblings/Spouses Aboard  \
0           0       3    0    1                        1   
1           1       1    1    1                        1   
2           1       3    1    1                        0   
3           1       1    1    1                        1   
4           0       3    0    1                        0   
..        ...     ...  ...  ...                      ...   
882         0       2    0    1                        0   
883         1       1    1    1                        0   
884         0       3    1    0                        1   
885         1       1    0    1                        0   
886         0       3    0    1                        0   

     Parents/Children Aboard  
0                          0  
1                          0  
2                          0  
3                          0  
4                          0  
..                       ...  
882                        0  
883                        0  


In [24]:
training_data = raw_tdf.iloc[110:125 ,:]
print(training_data)

     Survived  Pclass  Sex  Age  Siblings/Spouses Aboard  \
110         0       3    1    0                        1   
111         0       3    0    1                        0   
112         0       3    1    1                        1   
113         0       3    1    0                        0   
114         0       3    0    1                        0   
115         0       3    0    2                        0   
116         0       2    0    1                        1   
117         0       1    0    1                        0   
118         0       3    1    0                        4   
119         0       2    0    1                        2   
120         0       3    0    1                        0   
121         0       2    0    1                        1   
122         1       2    1    1                        0   
123         0       1    0    2                        0   
124         1       3    0    0                        1   

     Parents/Children Aboard  
110     

In [25]:
testing_data = raw_tdf.iloc[5:10 , :]
testing_data

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard
5,0,3,0,1,0,0
6,0,1,0,2,0,0
7,0,3,0,0,3,1
8,1,3,1,1,0,2
9,1,2,1,0,1,0


In [26]:
testing_true_values = testing_data['Survived']
testing_true_values

5    0
6    0
7    0
8    1
9    1
Name: Survived, dtype: int64

In [27]:
testing_xis = testing_data.drop(['Survived'] , axis=True)
testing_xis

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard
5,3,0,1,0,0
6,1,0,2,0,0
7,3,0,0,3,1
8,3,1,1,0,2
9,2,1,0,1,0


In [28]:
training_features = training_data.iloc[:,1:]
print(training_features)

     Pclass  Sex  Age  Siblings/Spouses Aboard  Parents/Children Aboard
110       3    1    0                        1                        0
111       3    0    1                        0                        0
112       3    1    1                        1                        0
113       3    1    0                        0                        0
114       3    0    1                        0                        0
115       3    0    2                        0                        0
116       2    0    1                        1                        0
117       1    0    1                        0                        1
118       3    1    0                        4                        2
119       2    0    1                        2                        0
120       3    0    1                        0                        0
121       2    0    1                        1                        0
122       2    1    1                        0                  

In [29]:
training_target = training_data['Survived']
print(training_target)

110    0
111    0
112    0
113    0
114    0
115    0
116    0
117    0
118    0
119    0
120    0
121    0
122    1
123    0
124    1
Name: Survived, dtype: int64


In [30]:
def candidate_elimination(features , targets):
    specific_h =None
    for idx,val in enumerate(targets):
        if val==1:
            specific_h = features[idx]
            break
    general_h = [ [-1 for i in range(len(specific_h)) ] for j in range(len(specific_h)) ]
    print('Specific_hypothesis',specific_h , end="\n\n")
    print('General_hypothesis',general_h , end="\n\n")
    
    #training 
    for idx , val in enumerate(features):
        if targets[idx]==1:
            for j in range(len(specific_h)):
                if specific_h[j]==val[j]:
                    #do nothing 
                    pass
                else :
                    #generalize 
                    #find-s algo basically
                    specific_h[j]=-1
                    general_h[j][j]=-1
                    
        if targets[idx]==0 : #negative example found
            for i in range(len(specific_h)):
                if val[i]==specific_h[i]:
                    #generalize
                    general_h[i][i]=-1
                else :
                    #specific update in general hypothesis
                    general_h[i][i]=specific_h[i]
            
    return specific_h , general_h

In [31]:
def train(x,y):
    features = np.array(x)
    targets = np.array(y)
    specific_h , general_h = candidate_elimination(features,targets)
    
    quest_list = [-1 for i in range(len(general_h))]
    indx = [i for i ,val in enumerate(general_h) if val==quest_list ]
    for i in indx :
        general_h.remove(quest_list)
    return specific_h , general_h

In [32]:
specific_h , general_h = train(training_features,training_target)
print('After training \n\n\n')
print('Specific Hypothesis :\t',specific_h)
print('General Hypothesis :\t',general_h)


Specific_hypothesis [2 1 1 0 0]

General_hypothesis [['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?']]

After training 



Specific Hypothesis :	 [-1 -1 -1 -1  0]
General Hypothesis :	 [[-1, '?', '?', '?', '?'], ['?', -1, '?', '?', '?'], ['?', '?', -1, '?', '?'], ['?', '?', '?', -1, '?'], ['?', '?', '?', '?', 0]]


<strong>Note: -1 means '?'

# Tesing time

In [20]:
def cealgo_match(xi ,hypothesis) :
    count=0
    lhypo = len(hypothesis)
    for i in range(lhypo):
        if xi[i]==hypothesis[i]:
            count+=1
    return (count/lhypo)

In [21]:
def predict(testing_xi , true_labels , s_hypothesis):
    score = 0
    xlen = len(testing_xi)
    testing_xi = np.array(testing_xi)
    true_lables = np.array(true_labels)
    for i in range(xlen):
        score+= cealgo_match(testing_xi[i] , s_hypothesis )
    return score/xlen

In [22]:
accuracy = predict(testing_xis,testing_true_values , specific_h)

In [23]:
print('Accuracy = {}%'.format(int(accuracy*100)))

Accuracy = 8%
