In [91]:
from sklearn.datasets import load_iris
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from math import sqrt
from math import pi
from math import exp
from sklearn.utils import shuffle
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [92]:
#Getting the data
iris = load_iris()
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.7,3.0,4.2,1.2,1.0
1,4.9,2.4,3.3,1.0,1.0
2,6.7,3.3,5.7,2.1,2.0
3,6.4,2.7,5.3,1.9,2.0
4,4.6,3.2,1.4,0.2,0.0
...,...,...,...,...,...
145,6.5,3.0,5.5,1.8,2.0
146,6.1,2.6,5.6,1.4,2.0
147,5.0,2.3,3.3,1.0,1.0
148,5.4,3.7,1.5,0.2,0.0


In [93]:
########### PCA ##############
#scalar = StandardScaler()
#scalar.fit(iris['data']) 
#scaled_data = scalar.transform(iris['data'])

  

pca = PCA(n_components = 4) 
pca.fit(iris['data']) 
x_pca = pca.transform(iris['data']) 
df2 = pd.DataFrame(data= np.c_[x_pca, iris['target']],
                     columns= iris['feature_names'] + ['target'])

#dataframe that is preproccesed with PCA
df2 = shuffle(df2)
df2.reset_index(inplace=True, drop=True)
df2

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,-2.849369,-0.940961,-0.349230,-0.319987,0.0
1,-2.562320,0.367719,-0.078494,0.014173,0.0
2,2.144243,0.140064,0.734879,-0.055542,2.0
3,-2.199820,0.872839,-0.120306,-0.027052,0.0
4,0.166413,-0.681927,-0.060009,-0.029622,1.0
...,...,...,...,...,...
145,0.230548,-0.404386,-0.229410,-0.016930,1.0
146,-2.590006,0.229044,-0.080082,0.013749,0.0
147,1.390189,-0.282661,0.362910,0.155039,2.0
148,1.900942,0.116628,0.723252,-0.044595,2.0


In [94]:
#gaussian probabilty function
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [95]:
#kfold cross validation function
def kfold(n,dataset):
    accs=[]
    preccs=[]
    recs=[]
    f1s=[]
    subset=[]
    #spliting the data n-ways for n-times and testing with each part as a test-set and the rest as a Training set
    for i in range(n):
        t0=[]
        t1=[]
        t2=[]
        splits=np.array_split(dataset, n)
        
        test=splits.pop(i)
        testDF=pd.DataFrame(test)
        
        trainDF=pd.DataFrame(splits[0])
        for i in range(1,n-1):
            trainDF.merge(pd.DataFrame(splits[i]))
        
        
        #preparing the training data and the model
        t0=trainDF[trainDF['target']==0]
        t1=trainDF[trainDF['target']==1]
        t2=trainDF[trainDF['target']==2]
        
        t0_s=[[mean(t0['sepal length (cm)'])],[mean(t0['sepal width (cm)'])],[mean(t0['petal length (cm)'])],[mean(t0['petal width (cm)'])]]
        t1_s=[[mean(t0['sepal length (cm)'])],[mean(t1['sepal width (cm)'])],[mean(t1['petal length (cm)'])],[mean(t1['petal width (cm)'])]]
        t2_s=[[mean(t0['sepal length (cm)'])],[mean(t2['sepal width (cm)'])],[mean(t2['petal length (cm)'])],[mean(t2['petal width (cm)'])]]
        
        stats=[t0_s,t1_s,t2_s]
        stats[0][0].append(std(t0['sepal length (cm)']))
        stats[0][1].append(std(t0['sepal width (cm)']))
        stats[0][2].append(std(t0['petal length (cm)']))
        stats[0][3].append(std(t0['petal width (cm)']))

        stats[1][0].append(std(t0['sepal length (cm)']))
        stats[1][1].append(std(t0['sepal width (cm)']))
        stats[1][2].append(std(t0['petal length (cm)']))
        stats[1][3].append(std(t0['petal width (cm)']))

        stats[2][0].append(std(t0['sepal length (cm)']))
        stats[2][1].append(std(t0['sepal width (cm)']))
        stats[2][2].append(std(t0['petal length (cm)']))
        stats[2][3].append(std(t0['petal width (cm)']))
        pred=[]
        
        #predicting the test-set
        for k in range(testDF.shape[0]):
            c1=0
            c2=0
            c3=0
            
            subject=testDF.iloc[k]
            
            c1+=calculate_probability(subject['sepal length (cm)'], stats[0][0][0], stats[0][0][1])
            c1+=calculate_probability(subject['sepal width (cm)'], stats[0][1][0], stats[0][1][1])
            c1+=calculate_probability(subject['petal length (cm)'], stats[0][2][0], stats[0][2][1])
            c1+=calculate_probability(subject['petal width (cm)'], stats[0][3][0], stats[0][3][1])
            
            c2+=calculate_probability(subject['sepal length (cm)'], stats[1][0][0], stats[1][0][1])
            c2+=calculate_probability(subject['sepal width (cm)'], stats[1][1][0], stats[1][1][1])
            c2+=calculate_probability(subject['petal length (cm)'], stats[1][2][0], stats[1][2][1])
            c2+=calculate_probability(subject['petal width (cm)'], stats[1][3][0], stats[1][3][1])
            
            c3+=calculate_probability(subject['sepal length (cm)'], stats[2][0][0], stats[2][0][1])
            c3+=calculate_probability(subject['sepal width (cm)'], stats[2][1][0], stats[2][1][1])
            c3+=calculate_probability(subject['petal length (cm)'], stats[2][2][0], stats[2][2][1])
            c3+=calculate_probability(subject['petal width (cm)'], stats[2][3][0], stats[2][3][1])
            
            #Comparing the probabilties of a element being in a specific group
            if(c1>c2 and c1>c3):
                pred.append(0)
            elif(c2>c1 and c2>c3):
                pred.append(1)
            else:
                pred.append(2)
            
        subscore=0  
        n1=0
        n0=0
        n2=0
        #comparing the predictions with actual values
        for p in range(len(pred)):  
            if(pred[p]==testDF['target'].iloc[p]):
                subscore+=1
            if(pred[p]==0 and testDF['target'].iloc[p]==0):
                n0+=1
            if(pred[p]==1 and testDF['target'].iloc[p]==1):
                n1+=1
            if(pred[p]==2 and testDF['target'].iloc[p]==2):
                n2+=1
        acc=subscore/len(pred)    
        rec=mean([n0/Counter(testDF['target'])[0],n1/Counter(testDF['target'])[1],n2/Counter(testDF['target'])[2]])
        precc=mean([n0/Counter(pred)[0],n1/Counter(pred)[1],n2/Counter(pred)[2]])
        f1= 2*(precc*rec)/(precc+rec)
        
        
        accs.append(acc)#accuracy
        recs.append(rec)#recall
        preccs.append(precc)#precision
        f1s.append(f1)#f1
        
    return mean(accs),mean(recs),mean(preccs),mean(f1s)



In [96]:
#taking the mean and std of scores 
acc1,recall1,prec1,f1_1=kfold(10,df)
print("Accuracy:{:.3f} | Recall:{:.3f} | Precision:{:.3f} | F1 Score {:.3f}".format(acc1,recall1,prec1,f1_1))


Accuracy:0.840 | Recall:0.852 | Precision:0.855 | F1 Score 0.853


In [97]:

#taking the mean and std of scores 
acc1,recall1,prec1,f1_1=kfold(10,df2)
print("Accuracy:{:.3f} | Recall:{:.3f} | Precision:{:.3f} | F1 Score {:.3f}".format(acc1,recall1,prec1,f1_1))


Accuracy:0.427 | Recall:0.432 | Precision:0.458 | F1 Score 0.443
