In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.stats as s

In [2]:
# Load the raw mushroom table; the path is relative to the notebook's working directory.
MUSHROOM_CSV = "mushrooms.csv"
data = pd.read_csv(MUSHROOM_CSV)

In [3]:
# Preview the first five raw rows (categorical single-letter codes).
data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Integer-encode every feature column (everything after the first column,
# which holds the class label). Codes are assigned in order of first
# appearance, i.e. the first distinct value seen in a column becomes 0, the
# next becomes 1, and so on.
#
# The result is assigned back with `data[column] = ...` instead of calling
# `data[column].replace(..., inplace=True)`: the latter mutates a temporary
# Series, which pandas warns about and which leaves `data` untouched under
# Copy-on-Write. `pd.factorize` also guarantees an integer dtype, which the
# later fancy-indexing into an identity matrix needs.
feature_columns = data.columns[1:]
for column in feature_columns:
    codes, _ = pd.factorize(data[column])
    data[column] = codes
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,e,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
2,e,1,0,2,0,2,0,0,1,1,...,0,0,0,0,0,0,0,1,1,2
3,p,0,1,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,e,0,0,3,1,3,0,1,1,0,...,0,0,0,0,0,0,1,1,2,1


In [5]:
# One-hot encode each integer-coded feature column by using its codes as row
# indices into an identity matrix, then stack the per-column blocks side by
# side into a single design matrix X.
data_array = []
for column in data.columns[1:]:
    n_categories = len(data[column].unique())
    identity = np.eye(n_categories, n_categories)
    data_array.append(identity[data[column]])
X = np.concatenate(data_array, axis=1)
X.shape

(8124, 117)

In [6]:
# Centre every column of X on its mean, then form the (biased, 1/n)
# covariance matrix of the centred data.
column_means = np.mean(X, axis=0)
X_dash = X - column_means
inverse_n = 1 / X.shape[0]
X_cov = inverse_n * (X_dash.T @ X_dash)
print(X_cov.shape)
X_cov

(117, 117)


array([[ 2.47502461e-01, -2.50382973e-02, -1.77262282e-03, ...,
        -5.02562327e-03, -2.75784426e-03, -1.45766227e-02],
       [-2.50382973e-02,  5.25420725e-02, -2.19153587e-04, ...,
        -7.58855659e-03, -1.31492152e-03,  7.02794523e-04],
       [-1.77262282e-03, -2.19153587e-04,  3.92343103e-03, ...,
        -5.54671911e-04, -9.30917892e-05, -4.03397753e-04],
       ...,
       [-5.02562327e-03, -7.58855659e-03, -5.54671911e-04, ...,
         1.20987811e-01, -3.32803147e-03, -1.44214697e-02],
       [-2.75784426e-03, -1.31492152e-03, -9.30917892e-05, ...,
        -3.32803147e-03,  2.30751273e-02, -2.42038652e-03],
       [-1.45766227e-02,  7.02794523e-04, -4.03397753e-04, ...,
        -1.44214697e-02, -2.42038652e-03,  9.19242630e-02]])

In [7]:
# Factor the covariance matrix with an SVD, keep the first three left
# singular vectors as the projection basis, and project the centred data
# onto them.
factored_matrix = np.linalg.svd(X_cov)
Q, _singular_values, _right_vectors = factored_matrix
Q_tilda = Q[:, :3]
X_new = X_dash @ Q_tilda
X_new.shape

(8124, 3)

In [8]:
# Class labels as an (n_rows, 1) column vector.
labels = data['class'].to_numpy(copy=True).reshape(-1, 1)
labels

array([['p'],
       ['e'],
       ['e'],
       ...,
       ['e'],
       ['p'],
       ['e']], dtype=object)

In [9]:
# Wrap the projected features in a DataFrame (columns 0, 1, 2) and attach
# the class label as a fourth column.
new_df = pd.DataFrame(X_new)
new_df['class'] = labels
new_df

Unnamed: 0,0,1,2,class
0,-0.996245,-0.946892,0.802493,p
1,-1.441733,-0.164154,1.068175,e
2,-1.524085,-0.408746,0.853172,e
3,-1.151800,-0.675820,0.629840,p
4,-0.617503,-0.996143,1.725882,e
5,-1.500324,0.023611,0.702832,e
6,-1.431616,-0.293342,0.904024,e
7,-1.577734,-0.250673,0.669896,e
8,-0.905198,-0.571045,0.547657,p
9,-1.406991,-0.157730,0.930824,e


In [10]:
# Row counts per class in the full dataset ("p" and "e" labels).
# Note these variables hold raw counts, not probabilities.
is_class_p = data["class"] == "p"
is_class_e = data["class"] == "e"
poissonous_prior = len(data[is_class_p])
print("poissonous_prior:", poissonous_prior)
non_poissonous_prior = len(data[is_class_e])
print("non_poissonous_prior:", non_poissonous_prior)

poissonous_prior: 3916
non_poissonous_prior: 4208


In [11]:
# Target sizes for the 70 / 20 / 10 train / cross-validation / test split.
# Each size is truncated towards zero, so the three may not sum to the
# total number of rows.
total_rows = new_df.shape[0]

training_data_len = int(0.7 * total_rows)
print("training_data_len", training_data_len)

cv_data_len = int(0.2 * total_rows)
print("cv_data_len", cv_data_len)

testing_data_len = int(0.1 * total_rows)
print("testing_data_len", testing_data_len)

training_data_len 5686
cv_data_len 1624
testing_data_len 812


In [12]:
# Build a class-balanced training set: the first half-quota of "p" rows
# followed by the first half-quota of "e" rows, in original row order.
half_training_len = training_data_len // 2
rows_p = new_df[new_df["class"] == "p"]
rows_e = new_df[new_df["class"] == "e"]
poissonous_prior_data_t = rows_p.iloc[:half_training_len]
non_poissonous_prior_data_t = rows_e.iloc[:half_training_len]
training_data = pd.concat([poissonous_prior_data_t, non_poissonous_prior_data_t])
training_data.shape

(5686, 4)

In [13]:
# Everything not used for training: the rows of each class that come after
# the training half-quota. The result is converted to a NumPy array (object
# dtype, because it mixes floats with the string label) for the shuffle that
# follows.
training_half = training_data_len // 2
leftover_p = new_df[new_df["class"] == "p"]
leftover_e = new_df[new_df["class"] == "e"]
poissonous_prior_data_remaining = leftover_p.iloc[training_half:]
non_poissonous_prior_data_remaining = leftover_e.iloc[training_half:]
remaining_data = pd.concat(
    [poissonous_prior_data_remaining, non_poissonous_prior_data_remaining]
)
remaining_data = np.array(remaining_data)

In [14]:
# Shuffle the held-out rows so the CV and test sets both contain a mix of
# classes (the array is ordered "p" rows first, then "e" rows).
#
# The permutation length is taken from the array itself rather than a
# hard-coded row count, and the generator is seeded so that Restart & Run All
# reproduces the same CV / test split and therefore the same accuracy.
shuffle_rng = np.random.default_rng(42)
random_indices = shuffle_rng.permutation(remaining_data.shape[0])
remaining_data = remaining_data[random_indices]
remaining_data

array([[-0.12294739870270502, -0.47709052548650605, 2.175656869191343,
        'e'],
       [-0.3011212457194453, -1.0398987199517027, 0.8136129559574607,
        'e'],
       [2.1214095462787794, -0.9562872561774101, -0.5579522298862288,
        'p'],
       ...,
       [-1.6387566661801394, -0.09278259610523197, -1.3849261990040374,
        'e'],
       [2.5715272132615703, -0.7631496666797118, -0.20058262231154714,
        'p'],
       [2.347082168250282, -0.7201651772508864, -0.3326035670710436, 'p']],
      dtype=object)

In [15]:
# Turn the shuffled array back into a DataFrame with feature columns 0, 1, 2
# and a "class" column.
#
# The array is object dtype (floats mixed with string labels), so the feature
# slice is cast back to float explicitly; otherwise the three feature columns
# would be stored as Python objects.
remaining_data_labels = remaining_data[:, 3].reshape(remaining_data.shape[0], 1)
print("remaining_data_labels", remaining_data_labels)
remaining_features = remaining_data[:, 0:3].astype(float)
remaining_data = pd.DataFrame(data=remaining_features)
# Flatten the (n, 1) label array so the column is assigned from a 1-D array.
remaining_data["class"] = remaining_data_labels.ravel()
remaining_data

remaining_data_labels [['e']
 ['e']
 ['p']
 ...
 ['e']
 ['p']
 ['p']]


Unnamed: 0,0,1,2,class
0,-0.122947,-0.477091,2.17566,e
1,-0.301121,-1.0399,0.813613,e
2,2.12141,-0.956287,-0.557952,p
3,-1.30565,-0.0553802,-1.46016,e
4,-0.0955308,-0.218149,2.01315,e
5,-0.281287,-0.867075,0.566099,e
6,-1.46521,-0.338428,-1.33769,e
7,2.35184,-1.13998,-0.993599,p
8,2.42852,-1.09293,-0.0712099,p
9,-0.243407,-0.933408,0.35877,e


In [16]:
# Split the shuffled held-out rows: the first `cv_data_len` rows form the
# cross-validation set, all rows after that form the test set.
cv_data = remaining_data.iloc[:cv_data_len]
testing_data = remaining_data.iloc[cv_data_len:]

print("cv_data len:", cv_data.shape)
print("cv_data:", cv_data.head())
print("**********************")
print("testing_data len:", testing_data.shape)
print("testing_data:", testing_data.head())

cv_data len: (1624, 4)
cv_data:            0          1         2 class
0  -0.122947  -0.477091   2.17566     e
1  -0.301121    -1.0399  0.813613     e
2    2.12141  -0.956287 -0.557952     p
3   -1.30565 -0.0553802  -1.46016     e
4 -0.0955308  -0.218149   2.01315     e
**********************
testing_data len: (814, 4)
testing_data:              0         1          2 class
1624    1.8113  -1.61194   -1.11754     p
1625 -0.211956 -0.302379   0.279856     e
1626   1.50946  -1.88445 -0.0880146     p
1627 -0.519562 -0.850688    2.04426     e
1628   2.02978  -1.25137  -0.168174     p


In [17]:
# Estimate the Gaussian parameters (mean vector and covariance matrix) of the
# three projected features for the "p" training rows, and print the
# covariance determinant.
train_features_p = training_data[training_data["class"] == "p"].iloc[:, 0:3]
miu_hat_p = train_features_p.mean().to_numpy()
sigma_hat_p = train_features_p.cov().to_numpy()
print(np.linalg.det(sigma_hat_p))

0.3737782436279567


In [18]:
# Estimate the Gaussian parameters (mean vector and covariance matrix) of the
# three projected features for the "e" training rows, and print the
# covariance determinant.
train_features_e = training_data[training_data["class"] == "e"].iloc[:, 0:3]
miu_hat_e = train_features_e.mean().to_numpy()
sigma_hat_e = train_features_e.cov().to_numpy()
print(np.linalg.det(sigma_hat_e))


0.012859141870228779


In [19]:
# Class priors: the fraction of training rows carrying each label.
n_training_rows = len(training_data)
n_training_p = len(training_data[training_data["class"] == "p"])
n_training_e = len(training_data[training_data["class"] == "e"])
poisonous_prior = n_training_p / n_training_rows
non_poisonous_prior = n_training_e / n_training_rows

In [22]:
def mocktest(data):
    """Classify each row of `data` as 'p' or 'e' with a Gaussian Bayes rule.

    The first three columns of `data` are treated as the feature vector. Each
    class is scored as (multivariate normal density) x (class prior), using
    the notebook-level estimates `miu_hat_p` / `sigma_hat_p` /
    `poisonous_prior` and `miu_hat_e` / `sigma_hat_e` / `non_poisonous_prior`.
    A row is labelled 'p' when the 'p' score is strictly greater, else 'e'.

    The comparison is carried out in log space (log-density + log-prior).
    This ranks the classes identically to comparing the raw products, but it
    does not underflow to 0.0 for points far from both class means, where the
    raw comparison would degenerate to `0 > 0` and always answer 'e'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first three columns are the numeric features.

    Returns
    -------
    numpy.ndarray
        1-D object array of 'p' / 'e' labels, one per input row.
    """
    inputs = np.asarray(data.iloc[:, 0:3], dtype=float)

    # log(0) for an empty class is -inf, which is the intended score; silence
    # the divide-by-zero warning NumPy would otherwise emit for it.
    with np.errstate(divide="ignore"):
        log_prior_p = np.log(poisonous_prior)
        log_prior_e = np.log(non_poisonous_prior)

    log_posterior_p = s.multivariate_normal.logpdf(inputs, miu_hat_p, sigma_hat_p) + log_prior_p
    log_posterior_e = s.multivariate_normal.logpdf(inputs, miu_hat_e, sigma_hat_e) + log_prior_e

    # logpdf returns a scalar for a single row; keep the result one-dimensional.
    boolean_mask = np.atleast_1d(log_posterior_p > log_posterior_e)
    return np.where(boolean_mask, 'p', 'e').astype(object)

In [23]:
# Predict labels for the cross-validation set and show them next to the
# ground-truth labels taken from the "class" column.
predicted_results = mocktest(cv_data)
print("predicted_results", predicted_results)
Actual_results = np.array(cv_data['class'])
# Print the ground truth here, not the predictions a second time.
print("Actual_results", Actual_results)

predicted_results ['e' 'e' 'p' ... 'p' 'p' 'p']
Actual_results ['e' 'e' 'p' ... 'p' 'p' 'p']


In [24]:
# Accuracy: the share of cross-validation rows whose predicted label equals
# the actual label.
is_correct = predicted_results == Actual_results
n_correct = int(np.sum(is_correct))
Accuracy = n_correct / len(Actual_results)
Accuracy

0.8238916256157636