# Exploratory Data Analysis

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import copy 

In [2]:
# Load the Dirichlet GAN samples and the Dirichlet training samples.
# Both CSVs are read without a header row and kept as plain NumPy arrays.
new_gan_samples = pd.read_csv(
    '../data/dirichlet_gan_samples.csv',
    header=None,
).values
new_training_samples = pd.read_csv(
    '../data/dirichlet_100spec.csv',
    header=None,
).values


In [None]:
# Load the two tables from the phylogeny_data folder (no header row),
# keeping only the underlying NumPy arrays.
phylo_ds = pd.read_csv(
    '../data/phylogeny_data/simulated_otu_table_clean.csv',
    header=None,
).values
gan_ds = pd.read_csv(
    '../data/phylogeny_data/phylo_gan_samples.csv',
    header=None,
).values

In [None]:
# Sum of all entries in row 6 of the phylogeny table.
phylo_ds[6, :].sum()

# L2 Norm of mean and STD vectors

In [None]:
# Column-wise standard deviation: one entry per column of each dataset.
train_std_vec = phylo_ds.std(axis=0)
gan_std_vec = gan_ds.std(axis=0)

# Report the Euclidean (L2) length of each standard-deviation vector.
print("\nL2 Train std", np.linalg.norm(train_std_vec, ord=2), sep="\n")
print("\nL2 GAN std", np.linalg.norm(gan_std_vec, ord=2), sep="\n")


In [None]:
# Column-wise mean: one entry per column of each dataset.
train_mean_vec = phylo_ds.mean(axis=0)
gan_mean_vec = gan_ds.mean(axis=0)

# Report the Euclidean (L2) length of each mean vector.
print("\nL2 Train mean", np.linalg.norm(train_mean_vec, ord=2), sep="\n")
print("\nL2 GAN mean", np.linalg.norm(gan_mean_vec, ord=2), sep="\n")


# Counting elements greater than $\epsilon$

In [None]:
# Work on copies so the loaded arrays stay untouched, then overwrite every
# exact zero with the sentinel value 3.
# NOTE(review): after this patch the former zeros compare greater than any
# small epsilon, so they are included in "greater than epsilon" counts --
# confirm that this is the intended treatment of zeros.
alt_train = copy.copy(phylo_ds)
alt_train[np.equal(alt_train, 0)] = 3

alt_gan = copy.copy(gan_ds)
alt_gan[np.equal(alt_gan, 0)] = 3

In [None]:
def summarize_epsilon(data, epsilon):
    """Print how many entries of ``data`` are strictly greater than ``epsilon``.

    Three things are printed: the shape of ``data``, the total number of
    entries greater than ``epsilon``, and that number as a percentage of
    all entries in ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        Array to summarise; its ``shape`` and ``size`` attributes are used.
    epsilon : float
        Threshold for the strict ``>`` comparison.

    Returns
    -------
    None
        The summary is only printed.
    """
    print("Shape: "+str(data.shape))

    # Per-column counts first, then the grand total over all columns.
    greater_per_column = np.sum(data > epsilon, axis=0)

    print("\n Number of elements greater than epsilon")
    n_greater = np.sum(greater_per_column)
    print(n_greater)

    # The denominator must be the size of the array being summarised, not
    # the size of some other global array; an empty array reports 0.0
    # instead of dividing by zero.
    n_total = data.size
    percent = 100*(n_greater/n_total) if n_total else 0.0
    print("As % of data: "+str(percent))

In [None]:
# Threshold for the "greater than epsilon" summaries (10e-7, i.e. 1e-6).
epsilon = 1e-6
# The first message is Spanish for "Note: the zeros were patched".
print("Nota: Los ceros fueron parchados")
print(f"Epsilon: {epsilon}")
print("\n")

# Summarise the zero-patched training data, then the zero-patched GAN data.
for dataset in (alt_train, alt_gan):
    summarize_epsilon(dataset, epsilon)

In [None]:
# NOTE(review): `new_samples` was never assigned in this notebook, so this
# cell failed on a fresh kernel. It is assumed to be the Dirichlet GAN
# samples loaded as `new_gan_samples` -- confirm that this is the intended
# array.
new_samples = new_gan_samples

summarize_epsilon(new_samples, epsilon)

In [None]:
# Show the array dimensions, then inspect the first row and its total.
print(new_samples.shape)

s1 = new_samples[0]
print(s1)
s1.sum()

In [None]:
# Shallow copy of `new_samples` via the copy module, stored as `c_gan`.
c_gan = copy.copy(new_samples)

In [None]:
# Number of entries of c_gan that are strictly greater than 0.0001.
(c_gan > 0.0001).sum()

# Making Some Histograms

In [None]:
# NOTE(review): seaborn is imported here, mid-notebook, rather than in the
# first import cell; every later plotting cell depends on this cell having run.
import seaborn as sns
sns.set(color_codes=True)

In [None]:
# Display the dimensions of the training-sample array.
new_training_samples.shape

In [None]:
 
# Draw one random row index that is valid for BOTH arrays: the same index is
# used for the training and the GAN array, so it must not exceed the smaller
# of the two row counts.
n_rows = min(new_training_samples.shape[0], new_gan_samples.shape[0])
k = np.random.randint(n_rows)
print(k)
sub_data = new_training_samples[k,:]
sub_gan = new_gan_samples[k,:]
print(sub_data.shape)
# Overlay the two histograms on the same axes; labels make them tellable apart.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# consider sns.histplot once the installed seaborn version is confirmed.
sns.distplot(sub_data, bins=20, kde=False, label="training")
sns.distplot(sub_gan, bins=20, kde=False, label="GAN")
plt.legend();

In [None]:
# Per-column average over all training samples. The previous version stored
# the column SUM under this name because its division was commented out
# (and `//` would have floor-divided the floats).
average_of_samples = np.mean(new_training_samples, axis=0)

In [None]:
# Rescale the vector to unit Euclidean (L2) length.
normalized = np.divide(
    average_of_samples,
    np.linalg.norm(average_of_samples, ord=2),
)

In [None]:
# Sum of the components of the unit-length vector.
normalized.sum()

In [None]:
# Euclidean norm written out by hand: square root of the sum of squares.
np.sqrt(np.sum(np.square(average_of_samples)))

In [None]:
# L2 norm of the rescaled vector.
np.linalg.norm(normalized, 2)

# Dirichlet Testing

In [4]:
# Alpha vector that was used to generate the data: one hundred ones.
K = np.full(100, 1.0)

# Euclidean length of K, and K rescaled to unit Euclidean length.
norm_c = np.linalg.norm(K)
norm_k = K / norm_c
print(norm_k, norm_c, sep="\n")

[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
10.0


So we have to test whether the data lie on the simplex described by the numbers above.

In [5]:
# First row of the training samples.
p1 = new_training_samples[0]

In [8]:
def mean_dir(point):
    """Return the sum of ``point`` along axis 0 divided by its first dimension.

    For a 2-D array this is the per-column mean over the rows; for a 1-D
    array it is the scalar mean of the entries.
    """
    n_rows = point.shape[0]
    column_totals = np.sum(point, axis=0)
    return column_totals / n_rows

In [13]:
# Per-column mean of the training samples: column sums over the row count.
new_training_samples.sum(axis=0) / len(new_training_samples)

array([0.0099875 , 0.01003651, 0.01005461, 0.00997314, 0.00998334,
       0.00994712, 0.00996554, 0.01000543, 0.00993231, 0.00995206,
       0.01001143, 0.00998412, 0.01001896, 0.00999984, 0.00998962,
       0.00997912, 0.01007215, 0.01000925, 0.01003189, 0.01002811,
       0.00996221, 0.00992988, 0.01005335, 0.01003878, 0.00998893,
       0.01000134, 0.01001897, 0.0100137 , 0.01009487, 0.01006854,
       0.01000817, 0.01001273, 0.01008953, 0.01005597, 0.009919  ,
       0.01002267, 0.00993101, 0.00999787, 0.01008484, 0.00998836,
       0.01001259, 0.0099418 , 0.01002017, 0.01000241, 0.00995607,
       0.00997489, 0.01003136, 0.00996973, 0.01002616, 0.01000619,
       0.00999748, 0.01001442, 0.00996861, 0.0100293 , 0.01003508,
       0.01001499, 0.01005327, 0.00994187, 0.00995411, 0.01000185,
       0.00996092, 0.01001032, 0.0099979 , 0.01001819, 0.01005453,
       0.00999876, 0.0099688 , 0.00997128, 0.01006555, 0.0100433 ,
       0.01000842, 0.01004547, 0.00999212, 0.00999569, 0.00999

In [14]:
# Per-column mean of the GAN samples: column sums over the row count.
new_gan_samples.sum(axis=0) / len(new_gan_samples)

array([0.01006562, 0.01060201, 0.00939326, 0.01066261, 0.00839568,
       0.00867846, 0.00986383, 0.00892806, 0.00922239, 0.01042323,
       0.01241126, 0.00885244, 0.01135979, 0.01007721, 0.00952929,
       0.01038234, 0.00929688, 0.01146677, 0.01178913, 0.00889552,
       0.01077789, 0.01066141, 0.01132662, 0.01254065, 0.00970659,
       0.00947724, 0.0093698 , 0.00739294, 0.00969986, 0.01285481,
       0.00937918, 0.01069378, 0.00932177, 0.00928982, 0.01018118,
       0.01037383, 0.00944713, 0.01054213, 0.01063923, 0.0090338 ,
       0.01074361, 0.01037796, 0.01035835, 0.01001522, 0.00980194,
       0.00775463, 0.01167995, 0.00846307, 0.01153496, 0.00901075,
       0.00991259, 0.01049729, 0.01096462, 0.00910769, 0.01185234,
       0.0101241 , 0.01097751, 0.00877353, 0.00893083, 0.0099851 ,
       0.00880109, 0.01061159, 0.01036288, 0.01144762, 0.00914435,
       0.00994262, 0.01064267, 0.0103603 , 0.00698218, 0.00967797,
       0.00836856, 0.00885994, 0.01232335, 0.01006076, 0.00939