In [18]:
import pandas as pd
import numpy as np

from sklearn.mixture import GaussianMixture as GMM

# pandas dataframe df has a column 'cls'
# we will extract unique classes from
df = pd.read_csv('synthesized_features_guassian.csv')
arr_classes = np.sort(df.cls.unique())

# new we train a unique gmm for each class
# and save results in a dictionary
gmms = dict()
for cls in arr_classes:
    print('class', cls)
    
    # extract samples for the class and convert to numpy
    # and remove the class label which is the last column in the dataframe
    x_train = df[df['cls'] == cls].values[:, :-1]
    
    # train gmm, extract results, and save in dictionary
    gmm = GMM(n_components=1, covariance_type = 'full').fit(x_train)
    gmms[cls] = (gmm.means_, gmm.covariances_)
    print(gmms[cls])



class 0
(array([[13.625 ,  9.6875,  1.1875,  1.5625,  0.875 ,  0.75  ,  1.125 ,
         1.5   ,  1.25  ,  1.4375,  0.125 ,  0.    ,  0.    ,  0.1875,
         0.    ,  0.    ,  0.0625,  0.    ,  0.    ,  0.    ,  0.    ,
         0.    ,  0.0625,  0.    ,  0.    ,  0.1875,  0.    ,  0.    ,
         0.1875,  0.    ,  0.    ,  0.1875,  0.    ,  0.    ]]), array([[[ 9.91093760e+01, -8.55468750e+00,  1.63281250e+00, ...,
         -6.17187500e-01,  0.00000000e+00,  0.00000000e+00],
        [-8.55468750e+00,  9.58984475e+00,  1.21093750e-01, ...,
          8.08593750e-01,  0.00000000e+00,  0.00000000e+00],
        [ 1.63281250e+00,  1.21093750e-01,  2.77344750e-01, ...,
         -3.51562500e-02,  0.00000000e+00,  0.00000000e+00],
        ...,
        [-6.17187500e-01,  8.08593750e-01, -3.51562500e-02, ...,
          1.52344750e-01,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  1.00000000e-06,  0.00000000e+00],

In [19]:
import numpy.linalg as lnalg
from scipy.spatial.distance import mahalanobis

# first we will calculate average MD distance for each class
# we will use it later for a saliency check

for cls in arr_classes:
    # let's randomly select 1000 samples
    sub_sample = df[df['cls']==cls].sample(frac=1).values[:1000, :-1]
    
    mh = np.empty((sub_sample.shape[0]))
    mu, sig = gmms[cls]
    # we have to invert the matrix for mahalanobis calc
    isig = lnalg.inv(sig)
    
    for i in range(sub_sample.shape[0]):
        mh[i] = mahalanobis(sub_sample[i], mu, isig)
        print("The mahalanobis distance=",mh[i])
        
    print('class', cls, 'mean', mh.mean())

The mahalanobis distance= 1.8256977775848064
The mahalanobis distance= 3.790695509138361
The mahalanobis distance= 1.6066576883998307
The mahalanobis distance= 3.872938918689582
The mahalanobis distance= 3.8728505682867285
The mahalanobis distance= 3.872933322014574
The mahalanobis distance= 3.8729764866743777
The mahalanobis distance= 3.8729723299121246
The mahalanobis distance= 3.872980037283857
The mahalanobis distance= 2.801853802412378
The mahalanobis distance= 1.484873474367891
The mahalanobis distance= 3.790808289484413
The mahalanobis distance= 3.5329881091433544
The mahalanobis distance= 3.055626995288521
The mahalanobis distance= 3.8728654779191
The mahalanobis distance= 2.1123180382405495
class 0 mean 3.194502301552528
The mahalanobis distance= 5.567305156754006
The mahalanobis distance= 2.4005451367675814
The mahalanobis distance= 4.3703612696452
The mahalanobis distance= 3.351112736168968
The mahalanobis distance= 3.293497119670171
The mahalanobis distance= 2.5169558180552

In [20]:
inv_sig = dict()
mh = np.zeros((arr_classes.shape[0]))

smpls = df.sample(frac=1).values[:1000]
x = smpls[:, :-1]
labels = smpls[:, -1:]
results = np.zeros(x.shape[0])

# let's invert all sigmas from GMMs
for cls in arr_classes:
    mu,sig = gmms[cls]
    isig = lnalg.inv(sig)
    inv_sig[cls] = mu, isig

for i in range (x.shape[0]):
    for cls in arr_classes:
        mu, isig = inv_sig[cls]
        mh[cls] = mahalanobis(x[i], mu, isig)
        print(cls)
        print(mh[cls])
        print("***")
       
    # if gmm prediction matches the original class label let's save the result
    print(np.argmin(mh))
    print("###")
    if np.argmin(mh) == labels[i]:
        results[i] = 1
        
# let's calculate simple accuracy: True / All Samples
acc = results.sum() / results.shape[0]

print('Accuracy:', acc)
if acc > .9: print('Doing well')

0
1.6066576883998307
***
1
13.533039549875724
***
2
35337.66082093366
***
0
###
0
3.8729723299121246
***
1
1732.116467640881
***
2
39319.21158996108
***
0
###
0
465.1279937776531
***
1
2.5169558180552136
***
2
8930.285899803974
***
1
###
0
800.819393612747
***
1
3.1239415480426547
***
2
6837.397330199054
***
1
###
0
3376.367568232486
***
1
3.348756661789607
***
2
18547.240253251926
***
1
###
0
2437.327785978612
***
1
5.437768091025985
***
2
24392.622573648256
***
1
###
0
3.055626995288521
***
1
12.98662843048628
***
2
12922.848138299176
***
0
###
0
465.15497610595526
***
1
3.8121368962643687
***
2
7810.249932078604
***
1
###
0
1137.8814963682657
***
1
3.351112736168968
***
2
38128.07679784612
***
1
###
0
835.0528943078734
***
1
3.293497119670171
***
2
15354.152606313226
***
1
###
0
2.801853802412378
***
1
13.560830839877958
***
2
38765.32396749267
***
0
###
0
873.727882603664
***
1
3.7807299715224554
***
2
9578.622147917602
***
1
###
0
50091.86128999012
***
1
5.5657393001246005
***
2
6

In [161]:
cols = df.columns.tolist()
trg = 52/32
cnts = df[['cls', cols[0]]].groupby('cls').count().reset_index()
cnts['delta'] = cnts[cols[0]].max() // trg **-1 - cnts[cols[0]]
print(cnts['delta'])
print(cnts['delta'])
# NOTE: let's create a list of categorical columns so we can normalize
# them to 0 and 1 values after synthesizing new data

# let's see how many unique values each column has
cat_columns = [len(df[col].unique()) for col in df.columns]

# now let's assume that columns with only two unique values
# are categorical - typically true for one-hot encoded datasets
cat_columns =[0 if x > 2 else 1 for x in cat_columns]

# generate a new distribution for each column and class
new_class_arrays = []

for cls in arr_classes:
    dlt = int(cnts[cnts['cls']==cls]['delta'])
    if dlt <=0: continue
        
    desc_df = df[df['cls']==cls].describe()
    
    sub_arr = np.zeros((dlt, len(cols), 1))
    
    col_counter = 0
    for col in cols:
        sub_arr[:, col_counter] = np.random.normal(loc=desc_df[col]['mean'], 
                                                   scale=desc_df[col]['std'], 
                                                   size= (dlt, 1)
                                                  )
        col_counter += 1
    new_class_arrays.append(sub_arr)

new_samples = np.concatenate(new_class_arrays)
np_new_samples = np.array(new_samples)
#print("##")
#print(np_new_samples.shape)
#print(type(new_samples))
#print(new_samples)


newarr = new_samples.reshape(new_samples.shape[0], (new_samples.shape[1]*new_samples.shape[2]))

print(newarr.shape)

#data = [i for j in new_samples for i in j]
#df_new_samples = pd.DataFrame.from_records(data)



#df_new_samples=pd.DataFrame(np_new_samples)
#df_new_samples.to_csv('out.csv')

df=pd.DataFrame(newarr)
df.to_csv('out1.csv')

# now that we have our new samples let's convert the purturbed columns back
# to categorical columns using the list that we have created above

c_idx = 0
for c in cat_columns:
    if c == 1:
        # get slices, then update them which updates the parent array
        s = new_samples[:, c_idx, :]
        s[s < .5] = 0
        s[s >=.5] = 1
    c_idx +=1

0    35.0
1    19.0
2    49.0
Name: delta, dtype: float64
0    35.0
1    19.0
2    49.0
Name: delta, dtype: float64
(103, 35)


In [162]:
x = new_samples[:, :-1]
labels = new_samples[:, -1:]
results = np.zeros(x.shape[0])
print(labels)

for i in range (x.shape[0]):
    for cls in arr_classes:
        mu, isig = inv_sig[cls]
        mh[cls] = mahalanobis(x[i], mu, isig)
       
    # if gmm assignment is not the same as the original label then either discard or re-assign
    # we are going to re-assign the class
    if np.argmin(mh) != labels[i]:
        labels[i] = np.argmin(mh)
        
#then concat labels and new samples, turn into a dataframe, concat with original and train.
labels

[[[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[0.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[1.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]

 [[2.]]]


array([[[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[0.]],

       [[1.]],

       [[0.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[0.]],

       [[0.]],

       [[1.]],

       [[1.]],

       [[0.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[1.]],

       [[2.]],

       [[2.]],

       [[2.]],

       [[2.]],

       [[2.]],

       [[2.]],

       [[2.]],

       [[2.]],

       [

In [130]:


cols = df.columns.tolist()
trg = .4
cnts = df[['cls', cols[0]]].groupby('cls').count().reset_index()
cnts



Unnamed: 0,cls,grp_no
0,0,16
1,1,32
2,2,2


<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f1ff355c0f0>


Student  Amount
1        10        1
         20        1
         30        1
2        20        1
         40        1
         60        1
3        30        1
         60        1
         90        1
4        40        1
         80        1
         120       1
Name: Amount, dtype: int64