In [1]:
import sys
import os
import numpy as np
import pandas as pd
from src import multivariate_os, nominal_os
from collections import Counter
from sklearn.model_selection import train_test_split

In [8]:
# Multivariate over-sampling
def mndo(pos, num_minority):
    """Run the three-stage over-sampling pipeline provided by ``multivariate_os``.

    The stages are chained in this order: ``find_zerostd`` -> ``no_corr`` ->
    ``mnd_os``. The first two each return a (possibly transformed) sample set
    plus one extra result; both extra results are passed on to ``mnd_os``.

    Parameters
    ----------
    pos : minority-class samples handed to the first stage
        (presumably a DataFrame of continuous features -- confirm in
        ``multivariate_os``).
    num_minority : value forwarded unchanged to every stage
        (the caller computes it as the number of samples to synthesize).

    Returns
    -------
    Whatever ``multivariate_os.mnd_os`` returns (the generated samples).
    """
    samples, zero_std_info = multivariate_os.find_zerostd(pos, num_minority)
    samples, no_corr_info = multivariate_os.no_corr(samples, num_minority)

    return multivariate_os.mnd_os(samples, num_minority, zero_std_info, no_corr_info)

In [3]:
# calc euclidean distance
def distance(pos, pos_gen, k=5):
    """Find the nearest original samples for every generated sample.

    For each row of `pos_gen` the Euclidean distance to every row of `pos`
    is computed and the `k` closest rows of `pos` are kept.

    Parameters
    ----------
    pos : pandas.DataFrame or array-like, shape (n_pos, n_features)
        Original samples (numeric features only).
    pos_gen : pandas.DataFrame or array-like, shape (n_gen, n_features)
        Generated samples, with the same feature columns as `pos`.
    k : int, default 5
        Number of nearest neighbours to keep per generated sample.

    Returns
    -------
    numpy.ndarray of int, shape (n_gen, min(k, n_pos))
        Row ``i`` lists the *positional* (0-based, ``iloc``-style) row numbers
        of `pos` closest to generated sample ``i``, nearest first. These are
        positions, not index labels of `pos`.

    Raises
    ------
    ValueError
        If `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')

    pos_arr = np.asarray(pos, dtype=float)
    gen_arr = np.asarray(pos_gen, dtype=float)
    n_neighbors = min(k, len(pos_arr))

    # Pre-allocating the result also covers an empty `pos_gen`: the loop is
    # skipped and an array of shape (0, n_neighbors) is returned.
    key = np.empty((len(gen_arr), n_neighbors), dtype=np.intp)

    # One generated sample at a time keeps peak memory at O(n_pos * n_features)
    # while the distance computation itself stays vectorized.
    for i, sample in enumerate(gen_arr):
        dist = np.linalg.norm(pos_arr - sample, axis=1)
        # A stable sort makes the order of equidistant neighbours deterministic.
        key[i] = np.argsort(dist, kind='stable')[:n_neighbors]

    return key

In [4]:
# extract nearest nominal samples
# NOTE(review): this definition shadows the `nominal_os` name imported from
# `src` in the import cell; after this cell runs, the imported one is no
# longer reachable under that name.
def nominal_os(key, nominal):
    """Derive nominal feature values for generated samples by majority vote.

    Parameters
    ----------
    key : numpy.ndarray, shape (n_gen, n_neighbours)
        Positional (``iloc``) row numbers into `nominal`; row ``i`` lists the
        neighbours of generated sample ``i``.
    nominal : pandas.DataFrame
        Nominal features of the original samples.

    Returns
    -------
    pandas.DataFrame, shape (n_gen, n_nominal_columns)
        For each generated sample, the most frequent value of every nominal
        column among its neighbours. Columns are numbered 0..n-1 (the column
        names of `nominal` are not carried over).
    """
    generated_rows = []

    for gen_idx in range(key.shape[0]):
        # Collect the nominal rows of this generated sample's neighbours.
        neighbour_rows = [
            nominal.iloc[key[gen_idx][nbr_idx]]
            for nbr_idx in range(key.shape[1])
        ]
        neighbours = pd.DataFrame(neighbour_rows)

        # Most frequent value per column (value, frequency -> keep the value).
        # Ties are resolved by Counter in favour of the value encountered first.
        modes = [
            Counter(neighbours[column]).most_common(1)[0][0]
            for column in neighbours.columns
        ]
        generated_rows.append(modes)

    return pd.DataFrame(generated_rows)

In [5]:
# Load dataset
# Continuous and nominal features are read from two separate CSV files; the
# dataset folder is hard-coded to `abalone_19` in this notebook. Both frames
# are expected to carry a `Label` column (it is used by the next cell).
try:
    data = pd.read_csv('Predataset/abalone_19/continuous.csv')
    nominal = pd.read_csv('Predataset/abalone_19/nominal.csv')
    # Command-line variant (dataset name taken from sys.argv[1]), kept for
    # reference; it is disabled here.
    #data = pd.read_csv('Predataset/{}/continuous.csv'.format(sys.argv[1]))
    #nominal = pd.read_csv('Predataset/{}/nominal.csv'.format(sys.argv[1]))
    #file_name = sys.argv[1]
except IndexError:
    # Only relevant for the sys.argv variant above (missing argument); the
    # hard-coded paths do not index into sys.argv.
    sys.exit('error: Must specify dataset file')
except FileNotFoundError:
    # Raised by pd.read_csv when either CSV path does not exist.
    sys.exit('error: No such file or directory')

In [6]:
# Separate features and target as plain NumPy arrays
X = np.array(data.drop('Label', axis=1))
y = np.array(data.Label)

# Keep only the positive-class rows (Label == 1), without the label column.
# NOTE(review): `nominal` is rebound to its filtered version here, so
# re-running this cell without reloading the data fails because the 'Label'
# column has already been dropped.
pos = data[data.Label == 1].drop('Label', axis=1)
nominal = nominal[nominal.Label == 1].drop('Label', axis=1)

# Number of samples to synthesize: the difference between the class counts
# in the training split (70% of the data, fixed seed).
# Assumes the two classes are labelled -1 and 1 -- confirm against the CSV.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42)

cnt = Counter(y_train)
num_minority = int(cnt[-1] - cnt[1])

In [9]:
# Over-sampling: generate synthetic continuous samples from the positive-class
# rows. `num_minority` is the class-count difference computed in the split cell.
pos_gen = mndo(pos, num_minority)

Searching no correlation:  10%|█         | 10/100 [00:00<00:00, 94.72it/s]

Not found zero std.


Multi normal dist over-sampling:   1%|          | 1/100 [00:00<00:15,  6.31it/s]

Not found no correlation.




In [10]:
# For every generated sample, look up its nearest original positive samples,
# then derive its nominal feature values from those neighbours.
key = distance(pos, pos_gen)
nominal_gen = nominal_os(key, nominal)

In [11]:
# Sanity check: the nominal and continuous generated frames should have the
# same number of rows (one row per generated sample).
print(nominal_gen.shape)
print(pos_gen.shape)

(2875, 3)
(2875, 7)


In [17]:
# Place the generated continuous and nominal features side by side (column-wise).
# NOTE(review): pd.concat with axis=1 aligns rows on the index, so both frames
# need matching indexes -- confirm the index of `pos_gen`.
pd.concat([pos_gen, nominal_gen], axis=1)

Unnamed: 0,3,4,5,6,7,8,9,0,1,2
0,0.610520,0.406953,0.169064,1.451173,0.263414,0.165909,0.326260,1.0,0.0,0.0
1,0.588489,0.496700,0.192935,1.919198,0.463415,0.285538,0.227183,1.0,0.0,0.0
2,0.755533,0.501587,0.172132,1.062805,0.145460,0.260071,0.423490,0.0,0.0,1.0
3,0.532379,0.433247,0.192943,1.044674,0.531064,0.239063,0.427487,1.0,0.0,0.0
4,0.488770,0.486348,0.170849,0.664584,0.582732,0.122482,0.276180,0.0,0.0,1.0
5,0.413272,0.482966,0.201726,0.887276,0.749650,0.324945,0.387428,1.0,0.0,0.0
6,0.607876,0.558336,0.145926,0.539166,0.221285,0.162369,0.272385,0.0,0.0,0.0
7,0.587054,0.493006,0.174792,0.832885,0.343009,0.140704,0.430448,1.0,0.0,0.0
8,0.591796,0.430955,0.157513,0.614941,0.518965,0.075951,0.364243,0.0,0.0,0.0
9,0.546089,0.549018,0.156138,1.872302,0.560541,0.070153,0.315253,1.0,0.0,0.0


In [14]:
# Preview the first rows of the original positive-class continuous features.
pos.head()

Unnamed: 0,3,4,5,6,7,8,9
9,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32
33,0.68,0.55,0.175,1.798,0.815,0.3925,0.455
128,0.7,0.535,0.16,1.7255,0.63,0.2635,0.54
256,0.56,0.45,0.185,1.07,0.3805,0.175,0.41
293,0.565,0.455,0.175,1.013,0.342,0.207,0.35
