In [13]:
import pandas as pd
import numpy as np
from sklearn import neighbors

seed = 10
np.random.seed(seed)

In [17]:
train_features = pd.read_csv('data/train_features.csv')
train_targets = pd.read_csv('data/train_targets_scored.csv')


id_c9edd4732    1
id_95ba069bd    1
id_b39e51fe1    1
id_6a5ff9eca    1
id_db66cbf23    1
               ..
id_41cea8a22    1
id_26fdd51d8    1
id_b46e9cf3a    1
id_a111fe002    1
id_69fd62552    1
Name: sig_id, Length: 23814, dtype: int64

In [21]:
df = train_features
df.reset_index(drop=True, inplace=True)

def preprocess_df(df):
    df.loc[:, 'cp_type'] = df.loc[:, 'cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    return df

df = preprocess(df)

X = df.drop(df.columns[0], axis=1).values
X = X.astype('float32')
y = df.iloc[:, 0].values

In [22]:
def get_tail_label(df: pd.DataFrame, ql=[0.05, 1.]) -> list:
    """
    Find the underrepresented targets.
    Underrepresented targets are those which are observed less than the median occurance.
    Targets beyond a quantile limit are filtered.
    """
    irlbl = df.sum(axis=0)
    print(irlbl.shape)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & ((irlbl < irlbl.quantile(ql[1])))]  # Filtering
    print(irlbl.max())
    print(irlbl.shape)
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    print(threshold_irlbl)
    tail_label = irlbl[irlbl > threshold_irlbl].index.tolist()
    print(len(tail_label))
    return tail_label

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.05, 1.]):
    """
    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    tail_labels = get_tail_label(y, ql=ql)
    index = y[y[tail_labels].apply(lambda x: (x == 1).any(), axis=1)].index.tolist()
    
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    print(X_sub.shape, y_sub.shape)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    """
    Give index of 10 nearest neighbor of all the instance
    
    args
    X: np.array, array whose nearest neighbor has to find
    
    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree').fit(X)
    euclidean, indices = nbs.kneighbors(X)
    return indices

In [23]:
def adasyn(X, y, beta, K, threshold=1):

    """
    Adaptively generating minority data samples according to their distributions.
    More synthetic data is generated for minority class samples that are harder to learn.
    Harder to learn data is defined as positive examples with not many examples for in their respective neighbourhood.
    Inputs
         -----
         X:  Input features, X, sorted by the minority examples on top.  Minority example should also be labeled as 1
         y:  Labels, with minority example labeled as 1
      beta:  Degree of imbalance desired.  Neg:Pos. A 1 means the positive and negative examples are perfectly balanced.
         K:  Amount of neighbours to look at
 threshold:  Amount of imbalance rebalance required for algorithm
    Variables
         -----
         xi:  Minority example
        xzi:  A minority example inside the neighbourhood of xi
         ms:  Amount of data in minority class
         ml:  Amount of data in majority class
        clf:  k-NN classifier model
          d:  Ratio of minority : majority
       beta:  Degree of imbalance desired
          G:  Amount of data to generate
         Ri:  Ratio of majority data / neighbourhood size.  Larger ratio means the neighbourhood is harder to learn,
              thus generating more data.
     Minority_per_xi:  All the minority data's index by neighbourhood
     Rhat_i:  Normalized Ri, where sum = 1
         Gi:  Amount of data to generate per neighbourhood (indexed by neighbourhoods corresponding to xi)
    Returns
         -----
  syn_data:  New synthetic minority data created
    """

    ms = int(sum(y))
    ml = len(y) - ms

    clf = neighbors.KNeighborsClassifier()
    clf.fit(X, y)

    # Step 1, calculate the degree of class imbalance.  If degree of class imbalance is violated, continue.
    d = np.divide(ms, ml)

    if d > threshold:
        return print("The data set is not imbalanced enough.")

    # Step 2a, if the minority data set is below the maximum tolerated threshold, generate data.
    # Beta is the desired balance level parameter.  Beta > 1 means u want more of the imbalanced type, vice versa.
    G = (ml - ms) * beta

    # Step 2b, find the K nearest neighbours of each minority class example in euclidean distance.
    # Find the ratio ri = majority_class in neighbourhood / K
    Ri = []
    Minority_per_xi = []
    for i in range(ms):
        xi = X[i, :].reshape(1, -1)
        # Returns indices of the closest neighbours, and return it as a list
        neighbours = clf.kneighbors(xi, n_neighbors=K, return_distance=False)[0]
        # Skip classifying itself as one of its own neighbours
        # neighbours = neighbours[1:]

        # Count how many belongs to the majority class
        count = 0
        for value in neighbours:
            if value > ms:
                count += 1

        Ri.append(count / K)

        # Find all the minority examples
        minority = []
        for value in neighbours:
            # Shifted back 1 because indices start at 0
            if value <= ms - 1:
                minority.append(value)

        Minority_per_xi.append(minority)

    # Step 2c, normalize ri's so their sum equals to 1
    Rhat_i = []
    for ri in Ri:
        rhat_i = ri / sum(Ri)
        Rhat_i.append(rhat_i)

    assert(sum(Rhat_i) > 0.99)

    # Step 2d, calculate the number of synthetic data examples that will be generated for each minority example
    Gi = []
    for rhat_i in Rhat_i:
        gi = round(rhat_i * G)
        Gi.append(int(gi))

    # # Step 2e, generate synthetic examples
    syn_data = []
    for i in range(ms):
        xi = X[i, :].reshape(1, -1)
        for j in range(Gi[i]):
            # If the minority list is not empty
            if Minority_per_xi[i]:
                index = np.random.choice(Minority_per_xi[i])
                xzi = X[index, :].reshape(1, -1)
                si = xi + (xzi - xi) * np.random.uniform(0, 1)
                syn_data.append(si)

    # Test the new generated data
    test = []
    for values in syn_data:
        a = clf.predict(values)
        test.append(a)

    print("Using the old classifier, {} out of {} would be classified as minority.".format(np.sum(test), len(syn_data)))

    # Build the data matrix
    data = []
    for values in syn_data:
        data.append(values[0])

    print("{} amount of minority class samples generated".format(len(data)))

    # Concatenate the positive labels with the newly made data
    labels = np.ones([len(data), 1])
    data = np.concatenate([labels, data], axis=1)

    # Concatenate with old data
    org_data = np.concatenate([y.reshape(-1, 1), X], axis=1)
    data = np.concatenate([data, org_data])

    return data, Minority_per_xi, Ri

In [24]:
def SMOLTE_cat_wrapper(x_df, y_df, cat_col, nsamples):
    x_df_up = pd.DataFrame(columns=x_df.columns)
    y_df_up = pd.DataFrame(columns=y_df.columns)

    unique_cat_combs = x_df.groupby(cat_col).size().reset_index().rename(columns={0:'count'})[cat_col]
    num_cols = x_df.columns.drop(cat_col).tolist()
    for index, row in unique_cat_combs.iterrows():
        condition = (x_df[cat_col] == row).all(axis=1)

        subx = x_df[condition][num_cols].reset_index(drop=True)
        suby = y_df[condition].reset_index(drop=True)
        print(subx.shape, suby.shape)

        x_df_sub, y_df_sub = get_minority_samples(subx, suby)
        Syn_data, neighbourhoods, Ri = adasyn(X, y, beta=0.05, K=15, threshold=1)
        np.savetxt(path + 'data/syn_beta_0.05_k_15.csv', Syn_data, delimiter=',')
        return
        a, b = MLSMOTE(x_df_sub, y_df_sub, nsamples, neigh=10)
        cats = pd.concat([row.to_frame().T]*len(a), ignore_index=True)
        a = pd.merge(cats, a, how='left', left_index=True, right_index=True)
        x_df_up = x_df_up.append(a, ignore_index=True)
        y_df_up = y_df_up.append(b, ignore_index=True)
    #y_df_up = y_df_up.astype(int)
    
    print('Number of new samples created: %d' %(len(y_df_up)))
    
    x_df_up = pd.concat([x_df, x_df_up], ignore_index=True)
    y_df_up = pd.concat([y_df, y_df_up], ignore_index=True)
    
    x_df_up = x_df_up.sample(len(x_df_up), random_state=1881).reset_index(drop=True)
    y_df_up = y_df_up.sample(len(y_df_up), random_state=1881).reset_index(drop=True)
    
    x_df_up[cat_col] = x_df_up[cat_col].astype(int)
    return x_df_up, y_df_up

In [25]:
train_features = pd.read_csv('data/train_features.csv')
train_target = pd.read_csv('data/train_targets_scored.csv')
print("Original Train sample size:", train_features.shape, ", Original Train target size:", train_target.shape)

train_features = preprocess_df(train_features)

train_features = train_features.drop('sig_id', axis=1)
train_target = train_target.drop('sig_id', axis=1)

cat_col = ['cp_time', 'cp_dose']
x_train_fold, y_train_fold = SMOLTE_cat_wrapper(train_features, train_target, cat_col, nsamples=50)

Original Train sample size: (23814, 876) , Original Train target size: (23814, 207)
(3886, 873) (3886, 206)
(206,)
122
(183,)
15.25
89
(377, 873) (377, 206)


TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [16]:
Syn_data, neighbourhoods, Ri = adasyn(X, y, beta=0.05, K=15, threshold=1)
np.savetxt(path + 'data/syn_beta_0.05_k_15.csv', Syn_data, delimiter=',')

TypeError: unsupported operand type(s) for +: 'int' and 'str'