In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [13]:
df = pd.read_csv("orthopedi_biomechanical.csv")

In [14]:
df

Unnamed: 0,pelvic_incidence,pelvic_tilt numeric,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,0.128992,0.498719,-6.811736e-01,-0.192038,-1.446647e+00,-0.708059,0
1,-1.277491,-0.759197,-1.473505e+00,-1.069354,-2.629694e-01,-0.579556,0
2,0.469551,0.465075,-1.120209e-01,0.277089,-8.964926e-01,-0.795421,0
3,0.496834,0.710221,-4.258836e-01,0.126575,-1.206219e+00,-0.402288,0
4,-0.652259,-0.800376,-1.294229e+00,-0.223713,-7.322044e-01,-0.490106,0
5,-1.207477,-0.370399,-1.467556e+00,-1.273234,9.349989e-01,-0.641789,0
6,-0.433985,-0.174795,-8.138202e-01,-0.414181,2.006527e-01,-0.541574,0
7,-0.907265,-0.689248,-1.255087e+00,-0.640208,-4.744026e-02,-0.985977,0
8,-0.999769,-0.409487,-5.138605e-01,-0.973021,5.343598e-01,-0.346886,0
9,-1.416585,-1.267749,-5.541490e-01,-0.864570,-2.532440e+00,-0.683556,0


In [4]:
X = df.iloc[:,:6]
y = df.iloc[:,6:7]

from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=69)

In [5]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [6]:
len(df_train), len(df_test)

(613, 154)

In [7]:
sns.countplot(x="class", data=df_train)
plt.title("train")

ValueError: Could not interpret input 'class'

In [None]:
sns.countplot(x="class", data=df_test)
plt.title("test")

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.neighbors import KNeighborsClassifier

class SMOTE(BaseEstimator, TransformerMixin):
    def __init__(self, downsampling_percentage, upsampling_percentage, k):
        #self.predictor_idx = predictor_idx
        #self.target_idx = target_idx
        self.downsampling_percentage = downsampling_percentage
        self.upsampling_percentage = upsampling_percentage
        self.k = k
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        #set majority and minority
        #set the "class" according to the dataset
        majority = np.where(X["class"].values==0)[0]
        minority = np.where(X["class"].values==1)[0]
        #downsample the majority
        downsampling_multiplier = (100 + self.downsampling_percentage) / 100
        majority = np.random.permutation(majority)[:int(np.floor(len(majority)*1/downsampling_multiplier))]
        #setting the upsampling
        upsampling_multiplier = int(np.floor(100 + self.upsampling_percentage) / 100)
        majority_df = X.iloc[majority]
        minority_df = X.iloc[minority]
        #set the index according to the dataset
        minority_y = minority_df.iloc[:,6:]
        minority_x = minority_df.iloc[:,:6]
        # fit
        knn = KNeighborsClassifier(n_neighbors=self.k)
        knn.fit(minority_x.values, minority_y.values.ravel())
        # create matrix of nearest neighbours
        nn = knn.kneighbors(minority_x, (self.k)+1, return_distance=False)[:,1:]
        # shuffle the whole matrix
        np.random.shuffle(nn.T)
        # only one neighbour
        nn = nn[:,0]
        og = np.arange(len(nn))
        diff = (minority_x.iloc[og].values - minority_x.iloc[nn].values)[:,:minority_x.shape[1]]
        # gap must be 3 dimensional arrays to generalize well
        # beware of dimensions!!!
        gap = np.random.random_sample((len(nn),int(np.floor(upsampling_multiplier-1))))
        gap = gap.reshape(upsampling_multiplier-1,len(gap),1)
        smoted = minority_x.iloc[og].values + (gap*diff)
        smoted = smoted.reshape((upsampling_multiplier-1)*len(nn),smoted.shape[2])
        smoted = np.c_[smoted, np.ones(smoted.shape[0])]
        smoted = np.r_[majority_df.values, minority_df.values, smoted]
        smoted = pd.DataFrame(smoted, columns=list(X))
        return smoted

In [None]:
smote = SMOTE(100, 200, 5)
smoted = smote.fit_transform(df_train)

In [None]:
smoted.shape

In [None]:
smoted["class"].unique()

In [None]:
sns.countplot(x="class", data=smoted)