In [37]:
import pandas as pd
# Load the dataset from diabetes_two.csv; the path is relative to the working directory.
df = pd.read_csv('../Datasets/diabetes_two.csv')

<font color="green"><h1><b>
Imbalanced Dataset Issue</b></h1>
(i) Undersampling (reducing the majority-class samples to match the minority class)<br>
(ii) Oversampling (increasing the number of minority-class samples)</font>

In [2]:
# List the distinct labels that appear in the `class` column.
df['class'].unique()

array(['Positive', 'Negative'], dtype=object)

In [6]:
# Number of distinct labels in the `class` column
# (nunique ignores missing values by default).
df['class'].nunique()

2

In [7]:
# Tally the rows per label to see how imbalanced the raw data is.
cnt = (
    df.groupby('class')
    .size()
    .reset_index(name='Count')
)
cnt

Unnamed: 0,class,Count
0,Negative,200
1,Positive,320


<font color="yellow">
(i) Undersampling</font>

In [8]:
from sklearn.utils import resample

In [17]:
# Split the frame by label. 'Positive' is the larger group in this dataset,
# so it is treated as the majority class and 'Negative' as the minority.
major_class = df.loc[df['class'] == 'Positive']
minor_class = df.loc[df['class'] == 'Negative']

# The minority-class row count is the target size for undersampling.
n_samples = minor_class.shape[0]
n_samples

200

In [18]:
# Shrink the majority class down to the minority-class size.
# replace=False draws without replacement, so no majority row is picked twice;
# random_state pins the draw so the cell is reproducible.
updated_major_class = resample(
    major_class,
    replace=False,
    n_samples=n_samples,
    random_state=42,
)

# Stack the untouched minority rows on top of the down-sampled majority rows.
undersampled_df = pd.concat([minor_class, updated_major_class])

In [19]:
# Confirm that both labels now have the same number of rows.
cnt = (
    undersampled_df.groupby('class')
    .size()
    .reset_index(name='Count')
)
cnt

Unnamed: 0,class,Count
0,Negative,200
1,Positive,200


<font color="yellow">
(ii) Oversampling</font>

To proceed with oversampling,
we first need to convert the categorical data into numerical values.
We also need to handle missing values.

In [40]:
# Count the missing values in each column (this cell only inspects; nothing is dropped here).
df.isnull().sum()

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64

In [39]:
# Remove, in a single pass, every row that has a missing value in
# Age, Gender or weakness. The frame is modified in place.
df.dropna(subset=['Age', 'Gender', 'weakness'], inplace=True)

In [42]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; the label-encoding cell refits it for each column.
le = LabelEncoder()

In [43]:
# Label-encode every column except Age, overwriting each column in place.
# The columns are processed in the same order as before and with the same
# shared encoder `le`, which is refit on each column in turn. After the loop
# `le` therefore only holds the mapping learned from the last column, `class`.
categorical_columns = [
    'Gender',
    'Polyuria',
    'Polydipsia',
    'sudden weight loss',
    'weakness',
    'Polyphagia',
    'Genital thrush',
    'visual blurring',
    'Itching',
    'Irritability',
    'delayed healing',
    'partial paresis',
    'muscle stiffness',
    'Alopecia',
    'Obesity',
    'class',
]

for column in categorical_columns:
    df[column] = le.fit_transform(df[column])

Now, oversampling will be performed using SMOTE (Synthetic Minority Over-sampling Technique).

In [49]:
# Rows per encoded label before resampling.
cnt = (
    df.groupby('class')
    .size()
    .reset_index(name='Count')
)
cnt

Unnamed: 0,class,Count
0,0,200
1,1,317


In [45]:
from imblearn.over_sampling import SMOTE

In [46]:
# Separate the target label from the predictor columns.
y = df['class']
X = df.drop(columns=['class'])

# Oversample with SMOTE; the fixed seed keeps the synthetic rows reproducible.
# NOTE(review): SMOTE builds new rows by interpolating between neighbouring
# samples, and most predictors here are label-encoded categories. A
# categorical-aware variant may be more appropriate; confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Reassemble one frame: predictors under their original column names, plus the label.
oversampled_df = pd.DataFrame(X_resampled, columns=X.columns)
oversampled_df['class'] = y_resampled

In [51]:
# Rows per encoded label after resampling; both labels should now match.
cnt = (
    oversampled_df.groupby('class')
    .size()
    .reset_index(name='Count')
)
cnt

Unnamed: 0,class,Count
0,0,317
1,1,317
