In [1]:
import pandas as pd
from sklearn.utils import resample


In [5]:
# Toy dataset for the resampling demos: one (Age, Income, Class) record per row.
# 'High' appears 7 times and 'Low' 6 times, so the classes are slightly imbalanced.
df = pd.DataFrame(
    [
        (22, 2000, 'High'),
        (25, 2500, 'Low'),
        (27, 2700, 'Low'),
        (28, 3200, 'High'),
        (30, 3500, 'High'),
        (35, 3800, 'Low'),
        (40, 4000, 'High'),
        (45, 4200, 'High'),
        (50, 4300, 'Low'),
        (55, 4500, 'Low'),
        (60, 5000, 'High'),
        (65, 5500, 'High'),
        (70, 6000, 'Low'),
    ],
    columns=['Age', 'Income', 'Class'],
)

In [6]:
# Split the frame by label so each class can be resampled independently.
# Row labels from `df` are kept as-is in both subsets.
is_high = df['Class'] == 'High'
is_low = df['Class'] == 'Low'
df_high = df.loc[is_high]
df_low = df.loc[is_low]

In [9]:
# Down-sampling: shrink the larger 'High' subset to the size of the 'Low' subset.
down_sample = resample(
    df_high,
    n_samples=df_low.shape[0],  # target size = number of 'Low' rows
    replace=False,              # without replacement: no row is drawn twice
    random_state=42,            # fixed seed so the draw is repeatable
)
print(down_sample)

    Age  Income Class
0    22    2000  High
3    28    3200  High
10   60    5000  High
4    30    3500  High
7    45    4200  High
6    40    4000  High


In [8]:
# Stack the down-sampled 'High' rows on top of all 'Low' rows.
# The original row labels are kept (no index reset).
df_balanced = pd.concat((down_sample, df_low), axis=0)

In [21]:
# Show the balanced frame, then the per-class row counts, one after the other.
print(df_balanced, df_balanced['Class'].value_counts(), sep='\n')

    Age  Income Class
0    22    2000  High
3    28    3200  High
10   60    5000  High
4    30    3500  High
7    45    4200  High
6    40    4000  High
1    25    2500   Low
2    27    2700   Low
5    35    3800   Low
8    50    4300   Low
9    55    4500   Low
12   70    6000   Low
Class
High    6
Low     6
Name: count, dtype: int64


In [16]:
# Up-sampling: grow the smaller 'Low' subset to the size of the 'High' subset.
up_sample = resample(
    df_low,
    n_samples=df_high.shape[0],  # target size = number of 'High' rows
    replace=True,                # with replacement: rows may repeat in the sample
    random_state=42,             # fixed seed so the draw is repeatable
)
print(up_sample)

   Age  Income Class
8   50    4300   Low
9   55    4500   Low
5   35    3800   Low
9   55    4500   Low
9   55    4500   Low
2   27    2700   Low
5   35    3800   Low


In [20]:
# Stack the up-sampled 'Low' rows on top of all 'High' rows.
# Repeated 'Low' rows keep their original (now duplicated) row labels.
df_bal = pd.concat((up_sample, df_high), axis=0)

# Show the combined frame, then the per-class row counts.
class_counts = df_bal['Class'].value_counts()
print(df_bal)
print(class_counts)

    Age  Income Class
8    50    4300   Low
9    55    4500   Low
5    35    3800   Low
9    55    4500   Low
9    55    4500   Low
2    27    2700   Low
5    35    3800   Low
0    22    2000  High
3    28    3200  High
4    30    3500  High
6    40    4000  High
7    45    4200  High
10   60    5000  High
11   65    5500  High
Class
Low     7
High    7
Name: count, dtype: int64


# SMOTE (Synthetic Minority Over-sampling Technique)

1. Use SMOTE to generate synthetic samples instead of duplicating existing ones.

2. Convert the categorical class labels into numeric form before applying SMOTE.

3. Apply SMOTE to balance the dataset.

4. Convert the labels back to their original categorical values.

5. Combine the resampled data into the final balanced dataset.

In [24]:
!pip install imblearn




[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
pip install --upgrade pip

Collecting pip
  Downloading pip-25.0-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ---------------------------------- ----- 1.6/1.8 MB 6.5 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 6.3 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-25.0
Note: you may need to restart the kernel to use updated packages.


In [29]:
import pandas as pd
from imblearn.over_sampling import SMOTE

In [36]:
# Toy dataset with an imbalanced target: 4 'min' rows versus 9 'maj' rows.
sm_df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['min', 'maj', 'maj', 'maj', 'maj', 'min', 'min', 'min', 'maj', 'maj', 'maj', 'maj', 'maj'],
})

# Encode the string labels as integers before resampling. The decoding map is
# derived from the encoding map so the two can never drift apart.
label_to_code = {'min': 0, 'maj': 1}
code_to_label = {code: label for label, code in label_to_code.items()}
sm_df['Class'] = sm_df['Class'].map(label_to_code)

# Features (X) and target (y)
feature_cols = ['Age', 'Income']
X = sm_df[feature_cols]
y = sm_df['Class']

# Configure SMOTE with a fixed seed so the synthetic rows are reproducible
smote = SMOTE(
    sampling_strategy='auto',
    k_neighbors=3,
    random_state=42,
)

# Resample; the result contains the original rows plus the synthetic ones
X_sampled, y_sampled = smote.fit_resample(X, y)

# Decode the integer target back to 'min' / 'maj'
y_sampled = y_sampled.map(code_to_label)

# Re-attach the decoded target to the resampled features as the 'Class' column
df_balanced = X_sampled[feature_cols].assign(Class=y_sampled)

# Class distribution after SMOTE, followed by the full balanced frame
print(df_balanced['Class'].value_counts())
print(df_balanced)


Class
min    9
maj    9
Name: count, dtype: int64
    Age  Income Class
0    22    2000   min
1    25    2500   maj
2    27    2700   maj
3    28    3200   maj
4    30    3500   maj
5    35    3800   min
6    40    4000   min
7    45    4200   min
8    50    4300   maj
9    55    4500   maj
10   60    5000   maj
11   65    5500   maj
12   70    6000   maj
13   40    4031   min
14   35    3831   min
15   44    4176   min
16   35    3826   min
17   41    4040   min
