# SMOTE Balancing

In [1]:
# Dependencies
import pandas as pd
from imblearn.over_sampling import SMOTE

In [2]:
# Load the training split from the shared Datasets directory.
df_train = pd.read_csv(
    filepath_or_buffer='../Datasets/train-train.csv',
)

In [3]:
# Report the row count and the per-class distribution prior to oversampling.
print('train_sum:', df_train.shape[0])
# The leading empty string, separated by a newline, prints the blank line before the heading.
print('', 'Before SMOTE Balancing:', sep='\n')
print(df_train['Label'].value_counts())

train_sum: 15030469

Before SMOTE Balancing:
Label
normal         14346045
botnet           614556
botnet_spam       69868
Name: count, dtype: int64


### Balancing

In [4]:
def stage1_label(label):
    """Return 0 when ``label`` equals 'normal', otherwise 1."""
    if label == 'normal':
        return 0
    return 1

def stage2_label(label):
    """Return 1 when ``label`` contains the substring 'spam', otherwise 0."""
    if 'spam' in label:
        return 1
    return 0

def multiclass_label(label):
    """Encode a string label as an integer class id.

    Returns 0 for 'normal', 2 for any label containing 'spam',
    and 1 for every other label.
    """
    # The equality check comes first, so 'normal' never reaches the substring test.
    if label == 'normal':
        return 0
    if 'spam' in label:
        return 2
    return 1

# Encode the string labels as integer class ids in a new column.
df_train['Multiclass_Label'] = df_train['Label'].apply(multiclass_label)

# Target vector: the encoded class ids.
y_train = df_train['Multiclass_Label']
# Feature matrix: every column except the two label columns.
x_train = df_train.drop(['Label', 'Multiclass_Label'], axis=1)

In [5]:
# Seed SMOTE so the synthetic samples, and therefore the exported CSV,
# are reproducible on a fresh Restart & Run All.
sm = SMOTE(random_state=42)
x_train_balance, y_train_balance = sm.fit_resample(x_train, y_train)
# Re-attach the resampled target to the resampled features; join aligns on the index.
train_balance = x_train_balance.join(y_train_balance)

In [6]:
def labeling(label):
    """Decode an integer class id back to its string label.

    Returns 'normal' for 0, 'botnet' for 1, and 'botnet_spam' for any other value.
    """
    return 'normal' if label == 0 else ('botnet' if label == 1 else 'botnet_spam')

# Build the labelled frame directly from the SMOTE outputs instead of adding
# and dropping columns on train_balance in place. Re-running this cell is then
# safe: it no longer depends on 'Multiclass_Label' still being present.
# The result has the feature columns followed by the decoded string 'Label'.
train_balance = x_train_balance.assign(
    Label=y_train_balance.apply(labeling)
)

In [7]:
# Display the column index of the balanced frame to check which columns it holds.
train_balance.columns

Index(['Dport', 'igmp', 'SrcBytes', 'SrcAddr', 'ipnip', 'unas', 'gre', 'pim',
       'TotPkts', 'tcp', 'rtp', 'TotBytes', 'State', 'llc', 'ipv6-icmp', 'Dur',
       'Sport', 'ipv6', 'udp', 'icmp', 'DstAddr', 'ipx/spx', 'arp', 'rsvp',
       'esp', 'rtcp', 'Dir', 'rarp', 'udt', 'Label'],
      dtype='object')

In [8]:
# Report the row count and per-class distribution of the oversampled set.
# The heading says "After" because these counts describe the balanced data.
print('train_sum:', len(train_balance))
print()
print('After SMOTE Balancing:')
print(train_balance['Label'].value_counts())

train_sum: 43038135

Before SMOTE Balancing:
Label
normal         14346045
botnet         14346045
botnet_spam    14346045
Name: count, dtype: int64


In [9]:
# Write the balanced training set to the current working directory, without the index column.
train_balance.to_csv('train-smote.csv', index=False)