In [None]:
!pip install imbalanced-learn

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import re
from imblearn.over_sampling import SMOTE

In [None]:
# Directory (relative to the notebook's working directory) that holds the raw
# download and the CSV generated at the end of this notebook.
dataset_dir = '../datasets/breast-cancer'

# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, without a separate exists-check that could race with another process.
os.makedirs(dataset_dir, exist_ok=True)

In [None]:
%%bash
# Download a fresh copy of the raw data file into the dataset directory.
# Stop at the first failing command so that a failed `cd` cannot cause the
# `rm`/`wget` below to run in some other directory.
set -euo pipefail
cd ../datasets/breast-cancer
# Remove any previous download so the new file is saved under the expected name.
rm -f breast-cancer-wisconsin.data
wget https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data

In [None]:
# Column labels to assign, in file order (the file is read with header=None).
column_names = [
    'id',
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]

# Entries equal to '?' are parsed as missing values (NaN).
raw_data_path = os.path.join(dataset_dir, 'breast-cancer-wisconsin.data')
df = pd.read_csv(raw_data_path, names=column_names, header=None, na_values='?')

In [None]:
# Remove the 'id' column. Unlike `del df['id']`, which raises KeyError when the
# cell is executed a second time, errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# This SMOTE implementation doesn't work with missing values, so rows with
# any NaN are dropped before resampling.
df = df.dropna()
feature_columns = [c for c in df.columns if c != 'class']
X = df[feature_columns]
y = df['class']

# Resample so that class labels 2 and 4 each end up with 5000 rows; the fixed
# random_state makes the resampling reproducible.
sm = SMOTE(sampling_strategy={2: 5000, 4: 5000}, random_state=1)
X_smoted, y_smoted = sm.fit_resample(X, y)

# Depending on the installed imbalanced-learn version, fit_resample returns
# either NumPy arrays or pandas objects for pandas input. A pandas Series has
# no `.reshape`, so convert explicitly, and let NumPy infer the row count (-1)
# instead of hard-coding it.
Xy = np.concatenate(
    [np.asarray(X_smoted), np.asarray(y_smoted).reshape(-1, 1)],
    axis=1,
)
# Label the columns in the order they were concatenated: features, then class.
df = pd.DataFrame(Xy, columns=feature_columns + ['class'])

In [None]:
# Shuffle the rows. A fixed random_state makes the resulting row order
# reproducible across runs.
df = df.sample(frac=1, random_state=1)

In [None]:
# Write the frame to the dataset directory as CSV, omitting the index column.
output_path = os.path.join(dataset_dir, 'breast-cancer.csv')
df.to_csv(output_path, index=False)