In [22]:
import pandas as pd
from sklearn import datasets
% matplotlib inline
from matplotlib import pyplot as plt
import numpy as np

# Loading dataset

In [3]:
# load breast cancer dataset
breast_cancer = datasets.load_breast_cancer()

In [8]:
# load breas cancer features to pandas DataFrame
breast_cancer_df = pd.DataFrame(breast_cancer.data)
breast_cancer_df.columns = breast_cancer.feature_names
breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [12]:
# appended label column to dataset
breast_cancer_df['type'] = breast_cancer.target
breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


# Counting number of examples related to each type 

In [23]:
type_0 = breast_cancer_df.loc[breast_cancer_df['type'] == 0]
type_1 = breast_cancer_df.loc[breast_cancer_df['type'] == 1]
print 'Type 0 number of examples:', type_0.shape[0]
print 'Type 1 number of examples:', type_1.shape[0]

Type 0 number of examples: 212
Type 1 number of examples: 357


# Making artificial imbalanced dataset
 $N(type_0) = 0.05 \times N(type_1)$, where $N(y)$ - number of examples related to class $y$

In [100]:
num_type_0 = type_0.shape[0]
num_type_1 = type_1.shape[0]
num_type_0_imbalanced = int(0.05 * num_type_1)
print 'Imbalanced type 0 number of samples: ', num_type_0_imbalanced

Imbalanced type 0 number of samples:  17


### Randomly choose $type_0$ subsample for imbalanced dataset

In [109]:
np.random.seed(0)
inds_set = {int(num_type_0 * np.random.random())}
while len(inds_set) < num_type_0_imbalanced:
    inds_set.add(int(num_type_0 * np.random.random()))

inds_list = list(inds_set)
print 'Random indexes numbers:', inds_list

Random indexes numbers: [196, 167, 136, 204, 15, 112, 81, 18, 115, 116, 151, 120, 89, 92, 189, 4, 127]


In [108]:
type_0_imbalanced = type_0.iloc[inds_list]
type_0_imbalanced

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
503,23.09,19.83,152.1,1682.0,0.09342,0.1275,0.1676,0.1003,0.1505,0.05484,...,23.87,211.5,2782.0,0.1199,0.3625,0.3794,0.2264,0.2908,0.07277,0
373,20.64,17.35,134.8,1335.0,0.09446,0.1076,0.1527,0.08941,0.1571,0.05478,...,23.17,166.8,1946.0,0.1562,0.3055,0.4159,0.2112,0.2689,0.07055,0
263,15.61,19.38,100.0,758.6,0.0784,0.05616,0.04209,0.02847,0.1547,0.05443,...,31.67,115.9,988.6,0.1084,0.1807,0.226,0.08568,0.2683,0.06829,0
535,20.55,20.86,137.8,1308.0,0.1046,0.1739,0.2085,0.1322,0.2127,0.06251,...,25.48,160.2,1809.0,0.1268,0.3135,0.4433,0.2148,0.3077,0.07569,0
15,14.54,27.54,96.73,658.8,0.1139,0.1595,0.1639,0.07364,0.2303,0.07077,...,37.13,124.1,943.2,0.1678,0.6577,0.7026,0.1712,0.4218,0.1341,0
214,14.19,23.81,92.87,610.7,0.09463,0.1306,0.1115,0.06462,0.2235,0.06433,...,34.85,115.0,811.3,0.1559,0.4059,0.3744,0.1772,0.4724,0.1026,0
141,16.11,18.05,105.1,813.0,0.09721,0.1137,0.09447,0.05943,0.1861,0.06248,...,25.27,129.0,1233.0,0.1314,0.2236,0.2802,0.1216,0.2792,0.08158,0
18,19.81,22.15,130.0,1260.0,0.09831,0.1027,0.1479,0.09498,0.1582,0.05395,...,30.88,186.8,2398.0,0.1512,0.315,0.5372,0.2388,0.2768,0.07615,0
219,19.53,32.47,128.0,1223.0,0.0842,0.113,0.1145,0.06637,0.1428,0.05313,...,45.41,180.2,2477.0,0.1408,0.4097,0.3995,0.1625,0.2713,0.07568,0
223,15.75,20.25,102.6,761.3,0.1025,0.1204,0.1147,0.06462,0.1935,0.06303,...,30.29,125.9,1088.0,0.1552,0.448,0.3976,0.1479,0.3993,0.1064,0


### Merge $type_1$ examples with randomly chosen $type_0$ subsample and shuffle

In [119]:
imbalanced_df = type_1.append(type_0_imbalanced)
imbalanced_df = imbalanced_df.sample(frac=1, random_state=0)
imbalanced_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
246,13.200,17.43,84.13,541.6,0.07215,0.04524,0.043360,0.011050,0.1487,0.05635,...,27.82,88.28,602.0,0.11010,0.15080,0.229800,0.04970,0.2767,0.07198,1
485,12.450,16.41,82.85,476.7,0.09514,0.15110,0.154400,0.048460,0.2082,0.07325,...,21.03,97.82,580.6,0.11750,0.40610,0.489600,0.13420,0.3231,0.10340,1
508,16.300,15.70,104.70,819.8,0.09427,0.06712,0.055260,0.045630,0.1711,0.05657,...,17.76,109.80,928.2,0.13540,0.13610,0.194700,0.13570,0.2300,0.07230,1
378,13.660,15.15,88.27,580.6,0.08268,0.07548,0.042490,0.024710,0.1792,0.05897,...,19.64,97.96,657.0,0.12750,0.31040,0.256900,0.10540,0.3387,0.09638,1
367,12.210,18.02,78.31,458.4,0.09231,0.07175,0.043920,0.020270,0.1695,0.05916,...,24.04,93.85,624.6,0.13680,0.21700,0.241300,0.08829,0.3218,0.07470,1
216,11.890,18.35,77.32,432.2,0.09363,0.11540,0.066360,0.031420,0.1967,0.06314,...,27.10,86.20,531.2,0.14050,0.30460,0.280600,0.11380,0.3397,0.08365,1
426,10.480,14.98,67.49,333.6,0.09816,0.10130,0.063350,0.022180,0.1925,0.06915,...,21.57,81.41,440.4,0.13270,0.29960,0.293900,0.09310,0.3020,0.09646,1
290,14.410,19.73,96.03,651.0,0.08757,0.16760,0.136200,0.066020,0.1714,0.07192,...,22.13,101.70,767.3,0.09983,0.24720,0.222000,0.10210,0.2272,0.08799,1
160,11.750,20.18,76.10,419.8,0.10890,0.11410,0.068430,0.037380,0.1993,0.06453,...,26.21,88.91,543.9,0.13580,0.18920,0.195600,0.07909,0.3168,0.07987,1
125,13.850,17.21,88.44,588.7,0.08785,0.06136,0.014200,0.011410,0.1614,0.05890,...,23.58,100.30,725.9,0.11570,0.13500,0.081150,0.05104,0.2364,0.07182,1
