# Oversampling Using SMOTE

### Importing the Imbalanced voice Dataset using pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the voice measurements from the relative path 'voice.csv' and render
# summary statistics as the cell output.
data = pd.read_csv(filepath_or_buffer='voice.csv')
data.describe()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
count,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0
mean,0.172099,0.062748,0.17759,0.123172,0.222827,0.099655,3.324734,45.171767,0.909109,0.45656,0.151364,0.172099,0.13053,0.03289,0.254299,0.681083,0.047889,4.139251,4.091362,0.180522
std,0.026608,0.014323,0.036343,0.041512,0.024119,0.037698,4.86574,155.025254,0.038629,0.164528,0.082264,0.026608,0.029403,0.017885,0.032611,0.440952,0.056839,2.934907,2.930289,0.120737
min,0.039363,0.018363,0.010975,0.000229,0.042946,0.014922,0.141735,2.068455,0.738651,0.036876,0.0,0.039363,0.055565,0.010953,0.103093,0.007812,0.004883,0.007812,0.0,0.0
25%,0.157326,0.057558,0.159778,0.101699,0.207477,0.083749,1.553479,5.391918,0.890489,0.326409,0.099033,0.157326,0.109993,0.017167,0.246154,0.325081,0.007812,0.867188,0.828125,0.103587
50%,0.178613,0.061391,0.183704,0.128897,0.223308,0.106105,2.101029,8.038247,0.913988,0.448051,0.172603,0.178613,0.125527,0.026144,0.266667,0.628773,0.023438,4.359375,4.328125,0.150062
75%,0.190825,0.071732,0.201442,0.147393,0.241845,0.118495,2.911595,13.712721,0.935307,0.575841,0.21588,0.190825,0.149093,0.047384,0.275862,0.968183,0.070312,6.140625,6.070312,0.227048
max,0.240844,0.115273,0.257417,0.242124,0.268924,0.252225,34.725453,1309.612887,0.981997,0.842936,0.28,0.240844,0.229153,0.168421,0.27907,2.805246,0.458984,21.867188,21.84375,0.932374


In [2]:
# Class distribution of the target column: one count per distinct label value.
data.loc[:, 'label'].value_counts()

male      1584
female     607
Name: label, dtype: int64

# Finding whether the data is Imbalanced or Not

In [3]:
def findImbalancedRatio(data, target, threshold):
    """Report whether the ``target`` column of ``data`` is imbalanced.

    The share of the most frequent and of the least frequent class is
    computed as a percentage of the non-null entries in ``data[target]``.
    The dataset is reported as imbalanced when the smallest share is below
    ``threshold * 100`` (``threshold`` is a fraction, e.g. 0.4 for 40 %).

    Prints "Dataset is Imbalanced" or "Dataset is Balanced" and returns the
    tuple ``(maxPercentage, minPercentage)``.
    """
    class_counts = data[target].value_counts()
    non_null_rows = data[target].count()

    maxPercentage = (class_counts.max() / non_null_rows) * 100
    minPercentage = (class_counts.min() / non_null_rows) * 100

    is_imbalanced = minPercentage < threshold * 100
    print("Dataset is Imbalanced" if is_imbalanced else "Dataset is Balanced")
    return maxPercentage, minPercentage

In [4]:
# Column holding the class label, and the minimum acceptable minority share
# (as a fraction) below which the dataset is reported as imbalanced.
target, threshold = 'label', 0.4
maxPercentSample, minPercentSample = findImbalancedRatio(data, target, threshold)
print(
    'Minority Sample percentage = {}'.format(minPercentSample),
    'Majority Sample percentage = {}'.format(maxPercentSample),
    sep='\n',
)

Dataset is Imbalanced
Minority Sample percentage = 27.704244637151987
Majority Sample percentage = 72.29575536284801


# SMOTE Function

In [5]:
def makeOverSamplesSMOTE(data, target, threshold, random_state=None):
    """Oversample the minority class of ``data`` with SMOTE.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns together with the ``target`` column.
    target : str
        Name of the label column; every other column is used as a feature.
    threshold : float
        Forwarded to SMOTE as ``sampling_strategy``. For a binary target this
        is the desired minority/majority ratio after resampling.
    random_state : int or None, optional
        Seed forwarded to SMOTE so the synthetic rows can be reproduced.
        The default ``None`` leaves SMOTE unseeded, as before.

    Returns
    -------
    tuple
        ``(X, y)`` exactly as returned by ``SMOTE.fit_resample``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    # Kept local, as in the original cell, so the function is self-contained.
    from imblearn.over_sampling import SMOTE

    X = data.drop(columns=[target])
    # Use a 1-D Series rather than a one-column frame: a 2-D label input is
    # what produced the ``column_or_1d`` conversion warning.
    y = data[target]

    # ``sampling_strategy`` / ``fit_resample`` replace the deprecated
    # ``ratio`` / ``fit_sample`` spellings.
    sm = SMOTE(sampling_strategy=threshold, random_state=random_state)
    X, y = sm.fit_resample(X, y)
    return X, y

In [6]:
target = 'label'
# Ratio handed to SMOTE. Per the original author's note it should be greater
# than the dataset's current ratio, otherwise an error is raised.
threshold = 0.4
X, y = makeOverSamplesSMOTE(data, target, threshold)
print(
    'Shape of X after Oversampling: {}'.format(X.shape),
    'Shape of y after Oversampling: {}'.format(y.shape),
    sep='\n',
)

Shape of X after Oversampling: (2217, 20)
Shape of y after Oversampling: (2217,)


  y = column_or_1d(y, warn=True)


In [7]:
# Class counts in the original frame.
print(
    "Before OverSampling, counts of label 'male': {}".format((data['label'] == 'male').sum()),
    "Before OverSampling, counts of label 'female': {}".format((data['label'] == 'female').sum()),
    sep='\n',
)
# Blank separator line, then the class counts in the resampled labels.
print()
print(
    "After OverSampling, counts of label 'male': {}".format((y == 'male').sum()),
    "After OverSampling, counts of label 'female': {}".format((y == 'female').sum()),
    sep='\n',
)

Before OverSampling, counts of label 'male': 1584
Before OverSampling, counts of label 'female': 607

After OverSampling, counts of label 'male': 1584
After OverSampling, counts of label 'female': 633


# Merging the resampled features and labels into the new DataFrame `df2`

In [8]:
# Append the resampled labels as the last column of the resampled features.
# Mixing floats and strings makes the combined array dtype=object.
result = np.column_stack((X, y))
result

array([[0.059780985, 0.064241268, 0.032026913, ..., 0.0, 0.0, 'male'],
       [0.06600874, 0.06731002900000001, 0.040228735, ..., 0.046875,
        0.052631579000000005, 'male'],
       [0.077315503, 0.083829421, 0.036718459, ..., 0.0078125,
        0.046511628, 'male'],
       ...,
       [0.15244193217350627, 0.09300094072987096, 0.18211238575090838,
        ..., 3.1972140629746764, 0.21754402799477263, 'female'],
       [0.18562736389458262, 0.027261550657836822, 0.18668711563706408,
        ..., 6.383409136986466, 0.2755404674970891, 'female'],
       [0.18711609325172662, 0.06287357776274387, 0.19858536403118088,
        ..., 0.533171741010939, 0.20881740998143822, 'female']],
      dtype=object)

In [10]:
# `result` holds the feature columns followed by the label column, so build
# the column names in that order explicitly instead of assuming that `target`
# is already the last column of `data`.
feature_cols = [name for name in data.columns if name != target]
cols = feature_cols + [target]
# printing column names for the new dataframe
print(cols)
df2 = pd.DataFrame(result, columns=cols)
# `result` is an object array (floats mixed with label strings), which would
# leave every column of df2 as dtype=object; restore numeric feature dtypes.
df2[feature_cols] = df2[feature_cols].apply(pd.to_numeric)
df2.tail()

['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt', 'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun', 'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx', 'label']


Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
2212,0.187861,0.0324249,0.18798,0.177076,0.202824,0.0257489,2.62297,9.75612,0.818562,0.224389,...,0.187861,0.179782,0.0496844,0.231061,0.19529,0.162011,0.232323,0.0703125,0.308114,female
2213,0.186013,0.0734245,0.20949,0.168964,0.235562,0.0665978,2.48019,11.9036,0.913732,0.529413,...,0.186013,0.168028,0.0163306,0.250154,0.262669,0.0241115,0.690216,0.666105,0.238324,female
2214,0.152442,0.0930009,0.182112,0.0513853,0.239062,0.187677,1.36691,4.95483,0.955272,0.700534,...,0.152442,0.192813,0.0225409,0.267979,0.486155,0.00965425,3.20687,3.19721,0.217544,female
2215,0.185627,0.0272616,0.186687,0.172317,0.200213,0.0278965,2.43633,8.34824,0.828209,0.170936,...,0.185627,0.169275,0.0170264,0.235604,1.1216,0.162869,6.54628,6.38341,0.27554,female
2216,0.187116,0.0628736,0.198585,0.17528,0.230993,0.0557124,1.95311,7.15036,0.920432,0.516158,...,0.187116,0.170247,0.0212836,0.262295,0.227338,0.0429562,0.576128,0.533172,0.208817,female


# Loading the new dataset `df2` into an H2O frame

In [11]:
import h2o
import random, os, sys
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool
import psutil

In [12]:
import h2o
from h2o.automl import H2OAutoML
import random, os, sys
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool
import psutil
import numpy as np

In [13]:
# Run settings. Only `target` and `min_mem_size` are used in the cells visible
# in this section; the rest are presumably consumed by later AutoML cells
# (TODO confirm).

# Values that are not set for this run.
data_path = rid = server_path = independent_variables = None

# Label column of the resampled frame.
target = 'label'

# Placeholder; recomputed from available RAM in the next cell.
min_mem_size = 6

run_time = 360
max_models = 9

classification = True
scale = False

balance_y = False
balance_threshold = 0.2

levels_thresh = 33
level_set = []

# 0 - none, 1 - binary, 2 - multi-class, 3 - regression
analysis = 1

In [14]:
# Size the H2O heap as a fraction of the RAM that is currently available,
# rounded to a whole number of GiB (2 ** 30 bytes).
pct_memory = 0.5
virtual_memory = psutil.virtual_memory()
min_mem_size = int(round(int(pct_memory * virtual_memory.available) / 2 ** 30))
print(min_mem_size)

2


In [15]:
import h2o

# Random port in [5555, 55555]; presumably chosen to avoid clashing with
# another local H2O instance (TODO confirm).
port_no = random.randint(5555, 55555)
try:
    h2o.init(strict_version_check=False, min_mem_size_GB=min_mem_size, port=port_no)  # start h2o
except Exception:
    # Startup failed, so there is no cluster to pull logs from or to shut
    # down. The previous handler referenced undefined `logs_path` / `logfile`
    # and therefore raised NameError, hiding the real error. Log the failure
    # with its traceback and let the original exception stop the notebook.
    logging.critical('h2o.init', exc_info=True)
    raise

# Upload the resampled pandas frame to the H2O cluster.
df = h2o.H2OFrame(df2)

Checking whether there is an H2O instance running at http://localhost:19010..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.201-b09, mixed mode)
  Starting server from C:\Users\Balaji\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Balaji\AppData\Local\Temp\tmpaykgjdpd
  JVM stdout: C:\Users\Balaji\AppData\Local\Temp\tmpaykgjdpd\h2o_Balaji_started_from_python.out
  JVM stderr: C:\Users\Balaji\AppData\Local\Temp\tmpaykgjdpd\h2o_Balaji_started_from_python.err
  Server is running at http://127.0.0.1:19010
Connecting to H2O server at http://127.0.0.1:19010... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.2
H2O cluster version age:,1 month and 16 days
H2O cluster name:,H2O_from_python_Balaji_ekmb46
H2O cluster total nodes:,1
H2O cluster free memory:,1.917 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [16]:
# Summarise the H2OFrame built from df2: per-column type, min/mean/max/sigma,
# zero and missing counts, followed by the first rows.
df.describe()

Rows:2217
Cols:21




Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
type,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,enum
mins,0.039363342999999995,0.018363242,0.010974576000000002,0.000228758,0.042946274000000006,0.014922481,0.141735424,2.068455491,0.738650686,0.036876475,0.0,0.039363342999999995,0.05556534900000001,0.010952903,0.103092784,0.0078125,0.004882813,0.0078125,0.0,0.0,
mean,0.1721463952310537,0.06263388006249444,0.1776711850886157,0.1234469857310947,0.22269258635181927,0.0992456006255627,3.3191843521488664,44.867770255664375,0.9088278663708543,0.4559664436094676,0.15153075860267107,0.1721463952310537,0.13100339009645276,0.032803347734271064,0.25429736024442007,0.6796771616399276,0.048193314674197414,4.1367408083125685,4.088547493655504,0.1808500789080372,
maxs,0.240844029,0.115273247,0.25741705,0.242123532,0.268924051,0.252225201,34.72545327,1309.612887,0.981996589,0.842935931,0.28,0.240844029,0.22915253,0.168421053,0.27906976699999997,2.805245536,0.458984375,21.8671875,21.84375,0.932374101,
sigma,0.026538264308705678,0.014458970114661386,0.03619148578718896,0.04161875495776924,0.02406404155373762,0.0380216594134419,4.84059040138736,154.1804912804971,0.03883653084941137,0.16495262747371725,0.08202646201658886,0.026538264308705678,0.02959092507319634,0.017886645209025026,0.0324716256464505,0.43994130954553534,0.05694550886815284,2.933246399557713,2.928343290403696,0.12034606463534253,
zeros,0,0,0,0,0,0,0,0,0,0,218,0,0,0,0,0,0,0,62,62,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0.059780985,0.064241268,0.032026913,0.015071489,0.09019344,0.07512195099999999,12.86346184,274.40290550000003,0.893369417,0.491917766,0.0,0.059780985,0.08427910599999999,0.015701668000000002,0.275862069,0.0078125,0.0078125,0.0078125,0.0,0.0,male
1,0.06600874,0.06731002900000001,0.040228735,0.019413867,0.09266619,0.073252323,22.42328536,634.6138545,0.892193242,0.513723843,0.0,0.06600874,0.107936554,0.015825915,0.25,0.009014423,0.0078125,0.0546875,0.046875,0.052631579000000005,male
2,0.077315503,0.083829421,0.036718459,0.008701057,0.131908017,0.123206961,30.75715458,1024.927705,0.846389092,0.478904979,0.0,0.077315503,0.098706262,0.015655577,0.271186441,0.007990057,0.0078125,0.015625,0.0078125,0.046511628,male
