In [39]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE 
from sklearn import preprocessing

In [42]:
# Load the trial promotion results; the path is relative to the notebook's
# working directory.
promo_csv_path = "data_details/trialPromoResults.csv"
df = pd.read_csv(promo_csv_path)

In [43]:
# Drop the CSV's leftover "index" column. errors="ignore" keeps this cell
# re-runnable: on a second execution the column is already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=["index"], errors="ignore")

# "None" is a genuine class label here (customers who took neither offer),
# but recent pandas releases parse the literal text "None" as a missing value
# when reading CSVs. Map any NaN back to the label so the target keeps its
# three classes; on pandas versions that keep the string this is a no-op.
df["decision"] = df["decision"].fillna("None")

# Target vector and its (imbalanced) class distribution.
y = df["decision"]
print('Original dataset shape {}'.format(Counter(y)))

Original dataset shape Counter({'None': 828, 'A': 133, 'B': 39})


In [44]:
# Preview the raw (not yet encoded) feature columns. `X` itself is only
# defined in a later cell, so build the preview from `df` directly; this keeps
# the notebook runnable top to bottom on a fresh kernel.
df.drop(columns=["decision"]).head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans
0,F,married,56.82,1,legal,secondary,3105.39,33003.48,1776.81
1,M,widowed,87.35,3,retired,tertiary,4874.08,18941.99,863.56
2,M,single,28.75,0,manuf,professional,14232.37,30013.32,3231.14
3,F,married,35.71,0,education,postgrad,3214.93,15423.24,1996.09
4,M,single,20.53,0,construct,tertiary,3214.93,15423.24,1996.09


In [45]:
# Label-encode the categorical feature columns of `df` in place so that the
# feature matrix is fully numeric.
#
# LabelEncoder assigns codes by the sorted order of the fitted labels, not by
# the order they are listed in (e.g. "construct" -> 1 and "postgrad" -> 0 in
# the preview below).
categorical_levels = [
    ("sex", ["F", "M"]),
    ("mstatus", ["divorced", "married", "single", "widowed"]),
    ("occupation", ["construct", "education", "finance", "government", "IT",
                    "legal", "manuf", "medicine", "retired"]),
    ("education", ["secondary", "tertiary", "postgrad", "professional"]),
]

le = preprocessing.LabelEncoder()
for column, levels in categorical_levels:
    # Re-running this cell would otherwise feed already-encoded integers to
    # the encoder, which raises on "unseen labels"; skip columns that are
    # numeric already so the cell is idempotent.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    le.fit(levels)
    df[column] = le.transform(df[column])

df.head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,decision
0,0,1,56.82,1,5,2,3105.39,33003.48,1776.81,
1,1,3,87.35,3,8,3,4874.08,18941.99,863.56,
2,1,2,28.75,0,6,1,14232.37,30013.32,3231.14,B
3,0,1,35.71,0,2,0,3214.93,15423.24,1996.09,
4,1,2,20.53,0,1,3,3214.93,15423.24,1996.09,


In [47]:
# Feature matrix: every column of the encoded frame except the target.
X = df.drop(columns=["decision"])
X.head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans
0,0,1,56.82,1,5,2,3105.39,33003.48,1776.81
1,1,3,87.35,3,8,3,4874.08,18941.99,863.56
2,1,2,28.75,0,6,1,14232.37,30013.32,3231.14
3,0,1,35.71,0,2,0,3214.93,15423.24,1996.09
4,1,2,20.53,0,1,3,3214.93,15423.24,1996.09


### Run the SMOTE algorithm to balance the output classes as mentioned [here](http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.SMOTE.html)

In [48]:
# Oversample the minority classes with SMOTE (fixed seed for reproducibility).
sm = SMOTE(random_state=42)

# `fit_resample` is the current name of this API; the older `fit_sample`
# alias was deprecated and later removed from imbalanced-learn.
# Passing the underlying array (rather than the DataFrame) keeps `X_res` a
# plain NumPy array on every library version, as newer releases otherwise
# return a DataFrame when given one.
X_res, y_res = sm.fit_resample(X.values, y)

In [49]:
# Class distribution after oversampling.
resampled_counts = Counter(y_res)
print('Resampled dataset shape {}'.format(resampled_counts))

Resampled dataset shape Counter({'None': 828, 'B': 828, 'A': 828})


### SMOTE in Python cannot yet handle categorical data - refer to [this](https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python)

Employing the workaround mentioned in the above link

In [51]:
# Inspect the resampled feature matrix. Note that the rows near the end
# contain non-integer values in label-encoded columns (e.g. ~0.20 in the
# first column, `sex`), i.e. values that do not correspond to any category.
X_res

array([[0.00000000e+00, 1.00000000e+00, 5.68200000e+01, ...,
        3.10539000e+03, 3.30034800e+04, 1.77681000e+03],
       [1.00000000e+00, 3.00000000e+00, 8.73500000e+01, ...,
        4.87408000e+03, 1.89419900e+04, 8.63560000e+02],
       [1.00000000e+00, 2.00000000e+00, 2.87500000e+01, ...,
        1.42323700e+04, 3.00133200e+04, 3.23114000e+03],
       ...,
       [2.03701249e-01, 1.00000000e+00, 5.78820503e+01, ...,
        3.39828853e+03, 1.28325956e+04, 1.75215081e+03],
       [0.00000000e+00, 1.00000000e+00, 3.35669645e+01, ...,
        7.20328308e+03, 7.30739274e+04, 4.51064777e+03],
       [1.00000000e+00, 1.00000000e+00, 3.88666578e+01, ...,
        2.04640286e+03, 1.86638747e+04, 5.96466926e+02]])