In [42]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [43]:
df = pd.read_csv("original_data/trialPromoResults.csv")
df = df.drop(["index"], axis=1)

In [44]:

print('Original dataset shape {0}, {1}'.format(df.shape[0], df.shape[1]))

Original dataset shape 1000, 10


In [45]:
df.head()
headers = df.columns.values
headers

array([' sex', ' mstatus', ' age', ' children', ' occupation',
       ' education', ' income', ' avbal', ' avtrans', ' decision'],
      dtype=object)

In [46]:
# strip the spaces in the header
df = df.rename(columns=lambda x: x.strip())
df.columns.values

array(['sex', 'mstatus', 'age', 'children', 'occupation', 'education',
       'income', 'avbal', 'avtrans', 'decision'], dtype=object)

### Perform one-hot encoding

In [50]:
categorical_features = ["sex", "mstatus", "occupation", "education"]
df[categorical_features].head()

Unnamed: 0,sex,mstatus,occupation,education
0,F,married,legal,secondary
1,M,widowed,retired,tertiary
2,M,single,manuf,professional
3,F,married,education,postgrad
4,M,single,construct,tertiary


In [51]:
label_encoder = LabelEncoder()
for categorical_feature in categorical_features:
    df[categorical_feature] = label_encoder.fit_transform(df[categorical_feature])

df.head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,decision
0,0,1,56.82,1,5,2,3105.39,33003.48,1776.81,
1,1,3,87.35,3,8,3,4874.08,18941.99,863.56,
2,1,2,28.75,0,6,1,14232.37,30013.32,3231.14,B
3,0,1,35.71,0,2,0,3214.93,15423.24,1996.09,
4,1,2,20.53,0,1,3,3214.93,15423.24,1996.09,


In [52]:
one_hot_encoder = OneHotEncoder()
df_onehotencoded = enc.fit_transform(df[categorical_features])
df_onehotencoded

AttributeError: head not found

### Insert the data into train and test 

In [13]:
train_df, test_df = train_test_split(df, test_size=0.25, random_state=92)
train_df.head()

Unnamed: 0,F,M,married,widowed,single,divorced,age,children,construct,education,...,medicine,retired,secondary,tertiary,postgrad,professional,income,avbal,avtrans,decision
988,1,0,1,0,0,0,51.4,2,0,0,...,0,0,0,0,1,0,2692.48,20954.73,2516.79,
990,0,1,0,0,1,0,20.5,0,1,0,...,0,0,1,0,0,0,2692.48,20954.73,2516.79,
972,1,0,1,0,0,0,52.02,0,0,0,...,0,0,1,0,0,0,889.98,19738.64,1201.9,
989,0,1,1,0,0,0,25.79,0,1,0,...,0,0,1,0,0,0,2692.48,20954.73,2516.79,A
23,0,1,0,0,1,0,27.43,0,0,1,...,0,0,0,1,0,0,4121.55,19983.49,1102.11,


In [14]:
test_df.head()

Unnamed: 0,F,M,married,widowed,single,divorced,age,children,construct,education,...,medicine,retired,secondary,tertiary,postgrad,professional,income,avbal,avtrans,decision
92,1,0,1,0,0,0,66.9,0,0,0,...,0,1,1,0,0,0,1927.01,5491.91,815.74,
209,0,1,0,0,1,0,27.69,0,0,0,...,0,0,1,0,0,0,2082.23,17948.54,982.88,
817,1,0,0,0,0,1,53.72,2,0,0,...,0,0,0,1,0,0,6183.15,25756.82,730.38,
130,1,0,0,0,1,0,28.31,0,0,0,...,0,0,0,1,0,0,7067.84,14085.66,3020.8,
995,1,0,0,0,0,1,47.56,0,0,0,...,0,0,0,0,0,1,11143.41,17825.67,3742.27,


In [15]:
X_train = train_df.loc[:, train_df.columns != "decision"]
y_train = train_df["decision"]
X_train.head()

Unnamed: 0,F,M,married,widowed,single,divorced,age,children,construct,education,...,manuf,medicine,retired,secondary,tertiary,postgrad,professional,income,avbal,avtrans
988,1,0,1,0,0,0,51.4,2,0,0,...,1,0,0,0,0,1,0,2692.48,20954.73,2516.79
990,0,1,0,0,1,0,20.5,0,1,0,...,0,0,0,1,0,0,0,2692.48,20954.73,2516.79
972,1,0,1,0,0,0,52.02,0,0,0,...,0,0,0,1,0,0,0,889.98,19738.64,1201.9
989,0,1,1,0,0,0,25.79,0,1,0,...,0,0,0,1,0,0,0,2692.48,20954.73,2516.79
23,0,1,0,0,1,0,27.43,0,0,1,...,0,0,0,0,1,0,0,4121.55,19983.49,1102.11


### Run the SMOTE algorithm to balance the output classes as mentioned [here](http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.SMOTE.html)

In [16]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X_train, y_train)

In [17]:
print('Resampled dataset shape {}'.format(Counter(y_res)))

Resampled dataset shape Counter({'None': 621, 'A': 621, 'B': 621})


### So SMOTE in Python cannot yet handle categorical data - refer [this](https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python)

In [18]:
X_res

array([[1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        2.69248000e+03, 2.09547300e+04, 2.51679000e+03],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        2.69248000e+03, 2.09547300e+04, 2.51679000e+03],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        8.89980000e+02, 1.97386400e+04, 1.20190000e+03],
       ...,
       [8.94479931e-01, 1.05520069e-01, 0.00000000e+00, ...,
        1.90025507e+03, 1.67555379e+04, 5.73772250e+02],
       [9.91475335e-01, 8.52466485e-03, 0.00000000e+00, ...,
        4.45949750e+03, 1.74640340e+04, 3.82894098e+03],
       [6.15170262e-02, 9.38482974e-01, 0.00000000e+00, ...,
        1.39235299e+04, 2.97638223e+04, 3.19203978e+03]])

In [19]:
columns_to_not_round = [6, 21, 22, 23]
sampled_data = ",".join(headers) + "\n" 
    
for itr in range(0, len(X_res)):
    for jtr in range(0, len(X_res[0])):
        if jtr not in columns_to_not_round:
            sampled_data += str(round(X_res[itr][jtr])) + ","
        else:
            sampled_data += str(X_res[itr][jtr]) + ","
    sampled_data += str(y_res[itr]) + "\n"
    
with open("data_details/trial_promo_training.csv", "w") as fw:
    fw.writelines(sampled_data)
    

In [20]:
test_df.to_csv("data_details/trial_promo_testing.csv", index = False)