In [1]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [68]:
# Load the raw trial-promo results and discard the exported "index" column.
df = pd.read_csv("original_data/trialPromoResults.csv").drop(["index"], axis="columns")

In [69]:

# Report how many rows and columns the loaded frame has.
n_rows, n_cols = df.shape
print('Original dataset shape {0}, {1}'.format(n_rows, n_cols))

Original dataset shape 1000, 10


In [70]:
# Capture the raw column names (they still carry leading spaces at this point).
# The former mid-cell `df.head()` was removed: only the last expression of a
# cell is displayed, so it computed a preview that was never shown.
headers = df.columns.values
headers

array([' sex', ' mstatus', ' age', ' children', ' occupation',
       ' education', ' income', ' avbal', ' avtrans', ' decision'],
      dtype=object)

In [71]:
# The raw CSV header has a leading space before each name; trim every column label.
df = df.rename(columns=str.strip)
df.columns.values

array(['sex', 'mstatus', 'age', 'children', 'occupation', 'education',
       'income', 'avbal', 'avtrans', 'decision'], dtype=object)

### Encode the categorical features (integer label codes first, then one-hot columns)

In [72]:
# String-valued columns that must be converted to numbers before modelling.
categorical_features = ["sex", "mstatus", "occupation", "education"]
df.loc[:, categorical_features].head()

Unnamed: 0,sex,mstatus,occupation,education
0,F,married,legal,secondary
1,M,widowed,retired,tertiary
2,M,single,manuf,professional
3,F,married,education,postgrad
4,M,single,construct,tertiary


In [73]:
# Fit one LabelEncoder per categorical column, store the integer codes in a new
# "<column>Num" column, and remember both the encoder and its learned classes.
label_encoders = {}
label_mappings = {}
for feature in categorical_features:
    encoder = preprocessing.LabelEncoder()
    df[feature + "Num"] = encoder.fit_transform(df[feature])
    label_encoders[feature] = encoder
    label_mappings[feature] = encoder.classes_
df.head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum
0,F,married,56.82,1,legal,secondary,3105.39,33003.48,1776.81,,0,1,5,2
1,M,widowed,87.35,3,retired,tertiary,4874.08,18941.99,863.56,,1,3,8,3
2,M,single,28.75,0,manuf,professional,14232.37,30013.32,3231.14,B,1,2,6,1
3,F,married,35.71,0,education,postgrad,3214.93,15423.24,1996.09,,0,1,2,0
4,M,single,20.53,0,construct,tertiary,3214.93,15423.24,1996.09,,1,2,1,3


In [74]:
# Show, per categorical feature, the array of class labels its encoder learned.
label_mappings

{'education': array(['postgrad', 'professional', 'secondary', 'tertiary'], dtype=object),
 'mstatus': array(['divorced', 'married', 'single', 'widowed'], dtype=object),
 'occupation': array(['IT', 'construct', 'education', 'finance', 'government', 'legal',
        'manuf', 'medicine', 'retired'], dtype=object),
 'sex': array(['F', 'M'], dtype=object)}

In [75]:
# Show the fitted LabelEncoder object stored for each categorical feature.
label_encoders

{'education': LabelEncoder(),
 'mstatus': LabelEncoder(),
 'occupation': LabelEncoder(),
 'sex': LabelEncoder()}

### Add a 0/1 indicator column for each distinct value of every categorical column (one-hot encoding)

In [76]:
# One-hot encode: for every categorical feature, add a "<feature>_<value>"
# column holding 1 where the row has that value and 0 otherwise.
# Comparing the Series against the scalar broadcasts elementwise, so there is
# no need to materialise a full-length array of repeated values per class.
for feature in categorical_features:
    for class_value in label_mappings[feature]:
        df[feature + "_" + class_value] = (df[feature] == class_value).astype(int)

df.head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,decision,...,occupation_finance,occupation_government,occupation_legal,occupation_manuf,occupation_medicine,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary
0,F,married,56.82,1,legal,secondary,3105.39,33003.48,1776.81,,...,0,0,1,0,0,0,0,0,1,0
1,M,widowed,87.35,3,retired,tertiary,4874.08,18941.99,863.56,,...,0,0,0,0,0,1,0,0,0,1
2,M,single,28.75,0,manuf,professional,14232.37,30013.32,3231.14,B,...,0,0,0,1,0,0,0,1,0,0
3,F,married,35.71,0,education,postgrad,3214.93,15423.24,1996.09,,...,0,0,0,0,0,0,1,0,0,0
4,M,single,20.53,0,construct,tertiary,3214.93,15423.24,1996.09,,...,0,0,0,0,0,0,0,0,0,1


### Split the data into training and test sets

In [77]:
# Remove the original string-valued columns, then hold out 25% of the rows
# as a test set (fixed seed so the split is repeatable).
df = df.drop(categorical_features, axis="columns")
train_df, test_df = train_test_split(df, random_state=92, test_size=0.25)
train_df.head()

Unnamed: 0,age,children,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,...,occupation_finance,occupation_government,occupation_legal,occupation_manuf,occupation_medicine,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary
988,51.4,2,2692.48,20954.73,2516.79,,0,1,6,0,...,0,0,0,1,0,0,1,0,0,0
990,20.5,0,2692.48,20954.73,2516.79,,1,2,1,2,...,0,0,0,0,0,0,0,0,1,0
972,52.02,0,889.98,19738.64,1201.9,,0,1,0,2,...,0,0,0,0,0,0,0,0,1,0
989,25.79,0,2692.48,20954.73,2516.79,A,1,1,1,2,...,0,0,0,0,0,0,0,0,1,0
23,27.43,0,4121.55,19983.49,1102.11,,1,2,2,3,...,0,0,0,0,0,0,0,0,0,1


In [78]:
# Preview the first rows of the held-out test split.
test_df.head()

Unnamed: 0,age,children,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,...,occupation_finance,occupation_government,occupation_legal,occupation_manuf,occupation_medicine,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary
92,66.9,0,1927.01,5491.91,815.74,,0,1,8,2,...,0,0,0,0,0,1,0,0,1,0
209,27.69,0,2082.23,17948.54,982.88,,1,2,4,2,...,0,1,0,0,0,0,0,0,1,0
817,53.72,2,6183.15,25756.82,730.38,,0,0,0,3,...,0,0,0,0,0,0,0,0,0,1
130,28.31,0,7067.84,14085.66,3020.8,,0,2,3,3,...,1,0,0,0,0,0,0,0,0,1
995,47.56,0,11143.41,17825.67,3742.27,,0,0,6,1,...,0,0,0,1,0,0,0,1,0,0


### Save the data into CSVs for modelling

In [None]:
# Write both splits to disk, omitting the pandas row index from the files.
train_df.to_csv(path_or_buf="working_data/trial_promo_training.csv", index=False)
test_df.to_csv(path_or_buf="working_data/trial_promo_testing.csv", index=False)

### Part 2 - Generate synthetic training data using SMOTE

In [79]:
# Separate the "decision" target from the predictor columns of the training
# split, and keep the predictor column names for writing a CSV header later.
y_train = train_df["decision"]
X_train = train_df.drop("decision", axis="columns")
X_train_columns_after_one_hot = X_train.columns.values
X_train.head()

Unnamed: 0,age,children,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,sex_F,...,occupation_finance,occupation_government,occupation_legal,occupation_manuf,occupation_medicine,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary
988,51.4,2,2692.48,20954.73,2516.79,0,1,6,0,1,...,0,0,0,1,0,0,1,0,0,0
990,20.5,0,2692.48,20954.73,2516.79,1,2,1,2,0,...,0,0,0,0,0,0,0,0,1,0
972,52.02,0,889.98,19738.64,1201.9,0,1,0,2,1,...,0,0,0,0,0,0,0,0,1,0
989,25.79,0,2692.48,20954.73,2516.79,1,1,1,2,0,...,0,0,0,0,0,0,0,0,1,0
23,27.43,0,4121.55,19983.49,1102.11,1,2,2,3,0,...,0,0,0,0,0,0,0,0,0,1


### Run the SMOTE algorithm to balance the output classes, as described in the [imbalanced-learn documentation](http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.SMOTE.html)

In [80]:
# Oversample the minority classes of the training split with SMOTE
# (fixed seed so the synthetic rows are repeatable).
sm = SMOTE(random_state=42)
# `fit_sample` was deprecated in favour of `fit_resample` and later removed
# from imbalanced-learn; use whichever method this installation provides so
# the cell runs on both old and current releases.
resample = getattr(sm, "fit_resample", None) or sm.fit_sample
X_res, y_res = resample(X_train, y_train)
# Newer releases return pandas objects for pandas input. Later cells index
# X_res / y_res positionally, so normalise both to NumPy arrays.
X_res, y_res = np.asarray(X_res), np.asarray(y_res)

In [81]:
# Tally how many rows each decision class has after resampling.
class_counts = Counter(y_res)
print('Resampled dataset shape {}'.format(class_counts))

Resampled dataset shape Counter({'None': 621, 'A': 621, 'B': 621})


### Note: SMOTE as used here cannot handle categorical data - the synthetic rows below contain fractional values in the one-hot columns. See [this Stack Overflow question](https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python)

In [82]:
# Inspect the resampled feature matrix returned by SMOTE.
X_res

array([[5.14000000e+01, 2.00000000e+00, 2.69248000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.05000000e+01, 0.00000000e+00, 2.69248000e+03, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [5.20200000e+01, 0.00000000e+00, 8.89980000e+02, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [2.36391099e+01, 0.00000000e+00, 1.90025507e+03, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.18715742e+01, 0.00000000e+00, 4.45949750e+03, ...,
        0.00000000e+00, 8.52466485e-03, 0.00000000e+00],
       [2.84374935e+01, 0.00000000e+00, 1.39235299e+04, ...,
        9.38482974e-01, 0.00000000e+00, 0.00000000e+00]])

In [84]:
# Serialise the resampled training data to CSV: a header line with the
# predictor column names plus "decision", then one line per resampled row.
# Each data line keeps the original layout "<v1>,<v2>,...,<vn>,<label>".

# Positional indices of feature columns whose values should be rounded before
# writing. Left empty, so every value is written unrounded.
columns_to_round = []

# Positional access below requires array-likes; this is a no-op for ndarrays.
feature_matrix = np.asarray(X_res)
labels = np.asarray(y_res)

# Collect lines in a list and join once instead of growing a single string
# with `+=` inside a nested loop.
csv_lines = [",".join(np.append(X_train_columns_after_one_hot, "decision"))]
for row_values, label in zip(feature_matrix, labels):
    cells = [
        str(round(value)) if col_idx in columns_to_round else str(value)
        for col_idx, value in enumerate(row_values)
    ]
    cells.append(str(label))
    csv_lines.append(",".join(cells))
sampled_data = "\n".join(csv_lines) + "\n"

# `write` emits the text in one call; `writelines` on a str would iterate it
# character by character.
with open("working_data/trial_promo_training_smote.csv", "w") as fw:
    fw.write(sampled_data)
    

In [85]:
# Write the (un-resampled) test split next to the SMOTE training file, without the row index.
test_df.to_csv("working_data/trial_promo_testing_smote.csv", index = False)