In [95]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

In [96]:
def _save_sorted_csv(frame, path):
    """Write ``frame`` to ``path`` in original-index order with a fresh 0..n-1 index.

    The parent folder is created when it does not exist yet, so the first
    ``to_csv`` call does not fail on a missing output directory.
    """
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    frame.sort_index().reset_index(drop=True).to_csv(path)


def Resample(df, target, resampling, filename, random_state=42):
    """Split ``df`` 70/30 into train/test and optionally balance the training classes.

    Files that are always written (paths are relative to the working directory):

    * ``../data/train/noresampling/<filename>_0.csv`` -- the untouched training split
    * ``../data/test/<filename>.csv`` -- the held-out test split

    Additional file, depending on ``resampling``:

    * ``"under"`` -> ``../data/train/undersampled/<filename>_0.csv``
      (the larger class is randomly cut down to the size of the smaller one)
    * ``"over"`` -> ``../data/train/oversampled/<filename>_0.csv``
      (the smaller class is randomly drawn with replacement up to the size of
      the larger one)

    Any other ``resampling`` value only writes the two base files.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, features plus the ``target`` column.
    target : str
        Name of the label column. Classes are looked up as the values 0 and 1.
    resampling : str
        ``"under"`` or ``"over"``; see above.
    filename : str
        Base name (without extension) used for every output file.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` and ``DataFrame.sample``.
        A fixed seed matters because this function is called once per
        resampling mode and every call rewrites the test file: without it each
        call draws a different split, so a resampled training file from one
        call can contain rows of the test file written by a later call.
        Pass ``None`` to get a fresh random split on every call.

    Raises
    ------
    ValueError
        If ``resampling`` is ``"under"`` or ``"over"`` and the training split
        does not contain both class 0 and class 1.
    """
    X = df.drop(columns=[target])
    y = df[target].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=random_state
    )

    # Put features and labels back together for each split
    df_test = X_test.copy()
    df_test[target] = y_test

    df_train = X_train.copy()
    df_train[target] = y_train

    _save_sorted_csv(df_train, '../data/train/noresampling/'+filename+'_0.csv')
    _save_sorted_csv(df_test, '../data/test/'+filename+'.csv')

    print("_______________________________________________________________")

    class_counts = df_train[target].value_counts()
    print('No Resampling:\n'+ str(class_counts))

    if resampling not in ("under", "over"):
        return

    # Default of 0 keeps a missing class from turning into a None comparison
    count_class_0 = int(class_counts.get(0, 0))
    count_class_1 = int(class_counts.get(1, 0))
    if count_class_0 == 0 or count_class_1 == 0:
        raise ValueError(
            "Cannot resample: the training split needs rows for both class 0 "
            "and class 1 of '" + str(target) + "' "
            "(found " + str(count_class_0) + " and " + str(count_class_1) + ")."
        )

    # Split into both classes
    df_class_0 = df_train[df_train[target] == 0]
    df_class_1 = df_train[df_train[target] == 1]

    # On a tie class 1 is treated as the majority
    zero_is_majority = count_class_0 > count_class_1
    if zero_is_majority:
        majority, minority = df_class_0, df_class_1
    else:
        majority, minority = df_class_1, df_class_0

    if resampling == "under":
        print("More 0 than 1" if zero_is_majority else "More 1 than 0")

        # Shrink the majority class to the minority size
        majority_under = majority.sample(len(minority), random_state=random_state)
        df_train_under = pd.concat([majority_under, minority], axis=0)

        print('Random under-sampling:\n'+ str(df_train_under[target].value_counts()))
        _save_sorted_csv(df_train_under, '../data/train/undersampled/'+filename+'_0.csv')
    else:
        # Grow the minority class (drawing with replacement) to the majority size
        minority_over = minority.sample(
            len(majority), replace=True, random_state=random_state
        )
        df_train_over = pd.concat([majority, minority_over], axis=0)

        print('Random over-sampling:\n'+ str(df_train_over[target].value_counts()))
        _save_sorted_csv(df_train_over, '../data/train/oversampled/'+filename+'_0.csv')

In [97]:
# Processed 'online_shoppers_intentions' table; 'Revenue' is the label passed to Resample.
filename1 = 'online_shoppers_intentions'
target1 = 'Revenue'
df1 = pd.read_csv("../data/processed/"+filename1+".csv", index_col=0)

# Run the under-sampling pass first, then the over-sampling pass.
for sampling_mode in ("under", "over"):
    Resample(df1, target1, sampling_mode, filename1)

_______________________________________________________________
No Resampling:
0    7295
1    1336
Name: Revenue, dtype: int64
More 0 than 1
Random under-sampling:
1    1336
0    1336
Name: Revenue, dtype: int64
_______________________________________________________________
No Resampling:
0    7295
1    1336
Name: Revenue, dtype: int64
Random over-sampling:
1    7295
0    7295
Name: Revenue, dtype: int64


In [98]:
# Processed 'marketing_campaign' table; 'Teenhome' is the label passed to Resample.
filename2 = 'marketing_campaign'
target2 = 'Teenhome'
df2 = pd.read_csv("../data/processed/"+filename2+".csv", index_col=0)

# Run the under-sampling pass first, then the over-sampling pass.
for sampling_mode in ("under", "over"):
    Resample(df2, target2, sampling_mode, filename2)

_______________________________________________________________
No Resampling:
0    811
1    757
Name: Teenhome, dtype: int64
More 0 than 1
Random under-sampling:
1    757
0    757
Name: Teenhome, dtype: int64
_______________________________________________________________
No Resampling:
0    811
1    757
Name: Teenhome, dtype: int64
Random over-sampling:
1    811
0    811
Name: Teenhome, dtype: int64


In [99]:
# Processed 'heart' table; 'target' is the label passed to Resample.
filename3 = 'heart'
target3 = 'target'
df3 = pd.read_csv("../data/processed/"+filename3+".csv", index_col=0)

# Run the under-sampling pass first, then the over-sampling pass.
for sampling_mode in ("under", "over"):
    Resample(df3, target3, sampling_mode, filename3)

_______________________________________________________________
No Resampling:
1    115
0     97
Name: target, dtype: int64
More 1 than 0
Random under-sampling:
1    97
0    97
Name: target, dtype: int64
_______________________________________________________________
No Resampling:
1    115
0     97
Name: target, dtype: int64
Random over-sampling:
1    115
0    115
Name: target, dtype: int64
