In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import SMOTE


## Import Data

In [86]:
# Root of the data directory, relative to this notebook.
base_dir = "../data/"
# Dataset partitions to load; each has a raw and a decimal CSV.
headers = ["train", "test"]

# Maps "<partition>-raw" / "<partition>-decimal" to the loaded DataFrame.
datasets = {}
for split_name in headers:
    raw_path = f"{base_dir}raw/RAID0-{split_name}-8.csv"
    decimal_path = f"{base_dir}processed/RAID0-{split_name}-8-decimal.csv"
    # The raw CSV is read with header=None, so its columns are integer positions.
    datasets[f"{split_name}-raw"] = pd.read_csv(raw_path, header=None)
    # The decimal CSV is read with its first row as the column names.
    datasets[f"{split_name}-decimal"] = pd.read_csv(decimal_path)

In [87]:
# Preview the first rows of the raw training frame (integer column labels, since it was read with header=None).
datasets["train-raw"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,151,152,153,154,155,156,157,158,159,160
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,1,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,1,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0


In [88]:
# Preview the first rows of the decimal training frame (named columns taken from the CSV header).
datasets["train-decimal"].head()

Unnamed: 0,target,pending_1,pending_2,pending_3,pending_4,ebusy
0,41484,58017,43907,27559,4414,0
1,50667,27964,20245,38763,49381,0
2,18180,35005,38883,25895,48760,0
3,47150,61601,56266,22271,47014,0
4,37502,36211,1174,21981,23085,0


## Create Split Data
### (X, y representation)

In [89]:
class Data:
    """Pair a feature table with its label vector.

    Attributes:
        X: the feature object passed to the constructor, stored unchanged.
        y: the label object passed to the constructor, stored unchanged.
    """

    def __init__(self, X, y):
        # Keep references to the inputs as-is; nothing is copied or validated.
        self.X, self.y = X, y

In [90]:
# Split every loaded frame into features (X) and label (y).
#
# In both layouts loaded above the label is the LAST column ("ebusy" in the
# decimal files, column 160 in the raw files), so slicing relative to the end
# yields the same X / y as the former hard-coded widths (5 and 160) without a
# per-layout branch, and keeps working if the feature count changes.
splitted_data = {}
for key, frame in datasets.items():
    if frame.shape[1] < 2:
        # Without at least one feature and one label column the split is meaningless.
        raise ValueError(
            f"{key}: expected at least one feature column plus a label column, "
            f"got {frame.shape[1]} column(s)"
        )
    X = frame.iloc[:, :-1]  # every column except the last -> features
    y = frame.iloc[:, -1]   # last column -> label Series
    splitted_data[key] = Data(X, y)

In [91]:
# Check the feature half of the decimal training split: the label column should no longer be present.
splitted_data["train-decimal"].X.head()

Unnamed: 0,target,pending_1,pending_2,pending_3,pending_4
0,41484,58017,43907,27559,4414
1,50667,27964,20245,38763,49381
2,18180,35005,38883,25895,48760
3,47150,61601,56266,22271,47014
4,37502,36211,1174,21981,23085


In [92]:
# Check the label half of the decimal test split (a single Series).
splitted_data["test-decimal"].y.head()

0    0
1    1
2    1
3    0
4    0
Name: ebusy, dtype: int64

In [93]:
# Check the feature half of the raw test split (columns 0..159 selected by the split cell).
splitted_data["test-raw"].X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,159
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,1,0,0,1


In [94]:
# Check the label half of the raw training split (column 160 of the raw frame).
splitted_data["train-raw"].y.head()

0    0
1    0
2    0
3    0
4    0
Name: 160, dtype: int64

## SMOTE Upsampling

In [102]:
# Oversample the minority class of every split with SMOTE and store the result
# as Data(X=<features DataFrame>, y=<single-column label DataFrame>).
#
# NOTE(review): this loop also resamples the "test-*" splits. Evaluating on
# synthetic test rows is usually not intended — confirm whether only the
# training splits should be oversampled.
# NOTE(review): the raw frames appear to hold 0/1 features; SMOTE interpolates
# between neighbours, so synthetic rows may contain fractional values — confirm
# that this is acceptable for the raw representation.

# Fixed seed so the synthetic samples (and therefore the CSVs written from
# them) are identical on every Restart & Run All.
SMOTE_SEED = 42

upsampled_data = {}
for key, split in splitted_data.items():
    X_upsampled, y_upsampled = SMOTE(random_state=SMOTE_SEED).fit_resample(split.X, split.y)

    # Depending on the imbalanced-learn version, fit_resample returns either
    # NumPy arrays or pandas objects; positional slicing such as
    # X_upsampled[:, 0] only works for the former. Going through np.asarray
    # and re-attaching the source column names handles both cases and gives
    # every layout its original labels ("target", "pending_*", "ebusy" for the
    # decimal files; 0..159 and 160 for the raw files). This also prevents the
    # raw label column from being labelled 0 and colliding with feature
    # column 0 once X and y are concatenated.
    X_frame = pd.DataFrame(np.asarray(X_upsampled), columns=split.X.columns)
    y_frame = pd.DataFrame({split.y.name: np.asarray(y_upsampled)})

    upsampled_data[key] = Data(X_frame, y_frame)

## Dump upsampled data

In [103]:
# Output directory for the SMOTE CSVs. Note: this rebinds base_dir, which the
# loading cell set to "../data/"; re-run the cells in order to avoid mixing the two.
base_dir = "../data/processed/"

In [104]:
# Write one CSV per upsampled dataset into base_dir: feature columns first,
# label column last.
for key, resampled in upsampled_data.items():
    # Side-by-side concat puts the label to the right of the features.
    data = pd.concat([resampled.X, resampled.y], axis=1)
    # index=False (the documented boolean form) omits the row index, so the
    # file contains only the feature and label columns.
    data.to_csv(f"{base_dir}RAID0-{key}-8-SMOTE.csv", index=False)