In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import ADASYN

%matplotlib inline


In [2]:
# Read the source CSV into the working frame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="database/ctu_df.csv")
df.head(n=5)


Unnamed: 0,ID,pH,BDecf,pCO2,BE,Apgar1,Apgar5,Gest. Weeks,Weight(g),Sex,...,Median_UC,Std_FHR,Std_UC,RMS_FHR,RMS_UC,Peak_to_RMS_FHR,Peak_to_RMS_UC,Peak_FHR,Peak_UC,target
0,1220,7.3,3.52,6.0,-4.7,9,10,42,3100.0,2,...,23.0,59.376698,21.970835,122.35702,33.143752,70.64298,93.856248,193.0,127.0,1
1,1234,7.29,2.5,6.5,-4.2,8,9,41,3200.0,1,...,20.5,59.311329,24.685834,135.710988,34.975993,88.289012,92.024007,224.0,127.0,1
2,1208,7.23,5.84,6.6,-7.4,9,9,40,3900.0,2,...,3.0,51.554942,21.705502,109.963878,26.323443,64.536122,73.676557,174.5,100.0,1
3,1038,7.33,2.72,5.7,-4.0,10,10,39,2740.0,1,...,6.0,26.070361,30.393657,135.560556,37.570082,31.189444,62.429918,166.75,100.0,1
4,1004,7.3,5.19,5.5,-6.4,8,9,41,3370.0,1,...,14.0,51.581668,16.210997,105.951334,22.31554,123.048666,100.68446,229.0,123.0,1


In [5]:
# True when at least one cell anywhere in the frame is NaN.
np.isnan(df).any(axis=None)


True

In [6]:
# True only when every cell in the frame is a finite number (no NaN, no +/-inf).
np.isfinite(df).all(axis=None)


False

In [7]:
def clean_dataset(df):
    """Return a copy of ``df`` containing only fully finite rows, cast to float64.

    A row is dropped when any of its cells is missing (NaN/None) or is
    positive/negative infinity. The caller's frame is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. Every column must be convertible to ``float64``.

    Returns
    -------
    pd.DataFrame
        The surviving rows, keeping their original index labels, as ``float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Non-inplace dropna: mutating the caller's frame was a hidden side effect.
    without_missing = df.dropna()
    # `axis` must be passed by keyword: pandas 2.0 made the arguments of
    # DataFrame.any keyword-only, so a positional `.any(1)` raises TypeError there.
    has_infinite = without_missing.isin([np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_infinite].astype(np.float64)


# Rebind `df` to the cleaned, float64-typed result; every later cell works on
# this cleaned frame rather than on the raw CSV contents.
df = clean_dataset(df)


In [8]:
# Positional split: the last column is the label, every column before it is a feature.
# NOTE(review): this keeps all non-target columns as features, including `ID` —
# confirm that identifier-like columns are really meant to feed the oversampler.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Tally how many rows belong to each class.
print('Distribution:', Counter(y))
# The counts printed above show that the classes are heavily imbalanced.


Distribution: Counter({1.0: 488, 0.0: 45})


In [9]:
# Oversample with ADASYN; the fixed random_state keeps the synthetic rows
# reproducible between runs.
# NOTE(review): the synthetic rows carry fractional values in columns that look
# discrete (e.g. `Sex`, `ID`) — confirm this is acceptable for downstream use.
ada = ADASYN(
    n_neighbors=5,
    random_state=420,
    sampling_strategy='minority',
)
X_res, y_res = ada.fit_resample(X, y)

# Re-count the labels after resampling.
print('Oversampled Target Variable Distribution:', Counter(y_res))
# The two class counts printed above are now nearly equal.

Oversampled Target Variable Distribution: Counter({1.0: 488, 0.0: 487})


In [10]:
# Re-attach the original feature names to the resampled feature matrix.
feature = pd.DataFrame(data=X_res, columns=df.columns[:-1])
# Wrap the resampled labels in a single-column frame named "target".
target = pd.DataFrame({"target": y_res})

# Place features and label side by side in one frame and display it.
full = pd.concat([feature, target], axis="columns")
full


Unnamed: 0,ID,pH,BDecf,pCO2,BE,Apgar1,Apgar5,Gest. Weeks,Weight(g),Sex,...,Median_UC,Std_FHR,Std_UC,RMS_FHR,RMS_UC,Peak_to_RMS_FHR,Peak_to_RMS_UC,Peak_FHR,Peak_UC,target
0,1220.000000,7.300000,3.520000,6.000000,-4.700000,9.000000,10.000000,42.000000,3100.000000,2.000000,...,23.000000,59.376698,21.970835,122.357020,33.143752,70.642980,93.856248,193.000000,127.000000,1.0
1,1234.000000,7.290000,2.500000,6.500000,-4.200000,8.000000,9.000000,41.000000,3200.000000,1.000000,...,20.500000,59.311329,24.685834,135.710988,34.975993,88.289012,92.024007,224.000000,127.000000,1.0
2,1208.000000,7.230000,5.840000,6.600000,-7.400000,9.000000,9.000000,40.000000,3900.000000,2.000000,...,3.000000,51.554942,21.705502,109.963878,26.323443,64.536122,73.676557,174.500000,100.000000,1.0
3,1038.000000,7.330000,2.720000,5.700000,-4.000000,10.000000,10.000000,39.000000,2740.000000,1.000000,...,6.000000,26.070361,30.393657,135.560556,37.570082,31.189444,62.429918,166.750000,100.000000,1.0
4,1004.000000,7.300000,5.190000,5.500000,-6.400000,8.000000,9.000000,41.000000,3370.000000,1.000000,...,14.000000,51.581668,16.210997,105.951334,22.315540,123.048666,100.684460,229.000000,123.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,1850.716669,7.292529,2.843164,6.394902,-4.006973,9.828304,10.000000,40.171696,4081.113390,1.000000,...,18.055669,58.626054,20.264252,94.833736,29.204359,100.353563,79.977739,195.187299,109.182098,0.0
971,1093.368138,7.253080,5.406133,6.286801,-7.024016,10.000000,10.000000,40.000000,3354.316872,1.956004,...,1.791928,68.756955,15.056900,116.791473,17.974871,65.706459,63.411002,182.497932,81.385874,0.0
972,1885.840773,7.306322,2.883183,6.068388,-3.856525,9.452896,9.589672,39.863224,4072.644776,1.000000,...,17.290298,59.463752,19.134497,96.073509,27.673802,103.121013,77.534496,199.194522,105.208298,0.0
973,1406.209311,7.282366,4.343603,6.062723,-5.583493,8.745530,9.372765,40.000000,3359.862732,1.627235,...,11.473180,67.476787,25.362490,114.947461,33.698098,78.941468,70.588700,193.888929,104.286798,0.0


In [None]:
# Write the oversampled dataset (feature columns plus the target column) to disk.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default — confirm that readers of this file expect it (otherwise pass index=False).
full.to_csv("database/final.csv")
# TODO: apply the same process to the interpolated DataFrame.