# Test `datasets`

In [1]:
import sys
sys.path.append('..')
from model_reboot.EIF_reboot import ExtendedTree, ExtendedIsolationForest
import numpy as np
import pandas as pd
from utils_reboot.datasets import Dataset
from sklearn.preprocessing import StandardScaler,MinMaxScaler

## `X,y`

In [2]:
# Build the Dataset wrapper for "wine" from the real-data folder, then
# call drop_duplicates() on it (presumably removes repeated samples in
# place -- confirm in utils_reboot.datasets).
dataset = Dataset("wine", path="../data/real/")
dataset.drop_duplicates()

In [3]:
# Partition the full feature matrix / label vector held by the dataset.
# partition_data returns four arrays, unpacked in train/test order.
X_train, X_test, y_train, y_test = dataset.partition_data(dataset.X, dataset.y)

# Report every shape with a single print call, one line per array.
print(
    f'X_train shape: {X_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_train shape: {y_train.shape}',
    f'y_test shape: {y_test.shape}',
    sep='\n',
)

X_train shape: (119, 13)
X_test shape: (10, 13)
y_train shape: (119,)
y_test shape: (10,)


### `pre_process` with split

In [4]:
# Pre-process the train/test partitions. The call returns four values:
# the processed splits plus X and y, all unpacked here.
# NOTE: X_train / X_test are overwritten, so re-running this cell feeds
# already pre-processed arrays back into pre_process.
X_train, X_test, X, y = dataset.pre_process(X_train, X_test)

# Report every shape with a single print call, one line per array.
print(
    f'X_train shape: {X_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_train shape: {y_train.shape}',
    f'y_test shape: {y_test.shape}',
    f'X shape: {X.shape}',
    sep='\n',
)

X_train shape: (119, 13)
X_test shape: (10, 13)
y_train shape: (119,)
y_test shape: (10,)
X shape: (129, 13)


In [5]:
# Inspect the pre-processed training matrix returned by pre_process above
# (NumPy abbreviates the repr of large arrays with "...").
X_train

array([[-0.38352175, -1.25265739, -3.38319954, ...,  0.57538935,
        -0.7627292 , -0.2922407 ],
       [-0.44214614, -1.12399951, -0.14889387, ...,  1.37230895,
        -0.98235682,  0.76964154],
       [ 0.01219289, -0.91493047, -1.06293677, ...,  0.29646749,
        -1.09949155, -0.75681418],
       ...,
       [ 0.93552704,  1.43307573, -0.21920486, ..., -1.25752574,
        -1.14341707,  1.79833997],
       [ 0.78896606,  0.07412694,  0.1675056 , ..., -1.21767976,
        -1.05556603,  1.83152379],
       [ 2.19595143,  1.28833562,  1.46825897, ..., -1.17783378,
        -1.08484971, -0.02677014]])

### `pre_process` without `split`

In [6]:
# Exercise pre_process with split=False. In the recorded run this only
# printed "X_train not loaded. Load it running split_dataset() first",
# i.e. this path appears to need dataset.split_dataset() to have been
# called beforehand -- confirm against utils_reboot.datasets.
dataset.pre_process(X_train,X_test,split=False)

X_train not loaded. Load it running split_dataset() first


In [13]:
# Shape of the full feature matrix stored on the dataset object
# ((129, 13) in the recorded run).
dataset.X.shape

(129, 13)

## `X_train,y_train`

In [11]:
# Run the dataset's own split; the next cell reads dataset.X_train and
# dataset.y_train, which are presumably populated by this call.
dataset.split_dataset()

In [12]:
# Partition only the dataset's training portion (dataset.X_train /
# dataset.y_train). The "1" suffix keeps these arrays separate from the
# partitions created in the `X,y` section.
X_train1, X_test1, y_train1, y_test1 = dataset.partition_data(
    dataset.X_train, dataset.y_train
)

# Report every shape with a single print call, one line per array.
print(
    f'X_train shape: {X_train1.shape}',
    f'X_test shape: {X_test1.shape}',
    f'y_train shape: {y_train1.shape}',
    f'y_test shape: {y_test1.shape}',
    sep='\n',
)

X_train shape: (93, 13)
X_test shape: (10, 13)
y_train shape: (93,)
y_test shape: (10,)


In [13]:
# Pre-process the partitions built from dataset.X_train in the cell above.
# The call previously received X_train / X_test, which are the arrays from
# the `X,y` section that had already been through pre_process, so this
# section never exercised its own X_train1 / X_test1 split.
# The processed splits are stored under new names so that re-running this
# cell does not feed its own output back into pre_process; X and y are
# still rebound as before.
X_train1_pre, X_test1_pre, X, y = dataset.pre_process(X_train1, X_test1)

## Feature Names 

### `pima`

In [10]:
# Read the raw pima CSV with pandas and preview its first rows.
data = pd.read_csv('../data/real/pima.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
# Column labels of the pima CSV loaded in the previous cell.
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

### `moodify`

In [12]:
# Read the raw moodify CSV with pandas and preview its first rows.
data = pd.read_csv('../data/real/moodify.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,Target
0,0,195000.0,0.611,0.614,-8.815,0.0672,0.0169,0.000794,0.753,0.52,128.05,3.446154e-07,0
1,1,194641.0,0.638,0.781,-6.848,0.0285,0.0118,0.00953,0.349,0.25,122.985,1.464234e-07,0
2,2,217573.0,0.56,0.81,-8.029,0.0872,0.0071,8e-06,0.241,0.247,170.044,4.00785e-07,0
3,3,443478.0,0.525,0.699,-4.571,0.0353,0.0178,8.8e-05,0.0888,0.199,92.011,7.959809e-08,0
4,4,225862.0,0.367,0.771,-5.863,0.106,0.365,1e-06,0.0965,0.163,115.917,4.693131e-07,0


In [13]:
# Column labels of the moodify CSV loaded in the previous cell. The
# recorded output includes an 'Unnamed: 0' column -- presumably an index
# that was written into the file when it was saved (TODO confirm).
data.columns

Index(['Unnamed: 0', 'duration (ms)', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'spec_rate', 'Target'],
      dtype='object')

### `diabetes`

In [14]:
# Read the raw diabetes CSV with pandas and preview its first rows.
data = pd.read_csv('../data/real/diabetes.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,Target
0,0,80.0,25.19,6.6,140,0
1,1,54.0,27.32,6.6,80,0
2,2,28.0,27.32,5.7,158,0
3,3,36.0,23.45,5.0,155,0
4,4,76.0,20.14,4.8,155,0


In [15]:
# Column labels of the diabetes CSV loaded in the previous cell. The
# recorded output includes an 'Unnamed: 0' column -- presumably an index
# that was written into the file when it was saved (TODO confirm).
data.columns

Index(['Unnamed: 0', 'age', 'bmi', 'HbA1c_level', 'blood_glucose_level',
       'Target'],
      dtype='object')

## Test `feature_names`

In [4]:
# Rebuild the Dataset for "wine" and show its feature_names attribute
# (integer indices 0-12 in the recorded run).
dataset = Dataset("wine", path="../data/real/")
dataset.feature_names

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [5]:
# Rebuild the Dataset for "pima" and show its feature_names attribute
# (in the recorded run: the CSV column labels without 'Outcome').
dataset = Dataset("pima", path="../data/real/")
dataset.feature_names

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [6]:
# Rebuild the Dataset for "moodify" and show its feature_names attribute
# (in the recorded run: the CSV column labels without 'Unnamed: 0' and
# 'Target').
dataset = Dataset("moodify", path="../data/real/")
dataset.feature_names

['duration (ms)',
 'danceability',
 'energy',
 'loudness',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'spec_rate']

In [7]:
# Rebuild the Dataset for "diabetes" and show its feature_names attribute
# (in the recorded run: the CSV column labels without 'Unnamed: 0' and
# 'Target').
dataset = Dataset("diabetes", path="../data/real/")
dataset.feature_names

['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']