In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from scipy.io import loadmat, savemat
import io
import sklearn
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
from sklearn.manifold import Isomap
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from mode_inference import mode_inference
from tsne_inference import tsne_inference
%matplotlib inline
%load_ext autoreload
%autoreload 2

### Breast cancer dataset

In [13]:
# Load scikit-learn's bundled breast cancer dataset; the cells below use
# its `.data` (features) and `.target` (labels) attributes.
from sklearn.datasets import load_breast_cancer
bc = load_breast_cancer()

In [14]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
bc_split = train_test_split(bc.data, bc.target, test_size=0.2, random_state=10)
bc_data_train, bc_data_test, bc_target_train, bc_target_test = bc_split


In [15]:
# Normalize the features to mean 0 and std 1. The statistics are estimated on
# the training split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(bc_data_train)
bc_data_train = scaler.transform(bc_data_train)
bc_data_test = scaler.transform(bc_data_test)

In [16]:
# Export both breast cancer splits as .mat files. "Score" and "labels" each
# hold the targets as a float column vector of shape (n_samples, 1).
bc_train_labels = np.array(bc_target_train, dtype=float).reshape((-1, 1))
bc_test_labels = np.array(bc_target_test, dtype=float).reshape((-1, 1))

bc_train_mat = {
    "StockName": [],
    "StockData": bc_data_train,
    "Score": bc_train_labels,
    "labels": bc_train_labels.copy(),
}
bc_test_mat = {
    "StockName": [],
    "StockData": bc_data_test,
    "Score": bc_test_labels,
    "labels": bc_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/breast_cancer_train.mat", bc_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/breast_cancer_test.mat", bc_test_mat, appendmat=True)

### Heart beat

In [17]:
# Read the two header-less PTB files and tag every row with the file it came
# from (0 = normal file, 1 = abnormal file), then stack them into one frame.
hb1 = pd.read_csv("../data/heartbeat/ptbdb_normal.csv", header=None).assign(**{"class": 0})
hb2 = pd.read_csv("../data/heartbeat/ptbdb_abnormal.csv", header=None).assign(**{"class": 1})
hb = pd.concat([hb1, hb2], axis=0)
hb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,179,180,181,182,183,184,185,186,187,class
0,1.0,0.900324,0.35859,0.051459,0.046596,0.126823,0.133306,0.119125,0.110616,0.113047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,0.794681,0.375387,0.116883,0.0,0.171923,0.283859,0.293754,0.325912,0.345083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.909029,0.791482,0.423169,0.186712,0.0,0.007836,0.063032,0.077002,0.074957,0.077342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,0.478893,0.05676,0.064176,0.081289,0.072732,0.055619,0.048774,0.054478,0.041643,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,0.867238,0.20136,0.099349,0.141336,0.120934,0.108516,0.096393,0.093436,0.100828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [18]:
# Drop exact duplicate rows (rebinding instead of mutating in place).
hb = hb.drop_duplicates()

In [19]:
# train-test split
# NOTE(review): every column except the appended "class" is used as a feature,
# including CSV column 187. Some distributions of the PTB files store the class
# label in that last column; if that is the case here it leaks into the
# features -- TODO confirm against the raw files.
hb_features = np.array(hb.iloc[:, :-1])
hb_labels = np.array(hb["class"])
hb_train, hb_test, hb_train_target, hb_test_target = train_test_split(
    hb_features, hb_labels, test_size=0.2, random_state=10)

In [20]:
# Export both heartbeat splits as .mat files. "Score" and "labels" each hold
# the targets as a float column vector of shape (n_samples, 1).
hb_train_labels = np.array(hb_train_target, dtype=float).reshape((-1, 1))
hb_train_mat = {
    "StockName": [],
    "StockData": hb_train,
    "Score": hb_train_labels,
    "labels": hb_train_labels.copy(),
}
savemat("../Experiments_MATLAB/data/hb_train.mat", hb_train_mat, appendmat=True)

hb_test_labels = np.array(hb_test_target, dtype=float).reshape((-1, 1))
hb_test_mat = {
    "StockName": [],
    "StockData": hb_test,
    "Score": hb_test_labels,
    "labels": hb_test_labels.copy(),
}
savemat("../Experiments_MATLAB/data/hb_test.mat", hb_test_mat, appendmat=True)

### Madelon

In [21]:
# Load the Madelon CSV (its first row supplies the column names) and preview it.
# The last column, "Class", is used as the label by the split cell below.
madelon = pd.read_csv("../data/madelon_csv.csv")
madelon.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V492,V493,V494,V495,V496,V497,V498,V499,V500,Class
0,485,477,537,479,452,471,491,476,475,473,...,481,477,485,511,485,481,479,475,496,2
1,483,458,460,487,587,475,526,479,485,469,...,478,487,338,513,486,483,492,510,517,2
2,487,542,499,468,448,471,442,478,480,477,...,481,492,650,506,501,480,489,499,498,2
3,480,491,510,485,495,472,417,474,502,476,...,480,474,572,454,469,475,482,494,461,1
4,484,502,528,489,466,481,402,478,487,468,...,479,452,435,486,508,481,504,495,511,1


In [22]:
# train/test split: all columns but the last are features, the last one is the label.
madelon_features = madelon.iloc[:, :-1]
madelon_labels = madelon.iloc[:, -1]
madelon_X_train, madelon_X_test, madelon_y_train, madelon_y_test = train_test_split(
    madelon_features, madelon_labels, test_size=0.2, random_state=10)

In [23]:
# feature normalisation
# Create a scaler for this dataset instead of re-fitting the instance that was
# instantiated in the breast-cancer section, so this section no longer depends
# on that earlier cell having been run. Statistics come from the training split
# only and are reused for the test split.
scaler = StandardScaler()
madelon_X_train = scaler.fit_transform(madelon_X_train)
madelon_X_test = scaler.transform(madelon_X_test)

In [24]:
# Export both Madelon splits as .mat files. "Score" and "labels" each hold
# the targets as a float column vector of shape (n_samples, 1).
madelon_train_labels = np.array(madelon_y_train, dtype=float).reshape((-1, 1))
madelon_test_labels = np.array(madelon_y_test, dtype=float).reshape((-1, 1))

madelon_train_mat = {
    "StockData": madelon_X_train,
    "Score": madelon_train_labels,
    "labels": madelon_train_labels.copy(),
}
madelon_test_mat = {
    "StockData": madelon_X_test,
    "Score": madelon_test_labels,
    "labels": madelon_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/madelon_train.mat", madelon_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/madelon_test.mat", madelon_test_mat, appendmat=True)

### CIFAR-10

In [25]:
import pickle
def unpickle(file):
    """Load and return the object stored in the pickle file at path `file`.

    The file is read with ``encoding='bytes'``, so 8-bit strings pickled by
    Python 2 come back as ``bytes`` objects (which is why callers index the
    result with ``b'...'`` keys).

    WARNING: ``pickle.load`` can execute arbitrary code while deserializing;
    only call this on files from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [27]:
# Read the class-name list from the metadata file and load the first training
# batch; both dictionaries are keyed by bytes because of encoding='bytes'.
cifar_meta = unpickle('../data/cifar-10-batches-py/batches.meta')
label_names = cifar_meta[b'label_names']
batch_1 = unpickle('../data/cifar-10-batches-py/data_batch_1')

In [28]:
# Split the first CIFAR batch 80/20 and export both parts as .mat files.
cifar_features = batch_1[b'data']
cifar_labels = batch_1[b'labels']
cifar_train, cifar_test, cifar_train_target, cifar_test_target = train_test_split(
    cifar_features, cifar_labels, test_size=0.2, random_state=10)

cifar_train_labels = np.array(cifar_train_target, dtype=float).reshape((-1, 1))
cifar_test_labels = np.array(cifar_test_target, dtype=float).reshape((-1, 1))

# Features are divided by 255.0 (presumably mapping 8-bit pixel values into
# [0, 1] -- confirm the dtype of batch_1[b'data']).
cifar_train_mat = {
    "StockData": cifar_train / 255.0,
    "Score": cifar_train_labels,
    "labels": cifar_train_labels.copy(),
}
cifar_test_mat = {
    "StockData": cifar_test / 255.0,
    "Score": cifar_test_labels,
    "labels": cifar_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/cifar_train.mat", cifar_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/cifar_test.mat", cifar_test_mat, appendmat=True)

### EEG Eye State

In [29]:
# Load the EEG Eye State CSV, which has no header row, and preview it.
# The last column (14) is treated as the label by the cells below.
eeg = pd.read_csv("../data/EEG_Eye_State.csv", header=None)
eeg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,0


In [30]:
# removing the outliers: compute the per-feature interquartile range
# (label column excluded) that the next cell uses to build the outlier fences.
eeg_data = eeg.iloc[:, :-1]
Q1, Q3 = eeg_data.quantile(0.25), eeg_data.quantile(0.75)
IQR = Q3 - Q1
print(IQR)


0     31.28
1     32.31
2     20.51
3     24.10
4     15.39
5     14.88
6     25.64
7     19.48
8     18.46
9     18.98
10    21.02
11    19.49
12    26.67
13    30.77
dtype: float64


In [31]:
# Drop every row in which at least one feature lies outside the
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] fences, then renumber the remaining rows.
# NOTE: `eeg_data` keeps the pre-filter index, so this cell must be run right
# after the cell that defines it (re-running it alone misaligns the mask).
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier_row = ((eeg_data < lower_fence) | (eeg_data > upper_fence)).any(axis=1)
eeg = eeg[~is_outlier_row].reset_index(drop=True)

In [32]:
# (rows, columns) of the EEG frame at this point in the notebook.
eeg.shape

(11853, 15)

In [33]:
# train/test split: all columns but the last are features, the last one is the label.
eeg_features = eeg.iloc[:, :-1]
eeg_labels = eeg.iloc[:, -1]
eeg_train, eeg_test, eeg_train_target, eeg_test_target = train_test_split(
    eeg_features, eeg_labels, test_size=0.2, random_state=10)

In [34]:
# normalisation
# Create a scaler for this dataset instead of re-fitting the instance that was
# instantiated in the breast-cancer section, so this section no longer depends
# on that earlier cell having been run. Statistics come from the training split
# only and are reused for the test split.
scaler = StandardScaler()
eeg_train = scaler.fit_transform(eeg_train)
eeg_test = scaler.transform(eeg_test)

In [35]:
# Export both EEG splits as .mat files. "Score" and "labels" each hold
# the targets as a float column vector of shape (n_samples, 1).
eeg_train_labels = np.array(eeg_train_target, dtype=float).reshape((-1, 1))
eeg_test_labels = np.array(eeg_test_target, dtype=float).reshape((-1, 1))

eeg_train_mat = {
    "StockData": eeg_train,
    "Score": eeg_train_labels,
    "labels": eeg_train_labels.copy(),
}
eeg_test_mat = {
    "StockData": eeg_test,
    "Score": eeg_test_labels,
    "labels": eeg_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/eeg_train.mat", eeg_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/eeg_test.mat", eeg_test_mat, appendmat=True)

### Wine quality

In [36]:
# Load the white-wine quality CSV (semicolon-separated, header in the first row) and preview it.
wine = pd.read_csv("../data/winequality-white.csv", sep=";")
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [37]:
# Drop exact duplicate rows (rebinding instead of mutating in place).
wine = wine.drop_duplicates()

In [38]:
# Collapse the quality score into 3 classes: 3-5 -> 0, 6 -> 1, 7-9 -> 2.
# NOTE: the column is overwritten, so this cell must not be run twice on the
# same frame (the already-mapped values 0/1/2 are not keys of the mapping).
d = {**dict.fromkeys((3, 4, 5), 0), 6: 1, **dict.fromkeys((7, 8, 9), 2)}
wine["quality"] = wine["quality"].map(d)

In [39]:
# Class balance after the 3-class mapping.
wine["quality"].value_counts()

1    1788
0    1348
2     825
Name: quality, dtype: int64

In [40]:
# train/test split: all columns but the last are features, the last one ("quality") is the label.
wine_features = wine.iloc[:, :-1]
wine_labels = wine.iloc[:, -1]
wine_train, wine_test, wine_train_target, wine_test_target = train_test_split(
    wine_features, wine_labels, test_size=0.2, random_state=10)

In [41]:
# normalisation
# Create a scaler for this dataset instead of re-fitting the instance that was
# instantiated in the breast-cancer section, so this section no longer depends
# on that earlier cell having been run. Statistics come from the training split
# only and are reused for the test split.
scaler = StandardScaler()
wine_train = scaler.fit_transform(wine_train)
wine_test = scaler.transform(wine_test)

In [42]:
# Export both wine splits as .mat files. "Score" and "labels" each hold
# the targets as a float column vector of shape (n_samples, 1).
wine_train_labels = np.array(wine_train_target, dtype=float).reshape((-1, 1))
wine_test_labels = np.array(wine_test_target, dtype=float).reshape((-1, 1))

wine_train_mat = {
    "StockData": wine_train,
    "Score": wine_train_labels,
    "labels": wine_train_labels.copy(),
}
wine_test_mat = {
    "StockData": wine_test,
    "Score": wine_test_labels,
    "labels": wine_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/wine_train.mat", wine_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/wine_test.mat", wine_test_mat, appendmat=True)

### Phishing

In [44]:
# Load the phishing CSV, which has no header row, and preview it.
# Column 30 (the last one) is treated as the label by the cells below.
phishing_df = pd.read_csv("../data/phishing.csv", header=None)
phishing_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [45]:
# Recode the label column (30) from {-1, 1} to {0, 1}.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-mapped frame will not reproduce the same result (0 is not a key of `d`).
d = {-1:0, 1:1}
phishing_df[30] = phishing_df[30].map(d)

In [46]:
# Keep one row per distinct feature vector (the label column is excluded from
# the comparison); the surviving rows keep their original index labels.
phishing_features = phishing_df.iloc[:, :-1]
phishing = phishing_features.drop_duplicates()

In [47]:
# Labels (last column) of the rows that survived de-duplication.
# `phishing.index` holds index *labels*, so the rows are selected with `.loc`;
# the previous `.iloc[phishing.index, -1]` only worked because the frame
# happens to have a default 0..n-1 index where labels equal positions.
phishing_target = np.array(phishing_df.loc[phishing.index].iloc[:, -1])

In [48]:
# all the features are categorical, so we encode them by one-hot encoding
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current name first and fall back to the old one so
# the encoder returns a dense array on either side of the rename.
try:
    one_hot = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(sparse=False)
phishing = one_hot.fit_transform(phishing)

In [49]:
# Hold out 20% of the encoded samples as a test set; the fixed seed keeps the split reproducible.
phishing_split = train_test_split(phishing, phishing_target, test_size=0.2, random_state=10)
phishing_train, phishing_test, phishing_train_target, phishing_test_target = phishing_split

In [50]:
# Export both phishing splits as .mat files. "Score" and "labels" each hold
# the targets as a float column vector of shape (n_samples, 1).
phishing_train_labels = np.array(phishing_train_target, dtype=float).reshape((-1, 1))
phishing_test_labels = np.array(phishing_test_target, dtype=float).reshape((-1, 1))

phishing_train_mat = {
    "StockData": phishing_train,
    "Score": phishing_train_labels,
    "labels": phishing_train_labels.copy(),
}
phishing_test_mat = {
    "StockData": phishing_test,
    "Score": phishing_test_labels,
    "labels": phishing_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/phishing_train.mat", phishing_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/phishing_test.mat", phishing_test_mat, appendmat=True)

### Wafer

In [75]:
# Load the tab-separated, header-less Wafer training file and show its dimensions.
wafer_df = pd.read_csv("../data/Archive/Wafer/Wafer_TRAIN.tsv", sep="\t", header=None)
wafer_df.shape

(1000, 153)

In [76]:
# Feature matrix: the 'Data' variable of the Wafer .mat file.
# NOTE(review): the labels are taken from Wafer_TRAIN.tsv (column 0), so the
# rows of this matrix are assumed to be in the same order as the TSV rows --
# TODO confirm how Wafer1000x128.mat was produced.
wafer = loadmat("../data/Archive/Wafer/Wafer1000x128.mat")['Data']

In [77]:
# Preview the first rows of the Wafer TSV; column 0 is used as the label below.
wafer_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,143,144,145,146,147,148,149,150,151,152
0,1,-1.602294,-1.670823,-1.693666,-1.699377,-1.699377,-1.70366,-1.70366,-1.70366,-1.70366,...,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432
1,1,1.084591,1.084591,1.084591,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,...,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308
2,1,0.362689,0.362689,0.362689,0.393316,0.362689,0.362689,0.362689,0.362689,0.362689,...,0.393316,0.393316,0.393316,0.362689,0.393316,0.393316,0.393316,0.393316,0.393316,0.393316
3,1,-1.094523,-1.094523,-1.094523,-1.096732,-1.094523,-1.096732,-1.094523,-1.094523,-1.094523,...,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732
4,1,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,...,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761


In [78]:
# Recode the label column (0) of the TSV from {-1, 1} to {0, 1}.
wafer_label_map = {-1: 0, 1: 1}
wafer_target = np.array(wafer_df[0].map(wafer_label_map))

In [79]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
wafer_split = train_test_split(wafer, wafer_target, test_size=0.2, random_state=10)
wafer_train, wafer_test, wafer_train_target, wafer_test_target = wafer_split

In [80]:
# Export both wafer splits as .mat files. "Score" and "labels" each hold
# the targets as a float column vector of shape (n_samples, 1).
wafer_train_labels = np.array(wafer_train_target, dtype=float).reshape((-1, 1))
wafer_test_labels = np.array(wafer_test_target, dtype=float).reshape((-1, 1))

wafer_train_mat = {
    "StockData": wafer_train,
    "Score": wafer_train_labels,
    "labels": wafer_train_labels.copy(),
}
wafer_test_mat = {
    "StockData": wafer_test,
    "Score": wafer_test_labels,
    "labels": wafer_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/wafer_train.mat", wafer_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/wafer_test.mat", wafer_test_mat, appendmat=True)

### Arrow

In [84]:
# Feature matrix: the 'Data' variable of the Arrow .mat file.
# NOTE(review): the labels are taken from MixedShapesRegularTrain_TRAIN.tsv
# (column 0), so the rows of this matrix are assumed to be in the same order
# as the TSV rows -- TODO confirm how Arrow500_1024.mat was produced.
arrow = loadmat("../Experiments_MATLAB/data/data_no_orders/Arrow500_1024.mat")['Data']

In [81]:
# Tab-separated, header-less training file; only its first column (the label) is used below.
arrow_df = pd.read_csv(
    "../data/Archive/MixedShapesRegularTrain/MixedShapesRegularTrain_TRAIN.tsv",
    sep="\t",
    header=None,
)

In [83]:
# Class balance of the label column (column 0).
arrow_df[0].value_counts()

5    100
4    100
3    100
2    100
1    100
Name: 0, dtype: int64

In [85]:
# Labels as a NumPy array, taken unchanged from column 0 of the TSV.
arrow_target = np.array(arrow_df[0])

In [86]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
arrow_split = train_test_split(arrow, arrow_target, test_size=0.2, random_state=10)
arrow_train, arrow_test, arrow_train_target, arrow_test_target = arrow_split

In [87]:
# Export both arrow splits as .mat files. "Score" and "labels" each hold
# the targets as a float column vector of shape (n_samples, 1).
arrow_train_labels = np.array(arrow_train_target, dtype=float).reshape((-1, 1))
arrow_test_labels = np.array(arrow_test_target, dtype=float).reshape((-1, 1))

arrow_train_mat = {
    "StockData": arrow_train,
    "Score": arrow_train_labels,
    "labels": arrow_train_labels.copy(),
}
arrow_test_mat = {
    "StockData": arrow_test,
    "Score": arrow_test_labels,
    "labels": arrow_test_labels.copy(),
}

savemat("../Experiments_MATLAB/data/arrow_train.mat", arrow_train_mat, appendmat=True)
savemat("../Experiments_MATLAB/data/arrow_test.mat", arrow_test_mat, appendmat=True)

In [5]:
# Parse the small-stock file once instead of once per variable.
small_stock = loadmat("../Experiments_MATLAB/data/small_stock.mat")
s = small_stock["Score"]
# The following cell rewrites this same file with the series stored under
# "StockData" instead of "StockData128". Accept either key so that this cell
# can be re-run after the rewrite without raising a KeyError.
data = small_stock["StockData128"] if "StockData128" in small_stock else small_stock["StockData"]
names = small_stock["StockName"]

In [6]:
# Write the arrays back under the key names used by the other exported files
# ("StockData", "Score", "StockName"). This overwrites the file that the
# previous cell read from.
d = dict(StockData=data, Score=s, StockName=names)
savemat("../Experiments_MATLAB/data/small_stock.mat", d, appendmat=True)