In [1]:
import numpy as np
import pandas as pd

In [84]:
# Musk (Version 2) dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/Musk+%28Version+2%29
path = './raw/musk_ver2/'
# The file is read with header=None, so the column labels are assigned here:
# two identifier columns, features numbered 1..162, four named features and
# the class label in the last position.
column_names = [
    'mol_name', 'conf_name',
    *range(1, 163),
    'oxy_dis', 'oxy_x', 'oxy_y', 'oxy_z', 'class_',
]
df = pd.read_csv(path + 'clean2.data', header=None)
df.columns = column_names
# Target is the class label; the two identifier columns are not features.
y_data = df['class_'].astype('int64')
x_data = df.drop(columns=['class_', 'mol_name', 'conf_name'])
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
0    5581
1    1017
Name: class_, dtype: int64
x na count total: 
0


In [79]:
# Quality Assessment of Digital Colposcopies dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/Quality+Assessment+of+Digital+Colposcopies
path = './raw/colposcopy/'
df_green = pd.read_csv(path + 'green.csv')
df_hinselmann = pd.read_csv(path + 'hinselmann.csv')
df_schiller = pd.read_csv(path + 'schiller.csv')
# Stack the three files into one frame. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# ignore_index=True gives a fresh 0..n-1 index, so no separate
# reset_index() / del df['index'] step is needed.
df = pd.concat([df_green, df_hinselmann, df_schiller], ignore_index=True)

y_data = df.consensus
# Columns 62 to 68, starting with "experts", are also target labels.
# The column 'consensus' is made from these columns, so only the first
# 62 columns are kept as features.
x_data = df.iloc[:, :62]
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
1.0    216
0.0     71
Name: consensus, dtype: int64
x na count total: 
0


In [77]:
# Z-Alizadeh Sani dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/Z-Alizadeh+Sani
path = './raw/CAD_diagnosis/'
df = pd.read_excel(path + 'CAD_diagnosis.xlsx')
# Binary target: 1 where the 'Cath' column reads 'Cad', 0 for anything else.
y_data = (df['Cath'] == 'Cad').astype('int64')
# One-hot encode the remaining columns; drop_first=True drops the first
# level of every encoded column.
x_data = pd.get_dummies(
    df.drop(columns='Cath'),
    drop_first=True,
    dtype='int64',
)
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
1    216
0     87
Name: Cath, dtype: int64
x na count total: 
0


In [76]:
# Spambase dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/Spambase

path = './raw/spambase/'
# Read with header=None, so the columns carry integer labels.
df = pd.read_csv(path + 'spambase.data', header=None)
# Column 57 is used as the target; every other column is a feature.
y_data = df.loc[:, 57]
x_data = df.drop(columns=57)
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
0    2788
1    1813
Name: 57, dtype: int64
x na count total: 
0


In [80]:
# Sports articles for objectivity analysis dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/Sports+articles+for+objectivity+analysis
path = './raw/sports_articles_objectivity/'
# Load the feature table and discard the 'TextID' and 'URL' columns
# right away; they are not used as features.
df = pd.read_csv(path + 'features.csv').drop(columns=['TextID', 'URL'])
# Binary target: 1 where 'Label' is 'subjective', 0 for anything else.
y_data = (df['Label'] == 'subjective').astype('int64')
x_data = df.drop(columns='Label')
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
0    635
1    365
Name: Label, dtype: int64
x na count total: 
0


In [82]:
# Sonar detection: mines vs. rocks.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+%28Sonar%2C+Mines+vs.+Rocks%29
path = './raw/sonar_mines_rocks/'
# Read with header=None, so the columns carry integer labels 0..60.
df = pd.read_csv(path + 'sonar.all-data', header=None)
# Column 60 holds the class label: 'R' is encoded as 1, anything else as 0.
y_data = df[60].apply(lambda x: 1 if x == 'R' else 0)
# Features are columns 0..59. Note that `df[:60]` would slice the first
# 60 ROWS (and keep the label column), so positional column selection
# with iloc is required here.
x_data = df.iloc[:, :60]
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
0    111
1     97
Name: 60, dtype: int64
x na count total: 
0


In [119]:
# First-order theorem proving dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/First-order+theorem+proving
path = './raw/first_order_theorem_proving///'
# Combine the train, test and validation files (in that order) into one
# frame. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0. ignore_index=True gives the combined frame
# a unique 0..n-1 index instead of repeating each file's own row numbers.
df = pd.concat(
    [
        pd.read_csv(path + file_name, header=None)
        for file_name in ('train.csv', 'test.csv', 'validation.csv')
    ],
    ignore_index=True,
)
# Binary target from column 56: 1 where the value is 1, 0 otherwise.
y_data = df[56].apply(lambda x: 1 if x == 1 else 0)
# The first 51 columns are used as features.
x_data = df.iloc[:, :51]
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
0    3564
1    2554
Name: 56, dtype: int64
x na count total: 
0


In [160]:
# SECOM dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/SECOM
path = './raw/secom/'
# Features: space-separated file without a header row. Missing values are
# imputed with the mean of their column.
x_data = pd.read_csv(path + 'secom.data', sep=' ', header=None)
x_data = x_data.fillna(x_data.mean())
# Labels: the first column of the label file holds the class. It is mapped
# to a binary target: 1 where the value is 1, 0 otherwise.
y_data = pd.read_csv(path + 'secom_labels.data', sep=' ', header=None)
y_data = y_data[0].eq(1).astype('int64')
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
0    1463
1     104
Name: 0, dtype: int64
x na count total: 
0


In [12]:
# Epileptic Seizure Recognition dataset.
# Source and description:
# https://archive.ics.uci.edu/ml/datasets/Epileptic+Seizure+Recognition
path = './raw/epileptic_seizure///'
df = pd.read_csv(path + 'data.csv')

# Classes 2 to 5 are subjects who did not have an epileptic seizure; only
# class 1 is the positive class, so the target is 1 for class 1 and 0 for
# everything else.
y_data = df['y'].eq(1).astype('int64')
# Features: everything except the label and the 'Unnamed: 0' column.
x_data = df.drop(columns=['y', 'Unnamed: 0'])
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')


y value counts: 
0    9200
1    2300
Name: y, dtype: int64
x na count total: 
0


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41,X42,X43,X44,X45,X46,X47,X48,X49,X50,X51,X52,X53,X54,X55,X56,X57,X58,X59,X60,X61,X62,X63,X64,X65,X66,X67,X68,X69,X70,X71,X72,X73,X74,X75,X76,X77,X78,X79,X80,X81,X82,X83,X84,X85,X86,X87,X88,X89,X90,X91,X92,X93,X94,X95,X96,X97,X98,X99,X100,X101,X102,X103,X104,X105,X106,X107,X108,X109,X110,X111,X112,X113,X114,X115,X116,X117,X118,X119,X120,X121,X122,X123,X124,X125,X126,X127,X128,X129,X130,X131,X132,X133,X134,X135,X136,X137,X138,X139,X140,X141,X142,X143,X144,X145,X146,X147,X148,X149,X150,X151,X152,X153,X154,X155,X156,X157,X158,X159,X160,X161,X162,X163,X164,X165,X166,X167,X168,X169,X170,X171,X172,X173,X174,X175,X176,X177,X178
0,135,190,229,223,192,125,55,-9,-33,-38,-10,35,64,113,152,164,127,50,-47,-121,-138,-125,-101,-50,11,39,24,48,64,46,13,-19,-61,-96,-130,-132,-116,-115,-71,-14,25,19,6,9,21,13,-37,-58,-33,5,47,80,101,88,73,69,41,-13,-31,-61,-80,-77,-66,-43,5,87,129,121,88,12,-76,-150,-207,-186,-165,-148,-103,-33,40,94,75,8,-81,-155,-227,-262,-233,-218,-187,-126,-65,-12,27,61,49,9,-46,-124,-210,-281,-265,-181,-89,-4,53,53,38,43,31,34,9,-7,-34,-70,-84,-101,-70,-11,42,62,66,74,64,59,56,36,-11,-30,-43,-23,8,42,77,103,135,121,79,59,43,54,90,111,107,64,32,18,-25,-69,-65,-44,-33,-57,-88,-114,-130,-114,-83,-53,-79,-72,-85,-109,-98,-72,-65,-63,-11,10,8,-17,-15,-31,-77,-103,-127,-116,-83,-51
1,386,382,356,331,320,315,307,272,244,232,237,258,212,2,-267,-605,-850,-1001,-1109,-1090,-967,-746,-464,-152,118,318,427,473,485,447,397,339,312,314,326,335,332,324,310,312,309,309,303,297,295,295,293,286,279,283,301,308,285,252,215,194,169,111,-74,-388,-679,-892,-949,-972,-1001,-1006,-949,-847,-668,-432,-153,72,226,326,392,461,495,513,511,496,479,453,440,427,414,399,385,385,404,432,444,437,418,392,373,363,365,372,385,388,383,371,360,353,334,303,252,200,153,151,143,48,-206,-548,-859,-1067,-1069,-957,-780,-597,-460,-357,-276,-224,-210,-350,-930,-1413,-1716,-1360,-662,-96,243,323,241,29,-167,-228,-136,27,146,229,269,297,307,303,305,306,307,280,231,159,85,51,43,62,63,63,69,89,123,136,127,102,95,105,131,163,168,164,150,146,152,157,156,154,143,129
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,-99,-94,-96,-104,-103,-92,-75,-69,-69,-53,-37,-14,-10,-39,-78,-102,-98,-80,-54,-40,-35,-39,-32,-13,7,34,41,33,6,-15,-30,-47,-53,-65,-64,-68,-85,-98,-109,-82,-57,-38,-40,-36,-31,-13,11,19,9,-20,-48,-71,-71,-57,-32,-13,6,29,27,25,10,-7,-36,-47,-37,-36,-22,-32,-38,-55,-61,-64,-72,-67,-53,-25,-10,-4,-23,-55,-93,-102,-106,-101,-69,-45,-42,-57,-64,-77,-80,-77,-78,-56,-34,-5,10,5,-5,-44,-75,-99,-110,-104,-103,-94,-105,-108,-110,-99,-89,-82,-76,-80,-90,-106,-106,-108,-87,-60,-37,-26,-15,-6,-14,-23,-34,-41,-54,-82,-107,-126,-124,-108,-84,-68,-61,-56,-63,-62,-33,1,28,45,37,48,62,80,66,23,-11,-39,-44,-42,-45,-48,-42,-6,29,57,64,48,19,-12,-30,-35,-35,-36
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,-72,-68,-74,-80,-83,-73,-68,-61,-58,-59,-64,-79,-84,-97,-94,-84,-77,-75,-72,-68,-76,-76,-72,-67,-69,-69,-69,-67,-68,-69,-67,-66,-58,-54,-56,-70,-80,-82,-85,-74,-70,-71,-82,-88,-93,-97,-89,-87,-83,-70,-50,-37,-31,-32,-39,-54,-64,-68,-67,-69,-63,-60,-63,-55,-43,-37,-27,-31,-35,-47,-58,-63,-74,-73,-67,-60,-56,-49,-46,-57,-58,-62,-63,-63,-61,-56,-65,-62,-57,-61,-63,-66,-69,-86,-89,-86,-83,-87,-80,-69,-62,-57,-60,-60,-68,-58,-53,-57,-66,-66,-73,-78,-73,-84,-92,-97,-88,-81,-72,-61,-66,-72,-88,-90,-88,-77,-58,-53,-61,-69,-66,-74,-69,-61,-51,-45,-45,-49,-58,-64,-78,-80,-90,-87,-83,-78,-64,-38,-22,-29,-42,-51,-68,-71,-69,-69,-74,-74,-80,-82,-81,-80,-77,-85,-77,-72,-69,-65
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,-90,-103,-84,-43,-9,3,-21,-60,-96,-103,-75,-29,14,55,78,73,28,-13,-43,-68,-78,-75,-55,-41,-19,-20,-29,-36,-20,1,16,14,-14,-42,-56,-45,-45,-45,-38,-47,-45,-37,-3,23,39,27,0,-28,-44,-37,-22,5,30,31,6,-32,-27,-27,2,13,-6,-29,-41,-22,-13,-16,-31,-52,-60,-40,-16,0,14,24,36,39,34,17,-7,-14,-1,16,27,28,18,-2,-8,9,27,23,21,10,15,22,41,49,55,57,46,37,31,40,38,35,30,3,-34,-51,-42,-23,-1,23,35,35,17,-1,-17,-8,26,55,54,38,19,4,-1,10,22,26,37,38,26,10,-4,-13,-8,0,10,19,29,57,63,45,7,-13,-23,-9,9,11,3,-1,-2,4,18,27,27,14,15,11,10,4,2,-12,-32,-41,-65,-83,-89,-73
5,55,28,18,16,16,19,25,40,52,66,81,98,111,122,105,85,66,51,34,19,16,8,-5,-21,-18,-14,3,27,46,57,60,51,35,22,20,28,28,35,43,46,62,83,92,100,107,100,82,65,42,20,-8,-7,17,53,82,98,96,86,71,51,32,23,23,30,27,32,33,29,24,23,24,45,74,96,106,105,93,75,47,37,31,25,22,21,17,34,43,52,47,45,56,67,75,71,50,7,-28,-59,-71,-61,-28,4,26,44,42,26,2,-14,-18,-20,-14,-12,-14,-21,-36,-39,-31,3,30,42,35,32,24,20,11,-1,-26,-51,-75,-91,-81,-36,-2,30,34,28,14,2,-1,-4,-17,-30,-45,-71,-81,-78,-69,-60,-45,-33,-25,-14,-2,3,-5,-18,-39,-61,-73,-77,-76,-79,-64,-40,-25,0,9,12,-6,-12,-31,-42,-54,-60,-64,-60,-56,-55
6,-55,-9,52,111,135,129,103,72,37,0,-38,-77,-113,-128,-121,-105,-71,-27,13,44,60,64,40,30,25,28,34,31,18,-19,-53,-76,-81,-74,-57,-36,-21,12,55,99,115,118,98,67,25,-16,-50,-66,-51,-18,17,48,68,77,75,75,63,48,36,33,20,-3,-22,-35,-22,8,61,112,140,138,101,57,22,4,-7,-7,1,6,-2,-13,-11,-13,3,25,49,65,72,79,75,75,75,72,61,43,22,-14,-35,-47,-44,-39,-32,-25,-12,11,42,70,79,82,66,40,22,11,5,-20,-35,-56,-69,-73,-59,-33,6,59,99,123,118,96,58,15,-15,-42,-47,-42,-30,-22,-12,9,29,46,68,94,109,107,88,52,5,-24,-37,-50,-58,-66,-65,-60,-45,-20,1,23,50,74,99,125,141,129,95,41,-21,-77,-117,-135,-137,-125,-99,-79,-62,-41,-26,11,67,128
7,1,-2,-8,-11,-12,-17,-15,-16,-18,-17,-19,-18,-16,-15,-14,-21,-19,-24,-24,-24,-17,-20,-23,-15,-17,-20,-18,-19,-20,-19,-18,-20,-25,-27,-24,-22,-20,-9,0,12,18,25,23,20,17,12,6,-1,-5,-10,-13,-13,-17,-20,-20,-19,-20,-21,-22,-24,-27,-29,-31,-36,-45,-49,-60,-71,-83,-89,-97,-103,-105,-103,-104,-97,-99,-99,-101,-96,-91,-78,-64,-48,-36,-23,-15,-14,-17,-18,-15,-14,-13,-12,-17,-21,-22,-23,-14,-12,-9,-12,-18,-16,-19,-23,-21,-18,-17,-15,-10,-7,-9,-7,-2,0,11,18,26,30,30,39,38,28,14,4,-8,-9,-9,-8,-3,3,1,-4,-12,-15,-20,-25,-23,-20,-26,-24,-25,-35,-41,-41,-53,-61,-58,-59,-55,-53,-65,-78,-87,-97,-100,-106,-104,-107,-110,-110,-109,-104,-118,-111,-102,-80,-67,-79,-91,-97,-88,-76,-72,-66,-57,-39
8,-278,-246,-215,-191,-177,-167,-157,-139,-118,-92,-63,-39,-11,14,36,60,70,78,79,69,27,-45,-123,-183,-218,-242,-256,-256,-236,-205,-165,-125,-84,-41,-10,12,35,58,71,85,98,107,106,97,77,46,-2,-77,-130,-142,-141,-120,-144,-170,-189,-215,-237,-242,-236,-208,-163,-98,-28,29,70,92,102,113,122,129,125,123,118,117,116,116,108,88,62,22,-27,-85,-90,-71,-47,-55,-107,-169,-194,-210,-202,-186,-145,-99,-53,-17,1,15,23,33,44,56,77,100,123,144,155,158,154,151,152,146,143,131,119,93,39,-63,-203,-315,-352,-311,-254,-207,-188,-180,-170,-149,-120,-87,-45,-3,29,45,52,50,51,59,64,74,79,81,76,65,63,62,65,67,70,63,45,20,-11,-32,-26,3,40,85,124,182,248,349,418,419,291,73,-152,-311,-386,-400,-379,-336,-281,-226,-174,-125,-79,-40
9,8,15,13,3,-6,-8,-5,4,25,41,48,44,34,16,-2,-11,-24,11,33,43,48,42,33,14,-1,-7,-17,-36,-59,-74,-88,-84,-64,-37,-8,24,47,49,42,22,0,-21,-26,-18,-12,-7,-9,-16,-31,-37,-41,-36,-26,-6,12,18,30,38,48,56,57,52,34,17,3,3,12,30,35,19,7,-17,-27,-24,-7,14,20,17,2,-11,-14,-4,7,16,20,12,-4,-12,-1,14,35,45,39,10,-19,-42,-62,-68,-62,-44,-20,-5,10,0,-16,-10,9,27,35,25,-3,-25,-47,-58,-61,-42,-7,26,54,63,53,26,4,-1,2,11,17,12,-5,-12,-21,-22,-19,-13,-8,1,24,36,43,39,29,12,-8,-10,-13,-12,-7,1,3,6,13,12,7,0,2,2,15,28,27,18,2,-5,-6,13,41,66,72,68,65,49,31,11,-5,-17,-19,-15,-15,-11


In [15]:
# Santander customer satisfaction dataset.
# Source: https://www.kaggle.com/c/santander-customer-satisfaction/data
path = './raw/santander_customer_satisfaction/'
df = pd.read_csv(path + 'train.csv')
# 'TARGET' is the label; 'ID' is dropped along with it so that only the
# remaining columns are used as features.
y_data = df['TARGET']
x_data = df.drop(columns=['TARGET', 'ID'])
# Sanity check: class balance and total number of missing feature values.
print(f'y value counts: \n{y_data.value_counts()}')
print(f'x na count total: \n{x_data.isna().sum().sum()}')

y value counts: 
0    73012
1     3008
Name: TARGET, dtype: int64
x na count total: 
0
