In [28]:
from pytorch_tabnet.tab_network import EmbeddingGenerator
import torch
import arff
from scipy.io import arff
import pandas as pd
import numpy as np
import os
import re

In [29]:
# Seed every RNG this notebook uses so results that depend on random
# initialisation are reproducible across "Restart & Run All".
# One named constant keeps the torch and numpy seeds in sync.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

## Read in the ARFF file

In [30]:
# Name of the dataset file, and its path inside the local `datasets` directory.
file = "data_436.arff"
filename = os.path.join("datasets", file)

In [31]:
# Raw text of the ARFF file, one string per line.
# NOTE(review): `lines` is not referenced by any later cell visible here —
# confirm it is still needed before keeping this cell.
with open(filename, "r") as arff_file:
    lines = list(arff_file)

In [32]:
# Parallel lists describing the categorical (nominal) attributes:
#   cat_index[k] -> column position of the k-th nominal attribute
#   cat_dims[k]  -> number of categories declared for that attribute
cat_index, cat_dims = [], []

In [33]:
# Parse the ARFF file: `data` holds the records, `meta` describes the attributes.
data, meta = arff.loadarff(filename)
attr_names = meta.names()
attr_types = meta.types()

# Column positions of the nominal attributes and, for each of them, the number
# of values listed for it in the metadata (meta[name][1]).
# NOTE(review): every nominal attribute is included, so a nominal target column
# would be treated as a categorical feature too — confirm that is intended.
cat_index = [pos for pos, kind in enumerate(attr_types) if kind == 'nominal']
cat_dims = [len(meta[attr_names[pos]][1]) for pos in cat_index]


In [34]:
# Build the DataFrame from the records already parsed by `arff.loadarff`
# in the previous cell instead of opening and parsing the same file a second
# time; `df` and the categorical metadata then come from a single parse.
# `dataset` is kept bound to the (data, meta) pair, as loadarff returned it,
# for any cell that still refers to it.
dataset = (data, meta)
df = pd.DataFrame(data)

In [35]:
df.head() # nominal columns are shown as bytes (b'1', b'P'); they are label-encoded in a later cell

Unnamed: 0,Whether_of_not_the_TA_is_a_native_English_speaker,Course_instructor,Course,Summer_or_regular_semester,Class_size,binaryClass
0,b'1',23.0,3.0,b'1',19.0,b'P'
1,b'2',15.0,3.0,b'1',17.0,b'P'
2,b'1',23.0,3.0,b'2',49.0,b'P'
3,b'1',5.0,2.0,b'2',33.0,b'P'
4,b'2',7.0,11.0,b'2',55.0,b'P'


In [36]:
# Despite the name, this is the full shape tuple (n_rows, n_columns);
# only the column count, input_dim[1], is used when building the embedder.
input_dim = df.shape

## Create embeddings

In [37]:
# Embedding module setup: every categorical column listed in `cat_index`
# gets a 3-dimensional embedding (cat_emb_dim is hard-coded to 3 for now).
embedding_generator = EmbeddingGenerator(
    input_dim=input_dim[1],
    cat_dims=cat_dims,
    cat_idxs=cat_index,
    cat_emb_dim=3,
)

In [38]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refitted for every object-dtype column and the
# column is replaced by the resulting integer codes. Because it is refitted
# each time, `le` ends up fitted on the last column processed only.
le = LabelEncoder()

object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    df[col] = le.fit_transform(df[col])


In [39]:
# Inspect the frame after label encoding.
df

Unnamed: 0,Whether_of_not_the_TA_is_a_native_English_speaker,Course_instructor,Course,Summer_or_regular_semester,Class_size,binaryClass
0,0,23.0,3.0,0,19.0,1
1,1,15.0,3.0,0,17.0,1
2,0,23.0,3.0,1,49.0,1
3,0,5.0,2.0,1,33.0,1
4,1,7.0,11.0,1,55.0,1
...,...,...,...,...,...,...
146,1,3.0,2.0,1,26.0,0
147,1,10.0,3.0,1,12.0,0
148,0,18.0,7.0,1,48.0,0
149,1,22.0,1.0,1,51.0,0


In [40]:
# The embedding module is called with a torch tensor, not a DataFrame,
# so convert the frame's underlying array first.
tensor_data = torch.tensor(df.to_numpy())

In [41]:

# Run the whole table through the embedding module.
# NOTE(review): no training step is visible before this call, so the
# embedding weights are presumably still at their initial values — confirm.
embeddings = embedding_generator(tensor_data)

In [42]:
# Detach from the autograd graph before converting to a numpy array.
t_np = embeddings.detach().numpy()

# Keep the embedded table under its own name so `df` still refers to the
# label-encoded input and the earlier cells remain re-runnable.
embeddings_df = pd.DataFrame(t_np)

# Create the output directory if needed, then write <dataset name>.csv into it.
# splitext swaps only the real extension, unlike a plain substring replace.
os.makedirs('embeddings', exist_ok=True)
csv_name = os.path.splitext(file)[0] + '.csv'
embeddings_df.to_csv(os.path.join('embeddings', csv_name), index=False)

# Sanity check: the DataFrame, numpy array and tensor should share one shape.
# The last line prints the shape of the input table for comparison.
print(embeddings_df.shape)
print(t_np.shape)
print(embeddings.shape)
print(input_dim)

(151, 12)
(151, 12)
torch.Size([151, 12])
(151, 6)


In [43]:
# Show the first embedded row; it still carries a grad_fn because
# `embeddings` itself has not been detached.
print(embeddings[0])

tensor([ 1.5410, -0.2934, -2.1788, 23.0000,  3.0000,  0.4033,  0.8380, -0.7193,
        19.0000,  0.1227, -0.5663,  0.3731], grad_fn=<SelectBackward0>)
