In [1]:
import pandas as pd 
# Read the category dataset; the path is relative to the current working directory.
data = pd.read_csv('category.csv')

In [2]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11591 entries, 0 to 11590
Data columns (total 3 columns):
Unnamed: 0     11591 non-null int64
DESCRIPTION    11591 non-null object
CATEGORY       11591 non-null object
dtypes: int64(1), object(2)
memory usage: 271.8+ KB


In [3]:
# Collect every description, in row order, as a plain Python list for the
# vectorizer. Series.tolist() reads the column values directly instead of doing
# one label lookup per row, so it is faster and does not depend on the index
# labels being unique.
corpus = data['DESCRIPTION'].tolist()
len(corpus)

11591

In [4]:
# Turn each description into a TF-IDF weighted bag of single words.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),     # unigrams only
    stop_words='english',   # drop scikit-learn's built-in English stop-word list
    max_features=8100,      # vocabulary cap; 8100 == 90 * 90, the autoencoder's input width
)

In [5]:
# Fit the vectorizer on the corpus and build the document-term matrix in one call.
X = vectorizer.fit_transform(corpus)
print(X.shape)  # (number of documents, number of vocabulary terms)

(11591, 8100)


In [6]:
import numpy as np
import torch
import torchvision
from torch import nn
from torch.autograd import Variable

In [7]:
# Hyperparameters
num_epochs = 10  # number of full passes over the data made by the training loop
batch_size = 500  # NOTE(review): not referenced by any cell visible here; the training loop feeds one row per step
learning_rate = 1e-3  # step size handed to the Adam optimizer

In [8]:
class autoencoder(nn.Module):
    """Fully connected autoencoder with a narrow bottleneck.

    The encoder maps ``input_dim -> 2000 -> 1000 -> 100 -> latent_dim`` with
    ReLU between layers; the decoder mirrors it back to ``input_dim`` and ends
    in ``Tanh``, so reconstructions are bounded to (-1, 1).

    Args:
        input_dim: width of each input row. Defaults to ``90 * 90`` (8100).
        latent_dim: width of the bottleneck code returned by
            :meth:`get_features`. Defaults to 11.
    """

    def __init__(self, input_dim=90 * 90, latent_dim=11):
        super(autoencoder, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        # Kept for backward compatibility; nothing inside this class uses it.
        self.weights = []
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 2000),
            nn.ReLU(True),
            nn.Linear(2000, 1000),
            nn.ReLU(True),
            nn.Linear(1000, 100),
            nn.ReLU(True),
            nn.Linear(100, latent_dim))
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 100),
            nn.ReLU(True),
            nn.Linear(100, 1000),
            nn.ReLU(True),
            nn.Linear(1000, 2000),
            nn.ReLU(True),
            nn.Linear(2000, input_dim),
            nn.Tanh())

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back; returns the reconstruction."""
        x = self.encoder(x)
        x = self.decoder(x)
        return x

    def get_features(self, x):
        """Return only the bottleneck code for ``x`` (shape ``(batch, latent_dim)``)."""
        return self.encoder(x)

In [9]:
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# crashing on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay = 1e-5)
# Densify once. The guard keeps this cell re-runnable: after the first run X is
# already dense and no longer has a todense() method.
if hasattr(X, 'todense'):
    X = X.todense() # sparse matrix -> dense numpy.matrix (iterating it yields 2-D, single-row matrices)

In [10]:
def _row_to_batch(row, device):
    """Convert one row of X into a (1, n_features) float32 tensor on `device`."""
    batch = np.asarray(row, dtype=np.float32).reshape(1, -1)
    return torch.from_numpy(batch).to(device)


# Follow whatever device the model lives on rather than hard-coding .cuda().
model_device = next(model.parameters()).device

# Train one document per optimizer step.
for epoch in range(num_epochs):
    for d in X:
        d = _row_to_batch(d, model_device)
        # forward pass
        output = model(d)
        loss = criterion(output, d)
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last document seen in this epoch.
    print('Epoch [{}/{}], Loss:{:.4f}'
          .format(epoch + 1, num_epochs, loss.item()))

# Encode every document with the *final* model. Doing this after training
# (instead of inside the last epoch) means all rows are encoded by the same
# weights, and it no longer depends on num_epochs being exactly 10.
weights = []
with torch.no_grad():
    for d in X:
        code = model.get_features(_row_to_batch(d, model_device))
        weights.append(code.cpu().numpy())  # one (1, latent) array per document

Epoch [1/10], Loss:0.0001
Epoch [2/10], Loss:0.0001
Epoch [3/10], Loss:0.0001
Epoch [4/10], Loss:0.0001
Epoch [5/10], Loss:0.0001
Epoch [6/10], Loss:0.0001
Epoch [7/10], Loss:0.0001
Epoch [8/10], Loss:0.0001
Epoch [9/10], Loss:0.0001
Epoch [10/10], Loss:0.0001


In [11]:
# Sanity check: width of the first encoded row (the encoder's last layer emits 11 values).
len(weights[0][0])

11

In [12]:
# write features to file: one line per document, each value followed by a space
# The with-block guarantees the file is flushed and closed, and the loops follow
# the data itself rather than hard-coded row/column counts.
with open('features.category', 'w') as wt_file:
    for row in weights:
        # Each entry is indexed as [0][j], i.e. a single-row 2-D array.
        for wt in row[0]:
            wt_file.write('{wt} '.format(wt = wt))
        wt_file.write('\n')