In [1]:
# Environment switch: when True the data cell mounts Google Drive and reads the
# full CSV from the shared drive; when False it reads a 100-row sample of the local CSV.
COLAB = True

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import spacy
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from collections import Counter
from google.colab import drive
import os

In [3]:
# Shared settings used by the cells below.
RANDOM_STATE = 30255  # seed passed to DataFrame.sample and train_test_split
BATCH_SIZE = 32  # batch size of both DataLoaders
MAX_LENGTH = 512  # default number of token ids kept per document by encode_sentence
NUM_EPOCHS = 50  # default number of epochs for train_model
LEARNING_RATE = 0.001  # default Adam learning rate for train_model

In [4]:
# df = pd.read_csv('../data/preprocessed_data.csv')
# df = df[['CLASS', 'SPACY_PREPROCESSED']]
# df = df.dropna()
# # df['PREPROCESSED'] = df['PREPROCESSED'].str.replace(r'<[^<>]*>', '', regex=True) # drop HTML tags


# from sklearn import preprocessing

# le = preprocessing.LabelEncoder()
# le.fit(df['CLASS'])
# df['LABEL'] = le.transform(df['CLASS'])

# display(df['CLASS'].value_counts())

# # shuffle
# df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index()
# df = df[['LABEL', 'SPACY_PREPROCESSED']]

# display(df['LABEL'].value_counts(dropna=False))

# display(df.head())

In [5]:
# Load the preprocessed dataset: the full CSV from the mounted shared drive on
# Colab, otherwise a 100-row sample of the local copy.
if COLAB:
  drive.mount('/content/gdrive')
  PATH = "gdrive/Shareddrives/Adv ML Project/Data/"
  # Pass the directory and file name as separate arguments so os.path.join
  # actually inserts the separator when PATH has no trailing slash.
  df = pd.read_csv(os.path.join(PATH, "preprocessed_data.csv"))

else:
  # PATH is read again by the GloVe cell further down, so it has to exist in
  # this branch as well (previously it was only defined on Colab).
  # NOTE(review): assumes the GloVe file lives next to the local CSV -- confirm.
  PATH = "../data/"
  df = pd.read_csv(os.path.join(PATH, "preprocessed_data.csv"))
  df = df.sample(n=100, random_state=RANDOM_STATE).reset_index()


# Encode the CLASS strings as integer ids in a new LABEL column.
le = preprocessing.LabelEncoder()
df['LABEL'] = le.fit_transform(df['CLASS'])

df.head()

Mounted at /content/gdrive


Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS,BERT_TOKENIZED,SPACY_PREPROCESSED,LABEL
0,The United States Department of Energy Vehicle...,"['33 Advanced Propulsion Systems', '36 Materia...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2142, 2...",united states department energy vehicle techno...,0
1,Solar reflective “cool pavements” have been pr...,"['32 Energy Conservation, Consumption, And Uti...","32 Energy Conservation, Consumption, And Utili...","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 5943, 21346, 1...",solar reflective cool pavement propose potenti...,0
2,Inconel 718 alloy is used extensively in aerog...,"['36 Materials Science', '33 Advanced Propulsi...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 4297, 5643, 2...",inconel alloy extensively aerogas turbine allo...,0
3,The Production Tax Credit (PTC) and the Invest...,"['29 Energy Planning, Policy, And Economy', 'P...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc investment tax credi...,0
4,The production tax credit (PTC) promotes wind ...,"['29 Energy Planning, Policy, And Economy', '1...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc promote wind energy ...,0


In [6]:
# Load spaCy's small English pipeline; the cells below only call its tokenizer (see tokenize).
tok = spacy.load('en_core_web_sm')

In [7]:
def tokenize(text):
    """Run the spaCy tokenizer held in ``tok`` over ``text`` and return the token strings as a list."""
    doc = tok.tokenizer(text)
    return [piece.text for piece in doc]

In [8]:
# Tally how many times each token appears across all preprocessed documents.
counts = Counter()
for document in df['SPACY_PREPROCESSED']:
    counts.update(tokenize(document))

In [9]:
# Remove every word seen fewer than 2 times, reporting the vocabulary size before and after.
print("num_words before:", len(counts))
rare_words = [word for word, freq in counts.items() if freq < 2]
for word in rare_words:
    del counts[word]
print("num_words after:", len(counts))

num_words before: 30719
num_words after: 17571


In [10]:
# Build the vocabulary: slot 0 is the empty string, slot 1 is "UNK", and the
# surviving words follow in the iteration order of `counts`.
words = ["", "UNK"]
vocab2index = {"":0, "UNK":1}
for position, word in enumerate(counts, start=len(words)):
    vocab2index[word] = position
    words.append(word)

In [11]:
# Turn raw text into a fixed-size array of vocabulary ids.
def encode_sentence(text, vocab2index, N=MAX_LENGTH):
    """Encode ``text`` as ``N`` vocabulary ids, zero-padded or truncated.

    Tokens missing from ``vocab2index`` are mapped to the id of ``"UNK"``.

    Returns:
        (ids, length): ``ids`` is an integer NumPy array of size ``N``;
        ``length`` is the number of leading positions filled from the text
        (never more than ``N``).
    """
    token_ids = np.array(
        [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    )
    n_kept = min(N, len(token_ids))
    padded = np.zeros(N, dtype=int)
    padded[:n_kept] = token_ids[:n_kept]
    return padded, n_kept

In [12]:
# Store, per document, the (ids, length) pair from encode_sentence wrapped in an
# object array; ResearchDataSet later reads element [0] as the ids and [1] as the length.
df['encoded'] = df['SPACY_PREPROCESSED'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))
df.head()

Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS,BERT_TOKENIZED,SPACY_PREPROCESSED,LABEL,encoded
0,The United States Department of Energy Vehicle...,"['33 Advanced Propulsion Systems', '36 Materia...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2142, 2...",united states department energy vehicle techno...,0,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,Solar reflective “cool pavements” have been pr...,"['32 Energy Conservation, Consumption, And Uti...","32 Energy Conservation, Consumption, And Utili...","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 5943, 21346, 1...",solar reflective cool pavement propose potenti...,0,"[[87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, ..."
2,Inconel 718 alloy is used extensively in aerog...,"['36 Materials Science', '33 Advanced Propulsi...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 4297, 5643, 2...",inconel alloy extensively aerogas turbine allo...,0,"[[179, 180, 181, 1, 182, 180, 183, 184, 185, 1..."
3,The Production Tax Credit (PTC) and the Invest...,"['29 Energy Planning, Policy, And Economy', 'P...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc investment tax credi...,0,"[[56, 248, 249, 250, 251, 248, 249, 252, 253, ..."
4,The production tax credit (PTC) promotes wind ...,"['29 Energy Planning, Policy, And Economy', '1...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc promote wind energy ...,0,"[[56, 248, 249, 250, 267, 256, 5, 268, 101, 26..."


In [13]:
# Count rows per class to check how balanced the dataset is.
Counter(df['CLASS'])

Counter({'Energy Storage, Conversion, and Utilization': 1223,
         'Environmental Sciences': 1223,
         'Fission and Nuclear Technologies': 1223,
         'Fossil Fuels': 1223,
         'Renewable Energy Sources': 1223})

In [14]:
# Split the encoded documents and their labels into training and validation sets.
# NOTE(review): train_size (0.7) + test_size (0.15) cover only 85% of the rows;
# the remaining 15% are not assigned to any variable here -- confirm this is intended.
X = list(df['encoded'])
y = list(df['LABEL'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      train_size=0.7, test_size=0.15, 
                                                      random_state=RANDOM_STATE,
                                                      shuffle=True)

In [15]:
class ResearchDataSet(Dataset):
    """Dataset over pre-encoded documents and their labels.

    Every element of ``X`` is indexed as ``item[0]`` (a NumPy array of token
    ids) and ``item[1]`` (the length stored next to it); ``Y`` holds the
    label for the same position.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(ids as an int32 tensor, label, stored length)`` for sample ``idx``."""
        sample = self.X[idx]
        ids = torch.from_numpy(sample[0].astype(np.int32))
        return ids, self.y[idx], sample[1]

In [16]:
# Wrap the two splits so they can be fed to DataLoaders.
train_ds = ResearchDataSet(X_train, y_train)
valid_ds = ResearchDataSet(X_valid, y_valid)

In [17]:
def train_model(model, epochs=NUM_EPOCHS, lr=LEARNING_RATE, train_loader=None, valid_loader=None):
    """Train ``model`` with Adam and cross-entropy, printing metrics after every epoch.

    Args:
        model: module called as ``model(x, l)`` that returns class logits.
        epochs: number of passes over the training loader.
        lr: Adam learning rate.
        train_loader: iterable of ``(x, y, l)`` batches used for training.
            Defaults to the notebook-level ``train_dl``.
        valid_loader: iterable of ``(x, y, l)`` batches passed to
            ``validation_metrics``. Defaults to the notebook-level ``val_dl``.

    Returns:
        None. The sample-weighted mean training loss, validation loss and
        validation accuracy are printed once per epoch.
    """
    # Fall back to the notebook globals so existing two/three-argument calls keep working.
    if train_loader is None:
        train_loader = train_dl
    if valid_loader is None:
        valid_loader = val_dl

    # Only optimise parameters that are not frozen (e.g. pretrained embeddings).
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)

    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_loader:
            x = x.long()
            y = y.long()
            optimizer.zero_grad()
            y_pred = model(x, l)
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            # Weight each batch loss by its size so the epoch average is per sample.
            batch_size = y.shape[0]
            sum_loss += loss.item() * batch_size
            total += batch_size
        val_loss, val_acc = validation_metrics(model, valid_loader)
        print("epoch: %d\ntrain loss %.3f, val loss %.3f, val accuracy %.3f" % (epoch, sum_loss/total, val_loss, val_acc))

def validation_metrics (model, valid_dl):
    """Compute the mean cross-entropy loss and accuracy of ``model`` over ``valid_dl``.

    The model is switched to eval mode and left in it; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: module called as ``model(x, l)`` that returns class logits.
        valid_dl: iterable of ``(x, y, l)`` batches.

    Returns:
        (loss, accuracy): ``loss`` is a Python float (sample-weighted mean);
        ``accuracy`` is a 0-dim tensor holding the fraction of correct predictions.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every validation batch.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = y_hat.argmax(dim=1)
            correct += (pred == y).float().sum()
            batch_size = y.shape[0]
            total += batch_size
            sum_loss += loss.item() * batch_size
    return sum_loss/total, correct/total

In [18]:
# Vocabulary size (the "" and "UNK" entries included) and the batch loaders
# that train_model uses; only the training loader is shuffled.
VOCAB_SIZE = len(words)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE)

LSTM with Fixed-Length Input

In [19]:
DROPOUT = 0.1

class LSTM_fixed_len(torch.nn.Module):
    """LSTM classifier that runs over the full padded input.

    The lengths argument of ``forward`` is accepted for interface parity with
    the other models in this notebook but is not used: the final hidden state
    is taken after the last position of the padded sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size.
        num_classes: number of output logits (defaults to 5).
        dropout: dropout probability applied to the embeddings. The default is
            captured when this cell runs, so a later cell that reassigns the
            global ``DROPOUT`` no longer changes this model.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=5, dropout=DROPOUT):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, l):
        """Return class logits for the batch of token ids ``x``; ``l`` is ignored."""
        x = self.embeddings(x)
        x = self.dropout(x)
        _, (ht, _) = self.lstm(x)
        # ht[-1] is the final hidden state of the last LSTM layer.
        return self.linear(ht[-1])

In [20]:
# Layer sizes for the fixed-length model; its instantiation is currently commented out.
EMBEDDING_DIM = 300
HIDDEN_DIM = 512
# model_fixed =  LSTM_fixed_len(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM)

In [21]:
# train_model(model_fixed, epochs=NUM_EPOCHS, lr=LEARNING_RATE)

LSTM with Variable-Length Input

In [22]:
DROPOUT = 0.3

class LSTM_variable_input(torch.nn.Module):
    """LSTM classifier that packs each sequence to its given length.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size.
        num_classes: number of output logits (defaults to 5).
        dropout: dropout probability applied to the embeddings. The default is
            captured when this cell runs, so a later cell that reassigns the
            global ``DROPOUT`` no longer changes this model.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=5, dropout=DROPOUT):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, s):
        """Return class logits for token ids ``x`` given per-sample lengths ``s``."""
        x = self.embeddings(x)
        x = self.dropout(x)
        # Packing makes the LSTM stop at each sample's own length, so the
        # final hidden state is not computed over the padded tail.
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        _, (ht, _) = self.lstm(x_pack)
        return self.linear(ht[-1])

In [23]:
# Layer sizes and instance of the variable-length model trained in the next cell.
EMBEDDING_DIM = 300
HIDDEN_DIM = 128
model_variable =  LSTM_variable_input(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM)

# Settings the author recorded as having worked (they differ from the values
# set above, e.g. HIDDEN_DIM and LEARNING_RATE):
# worked:
# EMBEDDING_DIM = 300
# HIDDEN_DIM = 256
# RANDOM_STATE = 30255
# BATCH_SIZE = 32
# MAX_LENGTH = 512
# NUM_EPOCHS = 50
# LEARNING_RATE = 0.01
# DROPOUT = 0.1

In [24]:
# Train the variable-length model. In the logged run below the training loss keeps
# falling while the validation loss rises after roughly epoch 5.
train_model(model_variable, epochs=NUM_EPOCHS, lr=LEARNING_RATE)

epoch: 0
train loss 1.517, val loss 1.389, val accuracy 0.422
epoch: 1
train loss 1.206, val loss 1.130, val accuracy 0.595
epoch: 2
train loss 0.885, val loss 1.022, val accuracy 0.638
epoch: 3
train loss 0.713, val loss 1.046, val accuracy 0.711
epoch: 4
train loss 0.665, val loss 1.548, val accuracy 0.605
epoch: 5
train loss 0.590, val loss 0.960, val accuracy 0.719
epoch: 6
train loss 0.406, val loss 1.039, val accuracy 0.739
epoch: 7
train loss 0.351, val loss 1.009, val accuracy 0.747
epoch: 8
train loss 0.281, val loss 1.092, val accuracy 0.751
epoch: 9
train loss 0.247, val loss 1.123, val accuracy 0.754
epoch: 10
train loss 0.191, val loss 1.214, val accuracy 0.744
epoch: 11
train loss 0.204, val loss 1.217, val accuracy 0.753
epoch: 12
train loss 0.148, val loss 1.237, val accuracy 0.757
epoch: 13
train loss 0.147, val loss 1.605, val accuracy 0.736
epoch: 14
train loss 0.164, val loss 1.360, val accuracy 0.755
epoch: 15
train loss 0.129, val loss 1.451, val accuracy 0.739
ep

---------------

GloVe

In [49]:
# Path of the GloVe vectors file inside the data directory. Directory and file
# name are passed separately so os.path.join inserts the separator if PATH lacks one.
GLOVE_FILE = os.path.join(PATH, "glove.6B.300d.txt")

def load_glove_vectors(glove_file = GLOVE_FILE):
    """Load GloVe word vectors from a whitespace-separated text file.

    Every non-blank line is read as a word followed by its vector components.

    Args:
        glove_file: path of the GloVe text file.

    Returns:
        dict mapping each word to a 1-D float NumPy array.
    """
    word_vectors = {}
    # The GloVe text distribution is UTF-8; naming the encoding avoids decode
    # errors on platforms whose default encoding differs.
    with open(glove_file, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # A blank line (e.g. a trailing newline) has no word to index.
                continue
            word_vectors[parts[0]] = np.array([float(x) for x in parts[1:]])
    return word_vectors


def get_emb_matrix(pretrained, word_counts, emb_size):
    """Create an embedding matrix and vocabulary from pretrained word vectors.

    Row 0 is an all-zero vector for the empty-string (padding) entry, row 1 a
    random vector for "UNK"; the words of ``word_counts`` follow in iteration
    order starting at row 2.

    Args:
        pretrained: mapping from word to a vector of length ``emb_size``.
        word_counts: iterable of vocabulary words supporting ``len`` (e.g. a Counter).
        emb_size: dimensionality of the vectors.

    Returns:
        (W, vocab, vocab_to_idx): ``W`` is a float32 array of shape
        ``(len(word_counts) + 2, emb_size)``, ``vocab`` is a NumPy array of the
        words in row order, and ``vocab_to_idx`` maps each word (including ""
        and "UNK") to its row.
    """
    vocab_size = len(word_counts) + 2
    # Keep the same reserved entries as the notebook's original vocab2index.
    vocab_to_idx = {"": 0, "UNK": 1}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")  # row 0 stays zero for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # vector for unknown words
    for i, word in enumerate(word_counts, start=2):
        # Look the word up in the `pretrained` argument, not in a notebook global.
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            # No pretrained vector: fall back to a small random initialisation.
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

In [50]:
# Read the GloVe vectors and build the embedding matrix for the pruned vocabulary.
# Note that this rebinds `vocab2index`, which an earlier cell also defines.
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts, EMBEDDING_DIM)

In [51]:
DROPOUT = 0.1

class LSTM_glove_vecs(torch.nn.Module):
    """LSTM classifier on top of frozen pretrained embeddings.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size.
        glove_weights: NumPy array of shape ``(vocab_size, embedding_dim)``
            copied into the embedding table, which is then frozen.
        num_classes: number of output logits (defaults to 5).
        dropout: dropout probability applied to the embeddings. The default is
            captured when this cell runs, so a later cell that reassigns the
            global ``DROPOUT`` no longer changes this model.
        pack_sequences: when True, ``forward`` packs each sequence to the
            length it is given so the final hidden state is not computed over
            the padded tail. The default (False) keeps the original behaviour
            of running over the full padded input.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights,
                 num_classes=5, dropout=DROPOUT, pack_sequences=False):
        super().__init__()
        self.pack_sequences = pack_sequences
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Overwrite the random initialisation with the pretrained vectors.
        with torch.no_grad():
            self.embeddings.weight.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False  # freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, l):
        """Return class logits for token ids ``x``; ``l`` is used only when packing is enabled."""
        x = self.embeddings(x)
        x = self.dropout(x)
        if self.pack_sequences:
            x = pack_padded_sequence(x, l, batch_first=True, enforce_sorted=False)
        _, (ht, _) = self.lstm(x)
        # ht[-1] is the final hidden state of the last LSTM layer.
        return self.linear(ht[-1])

In [53]:
# Instantiate the GloVe model; HIDDEN_DIM here is whatever value the most recently run cell assigned.
model_glove = LSTM_glove_vecs(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, pretrained_weights)

In [None]:
# Train the GloVe model. In the logged run below validation accuracy stays around 0.2.
# NOTE(review): LSTM_glove_vecs.forward does not use the sequence lengths, so the
# final hidden state is taken after the padded positions -- possibly the cause; confirm.
train_model(model_glove, epochs=NUM_EPOCHS, lr=LEARNING_RATE)

epoch: 0
train loss 1.695, val loss 1.657, val accuracy 0.216
epoch: 1
train loss 1.703, val loss 1.688, val accuracy 0.191
epoch: 2
train loss 1.693, val loss 1.683, val accuracy 0.216
epoch: 3
train loss 1.691, val loss 1.706, val accuracy 0.208
epoch: 4
train loss 1.687, val loss 1.715, val accuracy 0.198


In [39]:
# Sanity check: number of entries in the GloVe vocabulary array (compare with VOCAB_SIZE).
len(vocab)

17573

In [40]:
# Sanity check: embedding-table size used by the models (compare with len(vocab)).
VOCAB_SIZE

17573