In [131]:
# Execution-environment switch read by the data-loading cell: True mounts Google Drive
# and reads the full dataset; False reads a local CSV and down-samples it to 100 rows.
COLAB = True

In [132]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import spacy
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from collections import Counter
from google.colab import drive
import os

In [133]:
# --- Experiment configuration ---------------------------------------------
RANDOM_STATE = 30255     # seed for pandas sampling, the train/validation split and the RNGs below
BATCH_SIZE = 32          # mini-batch size used by both DataLoaders
MAX_LENGTH = 512         # default number of token ids per encoded document (truncate / zero-pad)
NUM_EPOCHS = 30          # default number of training epochs
LEARNING_RATE = 0.001    # default Adam learning rate

# Seed the global NumPy and PyTorch generators so that weight initialisation,
# randomly initialised embedding rows and DataLoader shuffling are repeatable
# after a kernel restart.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

In [134]:
# df = pd.read_csv('../data/preprocessed_data.csv')
# df = df[['CLASS', 'SPACY_PREPROCESSED']]
# df = df.dropna()
# # df['PREPROCESSED'] = df['PREPROCESSED'].str.replace(r'<[^<>]*>', '', regex=True) # drop HTML tags


# from sklearn import preprocessing

# le = preprocessing.LabelEncoder()
# le.fit(df['CLASS'])
# df['LABEL'] = le.transform(df['CLASS'])

# display(df['CLASS'].value_counts())

# # shuffle
# df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index()
# df = df[['LABEL', 'SPACY_PREPROCESSED']]

# display(df['LABEL'].value_counts(dropna=False))

# display(df.head())

In [135]:
# Resolve the data directory for the current environment. PATH is defined in
# BOTH branches because later cells (the GloVe file location) build on it.
if COLAB:
  drive.mount('/content/gdrive')
  PATH = "gdrive/Shareddrives/Adv ML Project/Data/"
  df = pd.read_csv(os.path.join(PATH, "preprocessed_data.csv"))

else:
  PATH = "../data/"
  df = pd.read_csv(os.path.join(PATH, "preprocessed_data.csv"))
  # Small reproducible subsample for quick local iteration.
  df = df.sample(n=100, random_state=RANDOM_STATE).reset_index()


# Encode each class name as an integer id stored in the LABEL column.
le = preprocessing.LabelEncoder()
df['LABEL'] = le.fit_transform(df['CLASS'])

df.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS,BERT_TOKENIZED,SPACY_PREPROCESSED,LABEL
0,The United States Department of Energy Vehicle...,"['33 Advanced Propulsion Systems', '36 Materia...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2142, 2...",united states department energy vehicle techno...,0
1,Solar reflective “cool pavements” have been pr...,"['32 Energy Conservation, Consumption, And Uti...","32 Energy Conservation, Consumption, And Utili...","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 5943, 21346, 1...",solar reflective cool pavement propose potenti...,0
2,Inconel 718 alloy is used extensively in aerog...,"['36 Materials Science', '33 Advanced Propulsi...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 4297, 5643, 2...",inconel alloy extensively aerogas turbine allo...,0
3,The Production Tax Credit (PTC) and the Invest...,"['29 Energy Planning, Policy, And Economy', 'P...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc investment tax credi...,0
4,The production tax credit (PTC) promotes wind ...,"['29 Energy Planning, Policy, And Economy', '1...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc promote wind energy ...,0


In [136]:
# Show the class-name -> integer-label mapping (first row of every distinct pair).
display(df.drop_duplicates(subset=['CLASS', 'LABEL'])[['CLASS', 'LABEL']])

Unnamed: 0,CLASS,LABEL
0,"Energy Storage, Conversion, and Utilization",0
1223,Environmental Sciences,1
2446,Fission and Nuclear Technologies,2
3669,Fossil Fuels,3
4892,Renewable Energy Sources,4


In [137]:
# Load spaCy's small English pipeline; `tokenize` below calls its `tok.tokenizer` component.
tok = spacy.load('en_core_web_sm')

In [138]:
def tokenize (text):
    """Split ``text`` into a list of token strings using the spaCy tokenizer.

    Non-string input (``None`` or the float ``NaN`` that pandas uses for a
    missing cell) is treated as an empty document and yields ``[]`` instead
    of raising inside spaCy.
    """
    if not isinstance(text, str):
        return []
    return [token.text for token in tok.tokenizer(text)]

In [139]:
# Count the number of occurrences of each token across the whole corpus.
counts = Counter(
    token
    for text in df['SPACY_PREPROCESSED']
    for token in tokenize(text)
)

In [140]:
# Remove tokens seen fewer than two times, reporting the vocabulary size before and after.
print("num_words before:", len(counts))
rare_words = [word for word, freq in counts.items() if freq < 2]
for word in rare_words:
    del counts[word]
print("num_words after:", len(counts))

num_words before: 30719
num_words after: 17571


In [141]:
# Build the vocabulary: position 0 is the padding token "" and position 1 is "UNK";
# every surviving token follows in the iteration order of `counts`.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: position for position, word in enumerate(words)}

In [142]:
# encode from scratch
def encode_sentence(text, vocab2index, N=MAX_LENGTH):
    """Turn ``text`` into a fixed-size array of vocabulary ids.

    Tokens missing from ``vocab2index`` are mapped to the "UNK" id. The id
    sequence is cut to at most ``N`` entries and right-padded with zeros.

    Returns:
        (encoded, length): an integer array of shape ``(N,)`` and the number
        of leading positions that hold real tokens, ``min(N, token count)``.
    """
    token_ids = np.array([vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)])
    kept = token_ids[:N]
    padded = np.zeros(N, dtype=int)
    padded[:len(kept)] = kept
    return padded, len(kept)

In [143]:
# Encode every document. Each cell of the new column holds a 2-element object
# array built from encode_sentence's return value: [padded id array, length].
df['encoded'] = df['SPACY_PREPROCESSED'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))
df.head()

Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS,BERT_TOKENIZED,SPACY_PREPROCESSED,LABEL,encoded
0,The United States Department of Energy Vehicle...,"['33 Advanced Propulsion Systems', '36 Materia...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2142, 2...",united states department energy vehicle techno...,0,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,Solar reflective “cool pavements” have been pr...,"['32 Energy Conservation, Consumption, And Uti...","32 Energy Conservation, Consumption, And Utili...","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 5943, 21346, 1...",solar reflective cool pavement propose potenti...,0,"[[87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, ..."
2,Inconel 718 alloy is used extensively in aerog...,"['36 Materials Science', '33 Advanced Propulsi...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 4297, 5643, 2...",inconel alloy extensively aerogas turbine allo...,0,"[[179, 180, 181, 1, 182, 180, 183, 184, 185, 1..."
3,The Production Tax Credit (PTC) and the Invest...,"['29 Energy Planning, Policy, And Economy', 'P...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc investment tax credi...,0,"[[56, 248, 249, 250, 251, 248, 249, 252, 253, ..."
4,The production tax credit (PTC) promotes wind ...,"['29 Energy Planning, Policy, And Economy', '1...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc promote wind energy ...,0,"[[56, 248, 249, 250, 267, 256, 5, 268, 101, 26..."


In [144]:
# Class-balance check: number of documents per class name.
Counter(df['CLASS'])

Counter({'Energy Storage, Conversion, and Utilization': 1223,
         'Environmental Sciences': 1223,
         'Fission and Nuclear Technologies': 1223,
         'Fossil Fuels': 1223,
         'Renewable Energy Sources': 1223})

In [145]:
# Split the encoded documents and integer labels into training and validation sets.
# NOTE(review): train_size=0.7 and test_size=0.15 sum to 0.85, so the remaining
# 15% of rows end up in neither split -- confirm this hold-out is intended.
X = list(df['encoded'])
y = list(df['LABEL'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      train_size=0.7, test_size=0.15, 
                                                      random_state=RANDOM_STATE,
                                                      shuffle=True)

In [146]:
class ResearchDataSet(Dataset):
    """Map-style dataset over pre-encoded documents.

    ``X[i]`` is an indexable pair ``(padded_ids, length)`` and ``Y[i]`` is the
    label of the same document.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token ids as an int32 tensor, label, length)`` for sample ``idx``."""
        sample = self.X[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32))
        return token_ids, self.y[idx], sample[1]

In [147]:
# Wrap the two splits in Dataset objects so they can be batched by a DataLoader.
train_ds = ResearchDataSet(X_train, y_train)
valid_ds = ResearchDataSet(X_valid, y_valid)

In [148]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def train_model(model, epochs=NUM_EPOCHS, lr=LEARNING_RATE):
    """Train ``model`` with Adam and report validation metrics after every epoch.

    Training batches are read from the module-level ``train_dl``; after each
    epoch the model is scored on the module-level ``val_dl`` through
    ``validation_metrics``. Only parameters with ``requires_grad=True`` are
    optimised, so frozen embeddings are left untouched.

    Args:
        model: module called as ``model(x, lengths)`` that returns class logits.
        epochs: number of passes over ``train_dl``.
        lr: Adam learning rate.

    Returns:
        dict mapping ``"train_loss"``, ``"val_loss"``, ``"val_acc"``,
        ``"precision"``, ``"recall"``, ``"f1"`` and ``"confusion_matrix"`` to
        lists holding one entry per epoch. (End the call with ``;`` in a
        notebook cell if the echo of this dict is not wanted.)
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)

    history = {
        "train_loss": [],
        "val_loss": [],
        "val_acc": [],
        "precision": [],
        "recall": [],
        "f1": [],
        "confusion_matrix": [],
    }

    # train over epochs
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            # weight the batch-mean loss by batch size to get a per-sample average later
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        train_loss = sum_loss/total if total else float("nan")

        # Local names deliberately differ from the sklearn metric functions
        # so that nothing in this scope shadows them.
        val_loss, val_acc, precision, recall, f1, c_mat = validation_metrics(model, val_dl)
        val_acc = float(val_acc)

        # update history
        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)
        history["precision"].append(precision)
        history["recall"].append(recall)
        history["f1"].append(f1)
        history["confusion_matrix"].append(c_mat)

        # print results
        print(
            "epoch: %.0f\ntrain loss %.3f, val loss %.3f, val accuracy %.3f, precision %.3f, recall %.3f, f1 %.3f"
            % (epoch, train_loss, val_loss, val_acc, precision, recall, f1)
        )
        # Compare against the `epochs` argument (not the global NUM_EPOCHS) so the
        # matrix is shown after the final epoch of *this* run.
        if epoch == epochs - 1:
            print("\nconfusion matrix:\n", np.round(c_mat, decimals=1))

    return history

def validation_metrics (model, valid_dl):
    """Evaluate ``model`` on every batch of ``valid_dl``.

    Args:
        model: module called as ``model(x, lengths)`` that returns class logits.
        valid_dl: iterable yielding ``(x, y, lengths)`` batches.

    Returns:
        Tuple ``(loss, accuracy, precision, recall, f1, cm)`` where ``loss`` is
        the per-sample mean cross-entropy, precision/recall/F1 are
        support-weighted averages, and ``cm`` is the confusion matrix with each
        row (true class) expressed as percentages of that row's total.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    true_labels = []
    predicted_labels = []
    # No gradients are needed for evaluation; skipping the graph saves memory and time.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            true_labels.extend(y.tolist())
            predicted_labels.extend(pred.tolist())
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    cm = confusion_matrix(true_labels, predicted_labels)
    # keepdims=True keeps the totals as a column vector so every ROW is divided by
    # its own total; a flat (n,) vector would broadcast across columns instead.
    row_totals = cm.sum(axis=1, keepdims=True)
    # A label that was predicted but never occurs as a true label has an empty row;
    # dividing by 1 leaves that row at 0% rather than producing NaN.
    cm = (cm / np.maximum(row_totals, 1)) * 100
    return sum_loss/total, correct/total, precision, recall, f1, cm

In [149]:
# Vocabulary size (including the padding and "UNK" entries) sizes the embedding tables.
VOCAB_SIZE = len(words)
# Only the training loader shuffles; train_model reads both loaders as module-level names.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE)

LSTM with Fixed-Length Input

In [150]:
DROPOUT = 0.1

class LSTM_fixed_len(torch.nn.Module) :
    """LSTM classifier that reads the full padded sequence.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        num_classes: number of output logits (defaults to the 5 dataset classes).
        dropout: dropout probability applied to the embeddings; ``None`` uses
            the module-level ``DROPOUT`` value current at construction time.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=5, dropout=None) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(DROPOUT if dropout is None else dropout)
        
    def forward(self, x, l):
        """Return logits of shape ``(batch, num_classes)``.

        ``l`` (the true sequence lengths) is accepted for interface parity with
        the other models but is not used: the LSTM runs over every position,
        padding included, and the final hidden state feeds the classifier.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [151]:
# Hyper-parameters for the fixed-length baseline. The instantiation below is disabled,
# and both names are reassigned in the variable-length section further down.
EMBEDDING_DIM = 300
HIDDEN_DIM = 512
# model_fixed =  LSTM_fixed_len(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM)

In [152]:
# train_model(model_fixed, epochs=NUM_EPOCHS, lr=LEARNING_RATE)

LSTM with Variable-Length Input

In [153]:
DROPOUT = 0.3

class LSTM_variable_input(torch.nn.Module) :
    """LSTM classifier that packs each sequence to its true length.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        num_classes: number of output logits (defaults to the 5 dataset classes).
        dropout: dropout probability applied to the embeddings; ``None`` uses
            the module-level ``DROPOUT`` value current at construction time.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=5, dropout=None) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(DROPOUT if dropout is None else dropout)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x, s):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            x: integer token ids of shape ``(batch, seq_len)``.
            s: per-sample count of real tokens (tensor or sequence of ints).
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        # pack_padded_sequence needs CPU lengths that are all >= 1. An empty
        # document (length 0) is therefore read as a single padding step, and
        # lengths are capped at the padded width.
        lengths = torch.as_tensor(s, dtype=torch.int64).cpu().clamp(min=1, max=x.size(1))
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        # ht[-1] is the hidden state after the last *real* token of every sequence.
        out = self.linear(ht[-1])
        return out

In [154]:
# Hyper-parameters and instance for the variable-length model. These assignments
# replace the EMBEDDING_DIM / HIDDEN_DIM values set in the fixed-length section.
EMBEDDING_DIM = 300
HIDDEN_DIM = 128
model_variable =  LSTM_variable_input(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM)

# worked:
# EMBEDDING_DIM = 300
# HIDDEN_DIM = 256
# RANDOM_STATE = 30255
# BATCH_SIZE = 32
# MAX_LENGTH = 512
# NUM_EPOCHS = 50
# LEARNING_RATE = 0.01
# DROPOUT = 0.1

In [155]:
# Train the variable-length LSTM; train_model prints the validation metrics after each epoch.
train_model(model_variable, epochs=NUM_EPOCHS, lr=LEARNING_RATE)

epoch: 0
train loss 1.523, val loss 1.395, val accuracy 0.434, precision 0.431, recall 0.434, f1 0.429
epoch: 1
train loss 1.229, val loss 1.108, val accuracy 0.593, precision 0.592, recall 0.593, f1 0.580
epoch: 2
train loss 0.965, val loss 1.063, val accuracy 0.629, precision 0.649, recall 0.629, f1 0.611
epoch: 3
train loss 0.829, val loss 1.242, val accuracy 0.624, precision 0.669, recall 0.624, f1 0.618
epoch: 4
train loss 0.865, val loss 1.093, val accuracy 0.587, precision 0.612, recall 0.587, f1 0.581
epoch: 5
train loss 0.755, val loss 1.020, val accuracy 0.658, precision 0.664, recall 0.658, f1 0.655
epoch: 6
train loss 0.612, val loss 1.097, val accuracy 0.670, precision 0.680, recall 0.670, f1 0.665
epoch: 7
train loss 0.521, val loss 1.001, val accuracy 0.703, precision 0.705, recall 0.703, f1 0.701
epoch: 8
train loss 0.458, val loss 1.060, val accuracy 0.698, precision 0.699, recall 0.698, f1 0.696
epoch: 9
train loss 0.388, val loss 1.094, val accuracy 0.715, precision 

---------------

GloVe

In [49]:
# Location of the 300-dimensional GloVe 6B vectors inside the data directory.
# Passing the directory and file name separately lets os.path.join insert the
# separator, so PATH works with or without a trailing slash.
GLOVE_FILE = os.path.join(PATH, "glove.6B.300d.txt")

def load_glove_vectors(glove_file = None):
    """Load the glove word vectors

    Args:
        glove_file: path to a text file with one entry per line in the form
            ``word v1 v2 ... vn``. ``None`` (the default) resolves to the
            module-level ``GLOVE_FILE`` at call time, so re-running the
            configuration cell takes effect without redefining this function.

    Returns:
        dict mapping each word to a 1-D float NumPy array of its vector.
    """
    if glove_file is None:
        glove_file = GLOVE_FILE
    word_vectors = {}
    # GloVe files are UTF-8; be explicit instead of relying on the platform default.
    with open(glove_file, encoding="utf-8") as f:
        for line in f:
            split = line.split()
            if len(split) < 2:
                # blank or malformed line: nothing to store
                continue
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors


def get_emb_matrix(pretrained, word_counts, emb_size):
    """ Creates embedding matrix from word vectors

    Args:
        pretrained: mapping ``word -> vector`` of length ``emb_size``.
        word_counts: iterable of vocabulary words (e.g. a ``Counter``); its
            iteration order fixes the row order starting at index 2.
        emb_size: embedding dimensionality.

    Returns:
        (W, vocab, vocab_to_idx):
        ``W`` is a float32 array of shape ``(len(word_counts) + 2, emb_size)``.
        Row 0 is the all-zero padding vector, row 1 a random "UNK" vector, and
        each following row is the pretrained vector when available or a
        uniform(-0.25, 0.25) random vector otherwise.
        ``vocab`` is an array of the words in row order and ``vocab_to_idx``
        maps every word (including "" and "UNK") to its row.
    """
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {"": 0, "UNK": 1}
    vocab = ["", "UNK"]
    # np.zeros already leaves row 0 (padding) as the zero vector.
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words
    for i, word in enumerate(word_counts, start=2):
        # Look the word up in the `pretrained` argument rather than in a global.
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

In [50]:
# Read the GloVe vectors and build the embedding matrix for the pruned vocabulary.
# NOTE: this rebinds `vocab2index` to the mapping returned by get_emb_matrix.
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts, EMBEDDING_DIM)

In [51]:
DROPOUT = 0.1

class LSTM_glove_vecs(torch.nn.Module) :
    """LSTM classifier on top of frozen, pretrained word embeddings.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        glove_weights: array or tensor of shape ``(vocab_size, embedding_dim)``
            copied into the embedding table; any float dtype is accepted.
        num_classes: number of output logits (defaults to the 5 dataset classes).
        dropout: dropout probability applied to the embeddings; ``None`` uses
            the module-level ``DROPOUT`` value current at construction time.

    Raises:
        ValueError: if ``glove_weights`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights, num_classes=5, dropout=None) :
        super().__init__()
        weights = torch.as_tensor(glove_weights, dtype=torch.float32)
        if tuple(weights.shape) != (vocab_size, embedding_dim):
            raise ValueError(
                "glove_weights has shape %s, expected (%d, %d)"
                % (tuple(weights.shape), vocab_size, embedding_dim)
            )
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(weights)
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(DROPOUT if dropout is None else dropout)
        
    def forward(self, x, l):
        """Return logits of shape ``(batch, num_classes)``.

        NOTE(review): ``l`` (the true lengths) is not used, so the final hidden
        state is taken after every padded position has been processed. Consider
        packing the sequences as ``LSTM_variable_input`` does.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [53]:
# Instantiate the GloVe-backed model; EMBEDDING_DIM and HIDDEN_DIM carry the values
# most recently assigned in an earlier cell.
model_glove = LSTM_glove_vecs(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, pretrained_weights)

In [None]:
# Train the GloVe model; its embedding weights have requires_grad=False, and
# train_model only optimises parameters that require gradients.
train_model(model_glove, epochs=NUM_EPOCHS, lr=LEARNING_RATE)

epoch: 0
train loss 1.695, val loss 1.657, val accuracy 0.216
epoch: 1
train loss 1.703, val loss 1.688, val accuracy 0.191
epoch: 2
train loss 1.693, val loss 1.683, val accuracy 0.216
epoch: 3
train loss 1.691, val loss 1.706, val accuracy 0.208
epoch: 4
train loss 1.687, val loss 1.715, val accuracy 0.198


In [39]:
# Sanity check: number of entries in the GloVe vocabulary array returned by get_emb_matrix.
len(vocab)

17573

In [40]:
# Sanity check: VOCAB_SIZE is passed as the embedding row count when building
# model_glove, so it should equal len(vocab).
VOCAB_SIZE

17573