In [7]:
# Execution-environment switch read by the data-loading cell:
#   True  -> mount Google Drive and read the CSV from the shared drive
#   False -> read the local CSV and keep only a small random sample
COLAB = True

In [8]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import spacy
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from collections import Counter
from google.colab import drive
import os

In [9]:
RANDOM_STATE = 30255  # seed shared by the local sampling step and the train/validation split
NUM_EPOCHS = 7        # number of full passes over the training split

if COLAB:
  # Mount Google Drive and read the dataset from the shared drive.
  drive.mount('/content/gdrive')
  PATH = "gdrive/Shareddrives/Adv ML Project/Data/"
  # Directory and file name are passed as separate components so that
  # os.path.join supplies the separator instead of relying on PATH ending in '/'.
  df = pd.read_csv(os.path.join(PATH, "preprocessed_data.csv"))

else:
  # Non-Colab run: read the local copy and keep a reproducible 100-row sample.
  df = pd.read_csv('../data/preprocessed_data.csv')
  df = df.sample(n=100, random_state=RANDOM_STATE).reset_index()


# Encode the class names in CLASS as integer ids stored in LABEL.
le = preprocessing.LabelEncoder()
df['LABEL'] = le.fit_transform(df['CLASS'])

df.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS,BERT_TOKENIZED,SPACY_PREPROCESSED,LABEL
0,The United States Department of Energy Vehicle...,"['33 Advanced Propulsion Systems', '36 Materia...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2142, 2...",united states department energy vehicle techno...,0
1,Solar reflective “cool pavements” have been pr...,"['32 Energy Conservation, Consumption, And Uti...","32 Energy Conservation, Consumption, And Utili...","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 5943, 21346, 1...",solar reflective cool pavement propose potenti...,0
2,Inconel 718 alloy is used extensively in aerog...,"['36 Materials Science', '33 Advanced Propulsi...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 4297, 5643, 2...",inconel alloy extensively aerogas turbine allo...,0
3,The Production Tax Credit (PTC) and the Invest...,"['29 Energy Planning, Policy, And Economy', 'P...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc investment tax credi...,0
4,The production tax credit (PTC) promotes wind ...,"['29 Energy Planning, Policy, And Economy', '1...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc promote wind energy ...,0


In [10]:
# Map each integer label to its class name, ordered by label id.
# Sorting matters: other cells use list(CATEGORY_DICT.values()) positionally
# (e.g. as report target names), and without it the order would follow the
# first appearance of each class in df, which is arbitrary after sampling.
CATEGORY_DICT = (
    df[['LABEL', 'CLASS']]
    .drop_duplicates()
    .sort_values('LABEL')
    .set_index('LABEL')['CLASS']
    .to_dict()
)
list(CATEGORY_DICT.values())

['Energy Storage, Conversion, and Utilization',
 'Environmental Sciences',
 'Fission and Nuclear Technologies',
 'Fossil Fuels',
 'Renewable Energy Sources']

In [11]:

import torch.optim as optim

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device available for running: ", device, sep="\n")

Device available for running: 
cuda


In [12]:
# Whitespace-tokenise the preprocessed text into a list of tokens per row.
# Missing values (e.g. an empty string that was read back from CSV as NaN) are
# treated as empty text so they yield [] instead of raising AttributeError.
df['tokens'] = df['SPACY_PREPROCESSED'].fillna('').astype(str).str.split()

In [13]:
# Features are the token lists, targets are the integer-encoded classes.
X = df['tokens']
y = df['LABEL']

# 70% of the rows go to training and 15% to validation; with these sizes the
# remaining 15% of rows end up in neither split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.7,
    test_size=0.15,
    random_state=RANDOM_STATE,
    shuffle=True,
)

# reset_index() turns each Series into a DataFrame with a fresh 0..n-1 index
# (the previous index is kept in an 'index' column), so a feature row and its
# label share the same row number.
X_train, X_valid = X_train.reset_index(), X_valid.reset_index()
y_train, y_valid = y_train.reset_index(), y_valid.reset_index()

In [14]:
from gensim import corpora
# Build the gensim dictionary for the corpus, with or without a leading padding entry
def make_dict(df, padding=True):
    """Create a ``corpora.Dictionary`` from the token lists in ``df['tokens']``.

    Args:
        df: frame with a ``tokens`` column holding one list of tokens per row.
        padding: when truthy, the dictionary is first created from the single
            document ``['pad']`` and the corpus is added afterwards; otherwise
            it is built from the corpus alone.

    Returns:
        The populated ``corpora.Dictionary``.
    """
    if not padding:
        print("Dictionary without padding")
        return corpora.Dictionary(df['tokens'])

    print("Dictionary with padded token added")
    vocabulary = corpora.Dictionary([['pad']])
    vocabulary.add_documents(df['tokens'])
    return vocabulary

# Make the dictionary without padding for the basic models.
# It is built from every row of df, i.e. before/independently of the train/validation split.
review_dict = make_dict(df, padding=False)

Dictionary without padding


In [15]:
# Function to get the output tensor
def make_target(label):
    """Wrap an integer class label in a one-element ``torch.long`` tensor on ``device``.

    The label is used as-is, so the function works for any number of classes
    rather than only the ids 0-4.

    Args:
        label: non-negative whole-number class id (Python or NumPy integer).

    Returns:
        Tensor of shape ``(1,)`` and dtype ``torch.long`` holding ``label``.

    Raises:
        ValueError: if ``label`` is negative or not a whole number, instead of
            silently mapping it to some class.
    """
    class_id = int(label)
    if class_id != label or class_id < 0:
        raise ValueError("label must be a non-negative integer class id, got %r" % (label,))
    return torch.tensor([class_id], dtype=torch.long, device=device)

In [16]:
# Input width of the model: one feature per dictionary entry.
VOCAB_SIZE = len(review_dict)
# Output width of the model: taken from the fitted label encoder so it always
# matches the classes actually present in df instead of a hard-coded count.
NUM_LABELS = len(le.classes_)

# Function to make bow vector to be used as input to network
def make_bow_vector(review_dict, sentence):
    """Encode a tokenised document as a bag-of-words count vector.

    Args:
        review_dict: object exposing a ``token2id`` mapping from token to
            integer column index (e.g. the gensim dictionary built above).
        sentence: iterable of tokens.

    Returns:
        float32 tensor of shape ``(1, len(review_dict.token2id))`` on
        ``device`` whose entry ``i`` is the number of occurrences of the token
        with id ``i``. Tokens absent from ``token2id`` are ignored.
    """
    token2id = review_dict.token2id
    # Skip out-of-vocabulary tokens rather than failing with KeyError.
    token_ids = [token2id[word] for word in sentence if word in token2id]

    # Size the vector from the dictionary that was passed in, so the function
    # stays correct for any dictionary, and allocate float32 directly.
    vec = torch.zeros(len(token2id), dtype=torch.float32, device=device)
    if token_ids:
        positions = torch.tensor(token_ids, dtype=torch.long, device=device)
        ones = torch.ones(len(token_ids), dtype=torch.float32, device=device)
        # Single vectorised accumulation; repeated ids add up to the token count.
        vec.index_add_(0, positions, ones)
    return vec.view(1, -1)

In [17]:
# Bag-of-words classifier: a single affine layer followed by log-softmax
class BoWClassifier(nn.Module):
    """Linear model over bag-of-words counts that outputs log-probabilities.

    Args:
        num_labels: number of output classes.
        vocab_size: length of the bag-of-words input vector.
    """

    def __init__(self, num_labels, vocab_size):
        # nn.Module must be initialised before sub-modules are registered.
        super().__init__()

        # The only trainable parameters: weight matrix A and bias b of Ax + b.
        # log_softmax itself has no parameters, so nothing else is created here.
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels (normalised along dim 1)."""
        logits = self.linear(bow_vec)
        return logits.log_softmax(dim=1)

In [18]:
# Seed PyTorch before the model is built so the initial weights are the same on every run
torch.manual_seed(RANDOM_STATE)

# Initialize the model and move its parameters to the selected device
bow_nn_model = BoWClassifier(NUM_LABELS, VOCAB_SIZE).to(device)

# Loss function: NLLLoss takes log-probabilities, which is what the model's
# log_softmax output provides
loss_function = nn.NLLLoss()
# Optimizer initialization: plain SGD over all model parameters
optimizer = optim.SGD(bow_nn_model.parameters(), lr=0.01)

In [19]:
import time
from sklearn.metrics import classification_report

def validation_function(bow_nn_model, X_valid, y_valid):
  """Print a per-class classification report for the model on a validation split.

  Args:
      bow_nn_model: model mapping a bag-of-words vector to per-class log-probabilities.
      X_valid: DataFrame with a ``tokens`` column (one token list per row).
      y_valid: DataFrame with a ``LABEL`` column, indexed like ``X_valid``.

  Returns:
      None. The report and the elapsed time are printed.
  """
  bow_nn_predictions = []
  original_labels = []
  start_time = time.time()

  # Evaluate in eval mode, then restore whatever mode the caller had set.
  was_training = bow_nn_model.training
  bow_nn_model.eval()
  try:
    with torch.no_grad():
      for index, row in X_valid.iterrows():
        bow_vec = make_bow_vector(review_dict, row['tokens'])
        log_probs = bow_nn_model(bow_vec)
        bow_nn_predictions.append(int(torch.argmax(log_probs, dim=1).item()))
        # The label is already an integer id; no need to build a device tensor for it.
        original_labels.append(int(y_valid['LABEL'][index]))
  finally:
    bow_nn_model.train(was_training)

  # Pass label ids and names together, sorted by id, so every name is attached
  # to the right class even if a class is absent from this split or
  # CATEGORY_DICT is not in label order.
  label_ids = sorted(CATEGORY_DICT)
  print(classification_report(original_labels, bow_nn_predictions,
                              labels=label_ids,
                              target_names=[CATEGORY_DICT[label_id] for label_id in label_ids],
                              zero_division=0))
  print("Time taken to predict: " + str(time.time() - start_time))

In [20]:
import time
start_time = time.time()


# Per-epoch metric histories. The two train_* lists are filled by the loop
# below. The valid_* lists are only created here: validation_function prints
# its report and returns nothing, so this cell has no values to store in them.
train_loss_history = []
train_acc_history = []
valid_loss_history = []
valid_acc_history = []
valid_precision_history = []
valid_recall_history = []
valid_f1_history = []

# Train the model
for epoch in range(NUM_EPOCHS):
    bow_nn_model.train()

    # Running totals are kept as tensors on the device and read back once per epoch.
    epoch_loss = torch.zeros((), device=device)
    epoch_correct = torch.zeros((), dtype=torch.long, device=device)

    for index, row in X_train.iterrows():
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance
        bow_nn_model.zero_grad()

        # Step 2. Make BOW vector for input features and target label
        bow_vec = make_bow_vector(review_dict, row['tokens'])
        target = make_target(y_train['LABEL'][index])

        # Step 3. Run the forward pass.
        probs = bow_nn_model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()

        # Step 5. Accumulate training metrics for this epoch.
        epoch_loss += loss.detach()
        epoch_correct += (probs.argmax(dim=1) == target).sum()

    # Mean loss and accuracy over the training examples seen this epoch.
    num_examples = max(len(X_train), 1)
    train_loss_history.append(epoch_loss.item() / num_examples)
    train_acc_history.append(epoch_correct.item() / num_examples)

    # Evaluate after each epoch
    validation_function(bow_nn_model, X_valid, y_valid)


print("Time taken to train the model: " + str(time.time() - start_time))

                                             precision    recall  f1-score   support

Energy Storage, Conversion, and Utilization       0.85      0.70      0.77       185
                     Environmental Sciences       0.68      0.84      0.75       182
           Fission and Nuclear Technologies       0.88      0.82      0.85       191
                               Fossil Fuels       0.77      0.79      0.78       174
                   Renewable Energy Sources       0.71      0.71      0.71       186

                                   accuracy                           0.77       918
                                  macro avg       0.78      0.77      0.77       918
                               weighted avg       0.78      0.77      0.77       918

Time taken to predict: 1.8576514720916748
                                             precision    recall  f1-score   support

Energy Storage, Conversion, and Utilization       0.89      0.67      0.77       185
                   

In [22]:
# Display the integer label -> class name mapping
CATEGORY_DICT

{0: 'Energy Storage, Conversion, and Utilization',
 1: 'Environmental Sciences',
 2: 'Fission and Nuclear Technologies',
 3: 'Fossil Fuels',
 4: 'Renewable Energy Sources'}