In [None]:
##############################################################################################
# Filename: RBM_dimension_reduction.ipynb
#
# Purpose: To learn a dense, low-dimensional embedding for high-dimensional 
#          tf-idf feature vectors
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, SciPy, RBM_learn
##############################################################################################

# Dimension Reduction for NLP With Restricted Boltzmann Machines

In [2]:
import RBM_learn
import numpy as np
import pandas as pd
import os
import random

In [3]:
## Load 'content_paragraphs_ready.csv' into a pandas dataframe
# Build the relative path with os.path.join rather than a hard-coded
# backslash string: the original literal relied on invalid escape sequences
# ("\.", "\d", "\c") and only resolved on Windows. On Windows this yields the
# exact same path as before; on other platforms it now resolves correctly.
data_filepath = os.path.join(
    os.pardir, os.pardir, "dataset", "content_paragraphs_ready.csv"
)
paragraph_data = pd.read_csv(data_filepath)
# Preview the first rows to sanity-check the load
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,...,prosper,preview,moor,coverag,glow,profil,clash,incumb,frequent,unfound
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
## Restrict the dataframe to 'doc_id', the tf-idf columns, and the label columns
# Columns are picked by position: labels sit at positions 2-14 and the
# tf-idf features run from position 25 to the end of the frame.
label_columns = paragraph_data.columns[2:15].tolist()
tfidf_colnames = paragraph_data.columns[25:].tolist()
# Final column order: identifier first, then features, then labels
cols_toKeep = ['doc_id', *tfidf_colnames, *label_columns]
# NOTE(review): this rebinds paragraph_data, so re-running this cell without
# re-running the load cell would compute the positional slices on the
# already-reduced frame.
paragraph_data = paragraph_data[cols_toKeep]
paragraph_data.head()

Unnamed: 0,doc_id,murphi,martha,defacto,stay,home,state,million,us,you�,...,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,economy,education,political evaluation,racial conflict,international ralations/foreign policies
0,text1,1.684247,1.348455,2.161368,3.118616,3.016311,0.91833,1.207125,1.383217,1.763428,...,1,1,0,1,1,0,0,0,0,0
1,text2,1.684247,1.348455,0.0,0.0,0.0,3.67332,0.0,0.0,1.763428,...,1,1,0,1,1,0,0,0,0,0
2,text3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
3,text4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,1,1,0,0,0,0,0
4,text5,1.684247,1.348455,0.0,0.0,0.0,0.0,0.0,1.383217,0.0,...,0,0,1,0,1,0,0,0,0,0


In [5]:
## Build the feature matrix X (tf-idf columns) and the label matrix Y as float arrays
# No train/test split is performed in this cell; both matrices cover every row.
X, Y = (
    paragraph_data[selected_cols].to_numpy().astype(float)
    for selected_cols in (tfidf_colnames, label_columns)
)

In [6]:
## Instantiate an RBM object and initialize its weights
# n_h is a fixed choice; n_v is set to the number of tf-idf columns.
n_h = 165
n_v = len(tfidf_colnames)
rbm_network = RBM_learn.rbm_network(n_v, n_h)
# Fixed seed so the initial weights are the same on every run
rbm_network.initialize_weights(seed=123)
# Display the weight matrix shape as a sanity check
rbm_network.W.shape

(2094, 165)

In [None]:
## Fit the RBM to the tf-idf matrix X
# Hyperparameters handed to RBM_learn_adaptive (their exact semantics are
# defined in the RBM_learn module, which is not shown here).
num_epochs = 1000
tau = 50
learning_rate = 5
# NOTE(review): in the recorded output the MAE stays at roughly 1.0 for the
# first 23 epochs -- confirm that these settings and the scale of X are
# appropriate for this RBM implementation.
rbm_network.RBM_learn_adaptive(X, learning_rate, num_epochs, tau, verbose=True)

MAE after epoch  1  :  0.9999680777606791
MAE after epoch  2  :  1.000623063123364
MAE after epoch  3  :  1.0031780099679872
MAE after epoch  4  :  1.0067351044908868
MAE after epoch  5  :  1.0075485733957448
MAE after epoch  6  :  1.0072682395354113
MAE after epoch  7  :  1.0069787222542632
MAE after epoch  8  :  1.0065408876295656
MAE after epoch  9  :  1.0063851815506866
MAE after epoch  10  :  1.006040400259253
MAE after epoch  11  :  1.0057777998977042
MAE after epoch  12  :  1.0059000159082927
MAE after epoch  13  :  1.0059466830048842
MAE after epoch  14  :  1.0063353521192933
MAE after epoch  15  :  1.0060864668902576
MAE after epoch  16  :  1.0060710669408819
MAE after epoch  17  :  1.0052843857998714
MAE after epoch  18  :  1.0061621605983138
MAE after epoch  19  :  1.004645245423173
MAE after epoch  20  :  1.006092377188229
MAE after epoch  21  :  1.0030945187140103
MAE after epoch  22  :  1.0055578268148964
MAE after epoch  23  :  1.0002490520860818
MAE after epoch  24  :  