In [None]:
########################################################################################################################
# Filename: Autoencoder_for_Dimension_Reduction.ipynb
#
# Purpose: To learn an autoencoder for dimension reduction of tf-idf feature vectors
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, tensorflow, bpmll
########################################################################################################################

# Autoencoder Learning for TF-IDF Feature Vectors (Non-linear Dimension Reduction)

In [83]:
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
from tensorflow.keras.models import Model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from skmultilearn.adapt import MLkNN
import sys
sys.path.append('../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function from the 

In [8]:
## Load 'content_paragraphs_ready.csv' into a pandas dataframe
data_filepath = "..\..\dataset\content_paragraphs_ready.csv"
paragraph_data = pd.read_csv(data_filepath)
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,...,prosper,preview,moor,coverag,glow,profil,clash,incumb,frequent,unfound
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
## Keep only 'doc_id', the label columns, and the tf-idf columns
tfidf_colnames = list(paragraph_data.columns[25:])
label_columns = list(paragraph_data.columns[2:15])
cols_toKeep = ['doc_id']
cols_toKeep.extend(tfidf_colnames)
cols_toKeep.extend(label_columns)
paragraph_data = paragraph_data[cols_toKeep]
paragraph_data.head()

Unnamed: 0,doc_id,murphi,martha,defacto,stay,home,state,million,us,you�,...,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,economy,education,political evaluation,racial conflict,international ralations/foreign policies
0,text1,1.684247,1.348455,2.161368,3.118616,3.016311,0.91833,1.207125,1.383217,1.763428,...,1,1,0,1,1,0,0,0,0,0
1,text2,1.684247,1.348455,0.0,0.0,0.0,3.67332,0.0,0.0,1.763428,...,1,1,0,1,1,0,0,0,0,0
2,text3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
3,text4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,1,1,0,0,0,0,0
4,text5,1.684247,1.348455,0.0,0.0,0.0,0.0,0.0,1.383217,0.0,...,0,0,1,0,1,0,0,0,0,0


In [20]:
## Define the X and Y train and test matrices
X = paragraph_data[tfidf_colnames].to_numpy().astype(float)
Y = paragraph_data[label_columns].to_numpy().astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 321)

In [93]:
X_train.shape[1]

2094

In [96]:
latent_dim = 62
visible_dim = X.shape[1]

class Autoencoder(Model):
    def __init__(self, latent_dim, visible_dim):
        super(Autoencoder, self).__init__()
        self.latent_dim = latent_dim   
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Dense(latent_dim * 2, activation = 'relu'),
            tf.keras.layers.Dense(latent_dim, activation = 'relu'),
        ])
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(latent_dim * 2, activation = 'relu'),
            tf.keras.layers.Dense(visible_dim, activation = 'sigmoid'),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

autoencoder = Autoencoder(latent_dim, visible_dim)

In [97]:
autoencoder.compile(optimizer = 'adam', loss = tf.keras.losses.MeanSquaredError())

In [105]:
tf.random.set_seed(123)
autoencoder.fit(X_train, X_train,
                epochs = 200,
                shuffle = True,
                validation_data = (X_test, X_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x16833950cd0>

In [112]:
encoded_tfidfs = autoencoder.encoder(X).numpy()
decoded_tfidfs = autoencoder.decoder(encoded_tfidfs).numpy()

In [113]:
encoded_tfidfs.shape

(290, 62)

In [114]:
## Write the encoded data to a csv file
encoded_df = pd.DataFrame(encoded_tfidfs)
encoded_df.to_csv('encoded_tfidfs.csv', index = False)

## Use Encoded Data for ML-KNN

In [107]:
encoded_train, encoded_test, Y_train, Y_test = train_test_split(encoded_tfidfs, Y, test_size = 0.33, random_state = 321)

In [108]:
classifier = MLkNN(k = 3)
classifier.fit(encoded_train, Y_train)
y_train_pred = classifier.predict(encoded_train)
y_train_pred_array = y_train_pred.toarray()
y_test_pred = classifier.predict(encoded_test)
y_test_pred_array = y_test_pred.toarray()

print (f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, y_train_pred_array):.3f}")
print (f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, y_test_pred_array):.3f}")



The Hamming loss for the training data is 0.093
The Hamming loss for the test data is 0.174


In [109]:
%%capture
from sklearn.model_selection import GridSearchCV

parameters = {'k': range(1,5), 's': [1.0, 1.5, 2.5, 5.0, 10.0]}  
#By default, the Hamming loss as an option is not provided in the scoring string options. So, we will make the Hamming loss funciton as a scorer and use that. 
hamming_scorer = metrics.make_scorer(metrics.hamming_loss)

clf = GridSearchCV(MLkNN(), parameters, scoring = hamming_scorer, cv = 5, verbose = 1)
clf.fit(encoded_train, Y_train)

In [110]:
best_index = np.argmin(clf.cv_results_["mean_test_score"])
best_parameters = clf.cv_results_["params"][best_index]

df_CV = pd.DataFrame(columns=["Params", "Mean out-of-bag Hamming loss"])
df_CV["Params"] = clf.cv_results_["params"]
df_CV[ "Mean out-of-bag Hamming loss"] = clf.cv_results_["mean_test_score"]
display(df_CV)
print(best_parameters, np.min(clf.cv_results_['mean_test_score']))

Unnamed: 0,Params,Mean out-of-bag Hamming loss
0,"{'k': 1, 's': 1.0}",0.192723
1,"{'k': 1, 's': 1.5}",0.192723
2,"{'k': 1, 's': 2.5}",0.192723
3,"{'k': 1, 's': 5.0}",0.192723
4,"{'k': 1, 's': 10.0}",0.192723
5,"{'k': 2, 's': 1.0}",0.20381
6,"{'k': 2, 's': 1.5}",0.201837
7,"{'k': 2, 's': 2.5}",0.20026
8,"{'k': 2, 's': 5.0}",0.188394
9,"{'k': 2, 's': 10.0}",0.183671


{'k': 3, 's': 10.0} 0.1709643932315997


In [111]:
#Threshold learning
#using the best parameters from the cross-validation with original threshold. 
classifier_best = MLkNN(k = 3, s = 10)
classifier_best.fit(encoded_train, Y_train)
y_train_pred_best = classifier_best.predict(encoded_train)
y_train_pred_best_array = y_train_pred_best.toarray()
y_test_pred_best = classifier_best.predict(encoded_test)
y_test_pred_best_array = y_test_pred_best.toarray()

print (f"Best parameters: The Hamming loss training data is {metrics.hamming_loss(Y_train, y_train_pred_best_array):.3f}")
print (f"Best parameters: The Hamming loss test data is {metrics.hamming_loss(Y_test, y_test_pred_best_array):.3f}")

#using the best parameters
classifier_CV = MLkNN(k = 3, s = 2.5)
classifier_CV.fit(encoded_train, Y_train)
y_train_pred_proba = classifier_CV.predict_proba(encoded_train)
y_train_pred_proba_array = y_train_pred_proba.toarray()
y_test_pred_proba = classifier_CV.predict_proba(encoded_test)
y_test_pred_proba_array = y_test_pred_proba.toarray()

t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(y_train_pred_proba_array, Y_train, y_test_pred_proba_array, t_range)
print (f"Best parameters with threshold function learning: Hamming loss Test set is {metrics.hamming_loss(Y_test, test_labels_binary)}")



Best parameters: The Hamming loss training data is 0.103
Best parameters: The Hamming loss test data is 0.175




Best parameters with threshold function learning: Hamming loss Test set is 0.21394230769230768
