# Libraries

In [1]:
# Libraries
import numpy as np
import pandas as pd

import torch

from transformers import RobertaConfig
import tensorflow_hub as hub
import tensorflow_text

In [2]:
# Uncomment the line below to install the required package
#
#!pip install tensorflow_text

# Check for GPU

In [3]:
# Choose the torch device: fall back to the CPU unless CUDA reports a usable GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

else:
    # At least one GPU is visible: select CUDA, then report the count and
    # the name of device 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


# Import data

In [4]:
# Load the preprocessed book summaries from a CSV path relative to the working directory.
# The recorded output below shows title, genre, summary and word_count columns.
data = pd.read_csv("data/preprocess_data.csv")
# Bare expression so the notebook renders the frame as this cell's output.
data

Unnamed: 0,title,genre,summary,word_count
0,Drowned Wednesday,fantasy,drown wednesday first truste among morrow day ...,803
1,The Lost Hero,fantasy,book open jason awaken school unabl rememb any...,563
2,The Eyes of the Overworld,fantasy,cugel easili persuad merchant fianosth attempt...,334
3,Magic's Promise,fantasy,book open herald mage vanyel return countri va...,776
4,Taran Wanderer,fantasy,taran gurgi return caer dallben follow event t...,1190
...,...,...,...,...
3097,Wintersmith,fantasy,tiffani ach traine witch work serious scari mi...,132
3098,Fantastic Beasts and Where to Find Them: The O...,fantasy,rowl screenwrit debut captur excit hardcov edi...,117
3099,Hounded,fantasy,atticus sullivan last druid life peac arizona ...,144
3100,Red Rising,fantasy,live dream child born free say like land fathe...,244


# BERT Model

In [5]:
# Number of distinct genres in the data, used as the label count below.
num_labels = data.genre.nunique()

# Build a default RoBERTa configuration with the label count set at construction.
# NOTE(review): `configuration` is not referenced by any later cell visible here, and the
# features below come from a TF-Hub BERT encoder — confirm this object is still needed.
configuration = RobertaConfig(num_labels=num_labels)

In [6]:
# Summary text column; this is what gets passed to the BERT preprocessor in a later cell.
sentences = data['summary']

In [7]:
# BERT preprocessor: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
# Loaded from TF-Hub as a Keras layer; applied to the raw summaries before encoding.
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

# BERT encoder: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2
# NOTE(review): trainable=True is requested, but no training step appears in the cells
# visible here — confirm whether the encoder is actually meant to be fine-tuned.
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2",trainable=True)



In [None]:
# Run the TF-Hub preprocessor over every summary in one call
# (per the original note, this step adds the [CLS]/[SEP] special tokens).
inputs = preprocessor(sentences)

# Feed the preprocessed inputs to the encoder to vectorize them; the result is
# looked up by key in later cells (e.g. 'sequence_output').
outputs = encoder(inputs)

In [9]:
# Inspect the shape of the encoder's 'sequence_output' tensor.
# The recorded output for this cell is TensorShape([3102, 128, 512]).
outputs['sequence_output'].shape

TensorShape([3102, 128, 512])

In [10]:
# Create the (initially empty) data frame that will hold the BERT features;
# the next cell populates `bert`.
bert = pd.DataFrame()

In [11]:
# Collapse each summary's per-position embeddings into a single feature vector by
# summing 'sequence_output' over axis 1 (the length-128 axis in the shape recorded
# above), giving one row per summary and one column per embedding dimension.
#
# The frame is built in a single step instead of growing it row by row:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, it copies the
# whole frame on every iteration, and re-running an appending cell would duplicate
# rows. Assigning a fresh frame keeps this cell idempotent.
pooled = outputs['sequence_output'].numpy().sum(axis=1)
bert = pd.DataFrame(pooled)

print('values added in dataframe')
     

  bert = bert.append(pd.Series(so),ignore_index=True)
  bert = bert.append(pd.Series(so),ignore_index=True)


values added in dataframe


In [13]:
# Render the feature frame as this cell's output.
bert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,34.064705,22.735920,-76.582848,-54.817326,-36.221695,3.004882,32.512608,-11.660388,-22.189047,-14.774579,...,-58.249287,-40.098869,-51.064007,43.838612,40.184414,46.617764,57.307362,79.453125,67.457970,60.330944
1,44.634422,53.019367,-75.831757,-3.843476,-38.149925,-10.051599,43.763599,-7.165840,-38.409206,-16.900696,...,-70.121567,-14.141790,-73.912643,53.756992,12.076204,27.182837,36.107281,136.759613,56.508595,43.257900
2,23.813639,61.292953,-48.090878,-26.896975,-70.302803,26.154720,37.612103,-13.753791,-5.820881,-10.963672,...,-76.458641,-24.738899,-95.690552,51.725262,13.726544,29.130074,36.723839,34.560719,21.573721,71.547203
3,34.096478,70.980110,-72.184326,-49.600845,-36.909424,-5.688858,13.047146,15.227018,-31.379810,-45.028973,...,-67.968811,-30.079834,-63.493378,41.449932,-11.426052,54.869556,42.769337,90.878685,68.338593,47.883766
4,29.388363,61.366852,-70.836609,-66.867935,-20.526585,5.349851,11.215246,-4.479660,-36.570850,-9.955334,...,-91.066353,-39.021122,-81.629021,37.976974,-0.961639,76.855942,49.196911,42.216511,67.465775,77.140923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3097,12.125149,64.377953,-48.617119,-8.084726,-30.780540,4.701486,30.305477,-26.343121,-69.043503,13.660601,...,-92.400650,-24.641556,-66.916298,35.537956,-6.504138,21.731955,49.664429,81.811951,51.950375,38.102707
3098,30.124966,34.602913,-47.694775,-30.060091,-50.993698,30.277813,57.927208,24.578543,-16.731558,-23.782059,...,-61.238155,-41.059681,-49.547863,46.638039,18.759777,-3.994585,43.995285,56.942993,1.023269,25.830317
3099,29.080410,48.350159,-92.401459,-37.765911,-52.237633,24.671877,30.593655,23.596363,-30.915812,-17.476912,...,-85.544731,-19.823086,-63.061943,58.735950,-36.939617,39.063965,22.339619,45.772694,31.699293,40.172459
3100,27.993219,46.849545,-52.659748,-13.563160,-48.469765,22.498436,46.433445,26.110325,-25.438171,-29.787111,...,-85.879852,-25.784689,-47.455830,54.880508,8.015970,41.336716,12.458059,102.638893,21.027193,46.415066


# Export BERT Features

In [12]:
# Write the features to CSV without the row index; 'utf-8-sig' prefixes the file with a BOM.
bert.to_csv( "data/bertFeatures.csv", index=False, encoding='utf-8-sig')