# Libraries

In [1]:
# Libraries
import numpy as np
import pandas as pd

import torch

from transformers import RobertaConfig
import tensorflow_hub as hub
import tensorflow_text

In [2]:
# Uncomment the line below to install tensorflow_text if it is missing
# (use %pip so the package is installed into this kernel's environment).
# %pip install tensorflow_text

# Check for GPU

In [3]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Report which device was picked.
if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


# Import data

In [4]:
# Load the preprocessed dataset from CSV and display it.
data = pd.read_csv(filepath_or_buffer="../data/preprocess_data.csv")
data

Unnamed: 0,title,genre,summary,word_count,title_summary
0,drowned wednesday,fantasy,drowned wednesday first trustee among morrow d...,803,drowned wednesday drowned wednesday first trus...
1,lost hero,fantasy,book opens jason awakens school unable remembe...,563,lost hero book open jason awakens school unabl...
2,eyes overworld,fantasy,cugel easily persuaded merchant fianosther att...,334,eye overworld cugel easily persuaded merchant ...
3,magic promise,fantasy,book opens herald mage vanyel returning countr...,776,magic promise book open herald mage vanyel ret...
4,taran wanderer,fantasy,taran gurgi returned caer dallben following ev...,1190,taran wanderer taran gurgi returned caer dallb...
...,...,...,...,...,...
3097,wintersmith,fantasy,tiffany aching trainee witch working seriously...,132,wintersmith tiffany aching trainee witch worki...
3098,fantastic beasts find original screenplay,fantasy,rowling screenwriting debut captured exciting ...,117,fantastic beast find original screenplay rowli...
3099,hounded,fantasy,atticus sullivan last druids lives peacefully ...,144,hounded atticus sullivan last druid life peace...
3100,rising,fantasy,live dream children born free says like land f...,244,rising live dream child born free say like lan...


# BERT Model

In [5]:
# Number of target classes = number of distinct values in the genre column.
num_labels = data["genre"].nunique()

# Build a default RoBERTa configuration and set its num_labels to the genre count.
# NOTE(review): `configuration` is not referenced by any later cell visible here,
# and the features below come from a TF-Hub BERT encoder — confirm it is still needed.
configuration = RobertaConfig()
configuration.num_labels = num_labels

In [6]:
# Text fed to BERT: the 'title_summary' column of the dataset
# (judging by the preview above it looks like the title followed by the summary — TODO confirm).
sentences = data['title_summary']

In [7]:
# BERT preprocessor: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)

# BERT encoder: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2
# The layer is created with trainable=True.
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2",
    trainable=True,
)



In [8]:
# Preprocess the whole text column with the TF-Hub BERT preprocessor
# (tokenization plus the special [CLS]/[SEP] tokens the encoder expects).
inputs = preprocessor(sentences)

# Feed the preprocessed inputs to the encoder to obtain the embeddings.
# NOTE(review): the entire dataset goes through the encoder in a single call, so the
# full sequence output is held in memory at once — consider batching for larger datasets.
outputs = encoder(inputs)

In [9]:
# Check the shape of the per-token sequence output.
# In the recorded run this is (3102, 128, 512): one entry per input text,
# presumably 128 token positions and 512 hidden units each.
outputs['sequence_output'].shape

TensorShape([3102, 128, 512])

In [10]:
# Create an empty DataFrame to hold the BERT features
bert = pd.DataFrame()

In [11]:
# Collapse each text's token embeddings into one fixed-length vector by summing
# over the token axis (axis 1 of the sequence output), then build the feature
# table in a single step: one row per text, one column per hidden dimension.
#
# This replaces the row-by-row DataFrame.append loop. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, and it copies the whole
# frame on every iteration, making the loop quadratic in the number of rows.
#
# NOTE(review): the sum runs over every token position, including any padding
# positions added by the preprocessor — confirm that is the intended pooling.
pooled = outputs['sequence_output'].numpy().sum(axis=1)
bert = pd.DataFrame(pooled)

print('values added in dataframe')
     

  bert = bert.append(pd.Series(so),ignore_index=True)
  bert = bert.append(pd.Series(so),ignore_index=True)


values added in dataframe


In [12]:
# Display the pooled BERT feature table
bert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,37.146961,-0.718577,-73.298584,-67.357712,-53.100407,21.316463,41.753353,32.956703,0.400308,-6.732258,...,-42.584038,-44.468742,-58.364819,53.291164,4.001782,54.231129,65.188087,84.791145,41.844978,53.529175
1,33.885796,13.979010,-85.719353,-30.451246,-22.090153,4.564034,31.165920,46.367527,-45.197884,-4.584962,...,-70.876808,-29.125689,-56.661724,68.347633,-14.737951,47.097118,39.724976,132.571991,53.459076,32.982380
2,16.787161,29.406092,-48.264774,-38.957745,-64.305206,38.198917,5.805443,29.524828,-17.051638,-50.592262,...,-86.319290,-12.413198,-68.498245,68.740623,15.706626,7.832871,24.119993,63.330021,-2.067029,37.960247
3,35.850006,42.962036,-75.509804,-48.785091,-41.254383,-1.976429,-6.982839,15.469737,-31.976854,-40.112244,...,-71.183754,-40.468624,-63.892910,65.396103,-26.214056,62.399139,22.835871,70.418625,57.963970,40.734089
4,29.469852,34.854473,-71.579407,-56.628281,-25.895311,10.295429,3.647528,6.661787,-56.010277,-16.745693,...,-95.782936,-47.623943,-66.134270,49.544071,-15.663713,82.616539,51.079155,34.097988,34.966217,50.609196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3097,40.814007,52.365459,-66.301216,-5.888998,-25.481979,12.881906,-20.610170,29.777679,-70.850876,-23.516727,...,-95.068092,-3.134943,-21.826483,52.187405,4.000219,-8.243583,61.727478,65.101852,13.242323,-11.598402
3098,79.484070,14.683929,-55.400757,-36.410702,-51.086018,22.655785,23.772587,18.880114,-26.694736,-28.209026,...,-95.518692,-6.570502,-43.297688,36.865738,-26.593466,-11.854276,15.762242,30.530457,-21.491936,6.241181
3099,34.106094,46.009880,-107.840492,-21.282597,-41.868919,19.889317,27.115509,53.853207,-42.030037,-11.185226,...,-100.019051,-14.799921,-43.575123,80.095276,-46.723160,51.911724,5.770129,22.068426,4.152709,21.077639
3100,31.388390,6.374999,-67.875259,-34.819275,-34.774792,49.245262,7.877729,72.808311,-33.193504,-47.335880,...,-97.020836,-23.411839,-46.369392,119.432793,-8.986543,49.455700,-7.393787,114.400398,29.348709,32.622131


# Export BERT Features

In [13]:
# Write the feature table to CSV without the index column.
# The 'utf-8-sig' encoding prefixes the file with a UTF-8 byte-order mark.
# NOTE(review): the input data was read from "../data/" but this writes to "data/"
# (relative to the working directory) — confirm the output location is intended.
bert.to_csv( "data/bertFeatures_1.csv", index=False, encoding='utf-8-sig')