# Libraries

In [1]:
# Libraries
import numpy as np
import pandas as pd

import torch

from transformers import RobertaConfig
import tensorflow_hub as hub
import tensorflow_text

In [2]:
# Uncomment the line below to install tensorflow_text if the imports above fail
#
#!pip install tensorflow_text

# Check for GPU

In [3]:
# Pick the PyTorch device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


# Import data

In [13]:
# Load the cleaned book dataset from a path relative to this notebook.
data = pd.read_csv("../data/clean_data.csv")
# Bare expression so the notebook renders the frame as this cell's output.
data

Unnamed: 0,title,genre,summary,word_count
0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,803
1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",563
2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,334
3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,776
4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,1190
...,...,...,...,...
3097,Wintersmith,fantasy,Tiffany Aching is a trainee witch — now workin...,132
3098,Fantastic Beasts and Where to Find Them: The O...,fantasy,J.K. Rowling's screenwriting debut is captured...,117
3099,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",144
3100,Red Rising,fantasy,"""I live for the dream that my children will be...",244


# RoBERTa Configuration and BERT Feature Extraction

In [14]:
# One output label per distinct genre present in the dataset.
num_labels = data['genre'].nunique()

# Default RoBERTa configuration, sized for that number of labels.
# NOTE(review): `configuration` is not referenced by the other cells visible here — confirm it is still needed.
configuration = RobertaConfig(num_labels=num_labels)

In [15]:
# Build one text field per row: "<title> <summary>", joined by a single space.
data['title_summary'] = data['title'].str.cat(others=data['summary'], sep=" ")
# Preview the first rows to confirm the new column.
data.head()

Unnamed: 0,title,genre,summary,word_count,title_summary
0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,803,Drowned Wednesday Drowned Wednesday is the fi...
1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",563,"The Lost Hero As the book opens, Jason awaken..."
2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,334,The Eyes of the Overworld Cugel is easily per...
3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,776,Magic's Promise The book opens with Herald-Ma...
4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,1190,Taran Wanderer Taran and Gurgi have returned ...


In [16]:
# The combined title + summary text is what gets fed to the encoder.
sentences = data.loc[:, 'title_summary']

In [17]:
# TF Hub preprocessing layer matching the uncased English BERT encoder below.
preprocessor = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
)

# TF Hub small-BERT encoder, loaded with its weights flagged as trainable.
# NOTE(review): the cells shown here only run a forward pass, so trainable=True
# looks unnecessary — confirm before changing.
encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2",
    trainable=True,
)

In [18]:
# Run the preprocessing layer over every title + summary string to build the encoder inputs
inputs = preprocessor(sentences)

# Feed the preprocessed inputs through the encoder to obtain the embeddings
outputs = encoder(inputs)

In [19]:
# Check the shape of sequence outputs
# Sanity check: inspect the shape of the encoder's 'sequence_output' tensor
outputs['sequence_output'].shape

TensorShape([3102, 128, 512])

In [20]:
# Create data frame for bert features
# Start from an empty data frame that will hold the encoder features
# NOTE(review): named `roberta`, but the features come from the TF Hub BERT encoder loaded earlier.
roberta = pd.DataFrame()

In [21]:
# Collapse each row's token embeddings into one fixed-size vector by summing
# over the token axis: (n_rows, seq_len, hidden) -> (n_rows, hidden).
#
# A single vectorised reduction replaces the per-row DataFrame.append loop:
# DataFrame.append is deprecated (it is what triggered the FutureWarning in
# this cell's output), was removed in pandas 2.0, and copies the whole frame
# on every call, making the loop quadratic in the number of rows.
#
# NOTE(review): the sum covers every token position of 'sequence_output',
# including any padding positions — presumably intentional; confirm before
# switching to a different pooling.
token_embeddings = outputs['sequence_output'].numpy()
roberta = pd.DataFrame(token_embeddings.sum(axis=1))

print('values added in dataframe')
     

  roberta = roberta.append(pd.Series(so),ignore_index=True)
  roberta = roberta.append(pd.Series(so),ignore_index=True)


values added in dataframe


In [22]:
# Display the feature frame built above (one row per input text, one column per embedding dimension)
roberta

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,4.316272,15.405085,24.828201,-75.172447,-13.534739,-38.382740,54.859394,26.104355,-4.333909,-25.569908,...,-20.471935,-38.419376,-20.355982,33.418911,-29.044176,65.156868,-25.409231,31.045004,-20.738894,-0.894735
1,34.325665,36.789608,37.560623,-95.008598,-22.788834,-43.718056,71.762115,11.447465,10.506699,-27.725185,...,-53.692505,-28.026245,-34.173134,29.404604,-47.929157,38.367733,-11.220082,25.254166,-60.831356,-14.401155
2,-3.369848,14.546645,6.234241,-78.272842,-48.876003,1.956227,18.201977,53.883606,13.195460,-45.359123,...,-59.398029,-17.912430,-13.861598,58.917458,-27.803986,36.693584,-10.689712,24.056469,-47.536369,-13.605616
3,4.914439,37.100975,-27.261402,-125.001122,-28.405718,-18.743832,31.395061,51.983032,-21.138748,-50.854324,...,-56.199390,-34.378658,-27.659170,36.396881,-32.298470,76.720146,-4.331016,14.262194,-22.958921,7.181437
4,19.962574,31.626289,6.389140,-76.193504,-5.816837,-12.457826,40.327103,2.182704,-26.150543,-26.778254,...,-66.749634,-37.420101,-9.587160,36.043625,-24.721464,70.516815,24.686543,-13.676856,-13.684598,15.559875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3097,-11.225694,40.429478,10.012919,-35.114063,-23.101091,4.155180,-6.279691,54.753826,-30.134373,-53.629410,...,-62.222977,-26.613688,-6.364028,68.671242,-83.688828,40.904568,18.269629,18.572840,-74.060242,-13.316795
3098,38.573376,7.380617,1.204027,-68.112419,-23.102394,-36.635128,31.093042,53.848316,-18.717997,-36.549927,...,-74.665657,6.009476,-6.405493,26.117977,-59.971863,35.267303,-28.908421,9.282600,-76.217476,5.601767
3099,3.663643,33.316494,-14.428480,-91.111931,-22.564718,-12.256862,66.230904,50.508030,-3.872550,-45.317486,...,-45.422516,-32.452629,-21.883177,40.749496,-69.189621,46.581120,-27.960026,-47.814022,-59.681942,-4.720864
3100,13.460938,25.004812,4.577496,-73.830116,-51.823128,-32.898266,50.609035,81.129044,-11.318274,-62.635960,...,-45.635895,-28.428459,9.438528,84.751801,-24.228636,45.896317,-50.521488,36.762486,-29.728704,18.599329


# Export BERT Features

In [23]:
# Write the features to CSV without the row index; 'utf-8-sig' prepends a UTF-8 BOM to the file.
roberta.to_csv( "../data/robertaFeaturesCleanData.csv", index=False, encoding='utf-8-sig')