In [61]:
import numpy as np
import pandas as pd

import re
import string

from IPython.display import clear_output

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

In [62]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

clear_output()

In [39]:
data = pd.read_csv("data.csv")

In [40]:
data.head()

Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...


In [41]:
data.genre.value_counts()

thriller      1023
fantasy        876
science        647
history        600
horror         600
crime          500
romance        111
psychology     100
sports         100
travel         100
Name: genre, dtype: int64

In [42]:
data = data.drop(['index'], axis=1)

In [43]:
data.isnull().sum()

title      0
genre      0
summary    0
dtype: int64

In [44]:
data.nunique()

title      4296
genre        10
summary    4542
dtype: int64

## Data Cleaning

In [45]:
#cleaning unecessary text from the string 
def data_clean(text):
    
    # Convert to lowerCase
    text = text.lower() 
    
    # removing punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ',text) 
    
    # Remove stopwords
    text_tokens = word_tokenize(text)
    tw = [word for word in text_tokens if not word in stopwords.words('english')]
    text = (" ").join(tw)
    
    split_text = text.split(' ')
    
    # Remove words with length<=3
    output = [x for x in split_text if len(x) > 3]
    text = (" ").join(output)
    
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text) #removing single character 
    text = re.sub('<.*?>+',' ',text) #removing HTML Tags
    text = re.sub('\n', ' ',text) #removal of new line characters
    text = re.sub(r'\s+', ' ',text) #removal of multiple spaces
    
    return text

In [46]:
data['summary'] = data['summary'].apply(data_clean)
data['title'] = data['title'].apply(data_clean)

In [47]:
data

Unnamed: 0,title,genre,summary
0,drowned wednesday,fantasy,drowned wednesday first trustee among morrow d...
1,lost hero,fantasy,book opens jason awakens school unable remembe...
2,eyes overworld,fantasy,cugel easily persuaded merchant fianosther att...
3,magic promise,fantasy,book opens herald mage vanyel returning countr...
4,taran wanderer,fantasy,taran gurgi returned caer dallben following ev...
...,...,...,...
4652,hounded,fantasy,atticus sullivan last druids lives peacefully ...
4653,charlie chocolate factory,fantasy,charlie bucket wonderful adventure begins find...
4654,rising,fantasy,live dream children born free says like land f...
4655,frostbite,fantasy,rose loves dimitri dimitri might love tasha ma...


# Preprocessing

In [57]:
def data_preprocess(text):
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lemmetization
    tokens = [WordNetLemmatizer().lemmatize(word) for word in tokens]
    
    # Stemming
    tokens = [SnowballStemmer(language = 'english').stem(word) for word in tokens]
    
    return " ".join(tokens)

In [58]:
data['summary'] = data['summary'].apply(data_preprocess)
data['title'] = data['title'].apply(data_preprocess)
data

Unnamed: 0,title,genre,summary
0,drown wednesday,fantasy,drown wednesday first truste among morrow day ...
1,lost hero,fantasy,book open jason awaken school unabl rememb any...
2,eye overworld,fantasy,cugel easili persuad merchant fianosth attempt...
3,magic promis,fantasy,book open herald mage vanyel return countri va...
4,taran wander,fantasy,taran gurgi return caer dallben follow event t...
...,...,...,...
4652,hound,fantasy,atticus sullivan last druid life peac arizona ...
4653,charli chocol factori,fantasy,charli bucket wonder adventur begin find willi...
4654,rise,fantasy,live dream child born free say like land fathe...
4655,frostbit,fantasy,rose love dimitri dimitri might love tasha mas...


## BERT

In [66]:
import torch

In [67]:
if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [None]:
from transformers import RobertaConfig

# Initializing a RoBERTa configuration
configuration = RobertaConfig()

configuration.num_labels = num_labels

In [69]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.10.0-cp39-cp39-win_amd64.whl (5.0 MB)
     ---------------------------------------- 5.0/5.0 MB 3.5 MB/s eta 0:00:00
Collecting tensorflow<2.11,>=2.10.0
  Downloading tensorflow-2.10.1-cp39-cp39-win_amd64.whl (455.9 MB)
     -------------------------------------- 455.9/455.9 MB 2.1 MB/s eta 0:00:00
Collecting tensorflow-hub>=0.8.0
  Downloading tensorflow_hub-0.12.0-py2.py3-none-any.whl (108 kB)
     -------------------------------------- 108.8/108.8 KB 6.2 MB/s eta 0:00:00
Collecting tensorboard<2.11,>=2.10
  Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
     ---------------------------------------- 5.9/5.9 MB 11.7 MB/s eta 0:00:00
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp39-cp39-win_amd64.whl (895 kB)
     ------------------------------------- 895.9/895.9 KB 11.4 MB/s eta 0:00:00
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting tensorflow-estimator<

You should consider upgrading via the 'C:\Users\anand\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [70]:
# libraries
import tensorflow_hub as hub
import tensorflow_text
import numpy as np
import pandas as pd

In [71]:
sentences=data['summary']

In [72]:
# bert preprocessorhttps://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
# bert encoder https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2
# encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4",trainable=True)
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2",trainable=True)



In [87]:
# preprocessing dataset adding cls sep etc 
inputs = preprocessor(sentences)
# feeding it to model for vectorization
outputs = encoder(inputs)

In [88]:
outputs['sequence_output'].shape

TensorShape([4657, 128, 512])

In [89]:
# defining dataframe!!!
temp=pd.DataFrame()

In [90]:
for i in range(0,len(outputs['sequence_output'])):
  b=outputs['sequence_output'][i].numpy().sum(axis=0)
  temp=temp.append(pd.Series(b),ignore_index=True)
print('values added in dataframe')
     

  temp=temp.append(pd.Series(b),ignore_index=True)
  temp=temp.append(pd.Series(b),ignore_index=True)


values added in dataframe


In [92]:
temp.to_csv( "bertFeatures.csv", index=False, encoding='utf-8-sig')

# Import Bert features

In [93]:
bert_features = pd.read_csv("bertFeatures.csv")

In [94]:
bert_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,34.064705,22.735920,-76.582850,-54.817326,-36.221695,3.004882,32.512608,-11.660388,-22.189047,-14.774579,...,-58.249287,-40.098870,-51.064007,43.838610,40.184414,46.617764,57.307360,79.453125,67.457970,60.330944
1,43.899307,50.765804,-78.818410,-4.097647,-40.782326,-9.905100,44.566395,-6.893873,-37.779880,-19.604008,...,-68.393990,-12.560373,-69.058846,53.500942,9.692698,25.433218,40.799380,131.952440,55.487823,44.115578
2,24.891432,60.095640,-52.281883,-26.142065,-70.368670,24.526697,36.205227,-14.707609,-4.904492,-12.611858,...,-74.690920,-21.309294,-92.245476,50.616234,12.086532,27.438631,38.936546,30.548628,21.183413,70.788670
3,34.096478,70.980110,-72.184326,-49.600845,-36.909424,-5.688858,13.047146,15.227018,-31.379810,-45.028973,...,-67.968810,-30.079834,-63.493378,41.449932,-11.426052,54.869556,42.769337,90.878685,68.338590,47.883766
4,29.388363,61.366850,-70.836610,-66.867935,-20.526585,5.349851,11.215246,-4.479660,-36.570850,-9.955334,...,-91.066350,-39.021122,-81.629020,37.976974,-0.961639,76.855940,49.196910,42.216510,67.465775,77.140920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4652,29.080410,48.350160,-92.401460,-37.765910,-52.237633,24.671877,30.593655,23.596363,-30.915812,-17.476912,...,-85.544730,-19.823086,-63.061943,58.735950,-36.939617,39.063965,22.339619,45.772694,31.699293,40.172460
4653,19.129667,37.764442,-41.877934,30.918592,-55.779533,-7.655437,24.861279,3.303953,-93.898900,25.722233,...,-73.550820,-40.980957,-51.615788,38.190166,32.322105,23.301435,37.389750,101.013750,67.409850,9.669949
4654,27.993220,46.849545,-52.659748,-13.563160,-48.469765,22.498436,46.433445,26.110325,-25.438171,-29.787111,...,-85.879850,-25.784689,-47.455830,54.880510,8.015970,41.336716,12.458059,102.638890,21.027193,46.415066
4655,1.865901,18.807083,-28.955233,-30.521412,-21.315687,-13.949568,25.535060,3.473431,-40.239048,-18.172094,...,-78.159840,-12.696408,-51.373646,56.125122,-2.669002,43.187996,48.552357,145.647660,39.892544,22.021894


In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# random forest model
X=bert_features
y=data['genre']
# # defining classifier
clf=RandomForestClassifier()
# # 10 fold cross validation
scores = cross_val_score(clf, X, y, cv=10)

In [98]:
import numpy as np
print('random forest results ->', np.mean(scores))

random forest results -> 0.5031164336148415


In [99]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [100]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

res=clf.predict(X_test)
accuracy_score(y_test, res)

0.5092989985693849

In [105]:
## SVC
from sklearn import svm

clf=svm.SVC()
# 10 fold cross validation
SVCscores = cross_val_score(clf, X, y, cv=10)

In [107]:
print('svc results-->', np.mean(SVCscores))

svc results--> 0.5750473025981817
