In [12]:
import re
import string

from rake_nltk import Metric,Rake
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from summa import keywords

# Load Text of Book

In [2]:
fname="/home/ghavimbm/notebook/test/data/book/War_and_Peace_Book_One.txt"
with open(fname) as f:
    content = f.readlines()

# Data Preprocessing

In [3]:
if "About this digital edition\n" in content:
    content=content[:content.index("About this digital edition\n")]
if "CHAPTER I\n" in content:
    content=content[content.index("CHAPTER I\n"):]   
content=list(filter(lambda a: a != '\n', content))
text, _ = re.subn('\s+', ' ', " ".join(content))
text = text.strip()
sentences = sent_tokenize(text, language = "english")
sentences_new=[]
for item in sentences:
    text1 = item.translate(str.maketrans('','',string.punctuation))
    sentences_new.append(text1)

In [4]:
stop_words = set(stopwords.words('english'))
stop_words.update(["young","old","said","dont","whatever",'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'may', 'also', 'across','among', 'beside', 'however', 'yet', 'within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

# KeyWord Extraction Based on Rake Library

In [5]:
r = Rake(min_length=1, max_length=2,stopwords=stop_words,ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)

In [6]:
r.extract_keywords_from_sentences(sentences_new)

In [7]:
r.get_ranked_phrases()

['ze sanctity',
 'ze empire',
 'youthful egotism',
 'worthier sweeter',
 'womenmostly motherswho',
 'wide sweep',
 'wide courtyard',
 'wicked trick',
 'wellknown grandee',
 'wellgarnished joint',
 'warm lightwere',
 'vile succeed',
 'uncut book',
 'unamiable outburst',
 'twitching violently',
 'twitching increased',
 'twentyfiveruble note',
 'turkish pistols',
 'turkish pipes',
 'tsar enters',
 'tremendous fortune',
 'traveling effects',
 'tiny lamps',
 'thanks thanks',
 'tenacious endurance',
 'teachers fault',
 'tawny tint',
 'tact habitual',
 'sympathetic squeeze',
 'sweetest comforts',
 'sublime rules',
 'striking feature',
 'strict secrecy',
 'strict formality',
 'stared fixedly',
 'spinning mill',
 'spasski hill',
 'sparrow hills',
 'someone unseen',
 'solemnly conducting',
 'soft steps',
 'soft paws',
 'social topics',
 'social experience',
 'social distinctions',
 'smoke spasmodically',
 'smoke ring',
 'smilingly inclining',
 'smelt strongly',
 'sloping ledge',
 'single star',


In [8]:
r.get_ranked_phrases_with_scores()

[(4.0, 'ze sanctity'),
 (4.0, 'ze empire'),
 (4.0, 'youthful egotism'),
 (4.0, 'worthier sweeter'),
 (4.0, 'womenmostly motherswho'),
 (4.0, 'wide sweep'),
 (4.0, 'wide courtyard'),
 (4.0, 'wicked trick'),
 (4.0, 'wellknown grandee'),
 (4.0, 'wellgarnished joint'),
 (4.0, 'warm lightwere'),
 (4.0, 'vile succeed'),
 (4.0, 'uncut book'),
 (4.0, 'unamiable outburst'),
 (4.0, 'twitching violently'),
 (4.0, 'twitching increased'),
 (4.0, 'twentyfiveruble note'),
 (4.0, 'turkish pistols'),
 (4.0, 'turkish pipes'),
 (4.0, 'tsar enters'),
 (4.0, 'tremendous fortune'),
 (4.0, 'traveling effects'),
 (4.0, 'tiny lamps'),
 (4.0, 'thanks thanks'),
 (4.0, 'tenacious endurance'),
 (4.0, 'teachers fault'),
 (4.0, 'tawny tint'),
 (4.0, 'tact habitual'),
 (4.0, 'sympathetic squeeze'),
 (4.0, 'sweetest comforts'),
 (4.0, 'sublime rules'),
 (4.0, 'striking feature'),
 (4.0, 'strict secrecy'),
 (4.0, 'strict formality'),
 (4.0, 'stared fixedly'),
 (4.0, 'spinning mill'),
 (4.0, 'spasski hill'),
 (4.0, 'spa

# Save Result into txt Files

In [10]:
with open('./Data/Rake_Keyword_Extraction.txt', 'w') as f:
    for item in r.get_ranked_phrases():
        f.write("%s\n" % item)
        
with open('./Data/Rake_Keyword_Extraction_Score.txt', 'w') as f:
    for item in r.get_ranked_phrases_with_scores():
        f.write("%s\n" % str(item))

# KeyWord Extraction base on Summa Library

In [10]:
print(keywords.keywords(text))

pierre
prince
princes
smile
smiled
smilingly
looked
looking
look
looks
said suffering
princess
princesse
princesses
anna
face
faces
facing
chapter
boris
bory
paused smiling
little
man
distinct smiles
andrew
suddenly
sudden
evidently
evident
room
rooms
eyes
count
counting
counted
natasha
expression
expressed
express
expressing
expressions
ask
asked
asking
asks
countess
continual
continued
continue
continuing
continually
continuously
continuous
dear
dearly
rounded
voice
voices
came
young
vasili
mikhaylovna
hand
hands
handed
handing
yes
coming
come
comes
turned
turning
turn
turns
guests began
reply
likely
liked
likes
glance
glancing
glanced
glances
ah
times like
nicholas
great
greatness
greatly
talk
talking
talked
talks
time
pavlovna
good
goodness
lightly
light
lights
lighted
sonya
stood
dolokhov
went
arm
arms
son
sons
played round
things
thing
head
heads
watchful eye
grown old
large
letting
let
french
listen
listening
listened
listener
anatole
marya
lips
lip
dressed
dresses
dressing
fath

# Save Result into txt Files

In [13]:
with open('./Data/Suma_Keyword_Extraction.txt', 'w') as f:
    for item in keywords.keywords(text):
        f.write("%s\n" % str(item))

# KeyWord Extraction based Tf-IDf

In [14]:
sentences_new_split=[]
for item in sentences_new:
    item=removeStopWords(item)
    sentences_new_split.append(item)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#eliminate stop words
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(2, 4), norm='l2', min_df=10,max_df=0.4)
vectorizer.fit(sentences_new_split)
feature_array = np.array(vectorizer.get_feature_names())
response = vectorizer.transform(sentences_new)
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

n = 50
top_n = feature_array[tfidf_sorting][:n]


In [19]:
with open('./Data/TF-IDF.txt', 'w') as f:
    for item in top_n:
        f.write("%s\n" % str(item))

In [13]:
top_n

array(['little princess', 'princess mary', 'prince vasili',
       'prince hippolyte', 'prince andrew', 'monsieur pierre',
       'marya dmitrievna', 'mademoiselle bourienne', 'let us',
       'drawing room', 'cyril vladimirovich', 'anna pavlovnas',
       'anna pavlovna', 'anna mikhaylovna', 'princess mary',
       'prince vasili', 'prince hippolyte', 'prince andrew',
       'monsieur pierre', 'marya dmitrievna', 'mademoiselle bourienne',
       'little princess', 'let us', 'drawing room', 'cyril vladimirovich',
       'anna pavlovnas', 'anna pavlovna', 'anna mikhaylovna',
       'princess mary', 'prince vasili', 'prince hippolyte',
       'prince andrew', 'monsieur pierre', 'marya dmitrievna',
       'mademoiselle bourienne', 'little princess', 'let us',
       'drawing room', 'cyril vladimirovich', 'anna pavlovnas',
       'anna pavlovna', 'anna mikhaylovna', 'prince andrew',
       'princess mary', 'prince vasili', 'prince hippolyte',
       'monsieur pierre', 'marya dmitrievna', '