In [1]:
import re
import string

import nltk
import numpy as np
import pandas as pd
import networkx as nx

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook as tqdm

# Load Text of Book

In [2]:
fname="/home/ghavimbm/notebook/test/data/book/War_and_Peace_Book_One.txt"
with open(fname) as f:
    content = f.readlines()

# Data Preprocessing

In [3]:
if "About this digital edition\n" in content:
    content=content[:content.index("About this digital edition\n")]
if "CHAPTER I\n" in content:
    content=content[content.index("CHAPTER I\n"):]   
content=list(filter(lambda a: a != '\n', content))
text = " ".join(content).lower()
text, _ = re.subn('\s+', ' ', text)
text = text.strip()

In [4]:
sentences = sent_tokenize(text, language = "english")

In [5]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'may', 'also', 'across','among', 'beside', 'however', 'yet', 'within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

In [6]:
def preprocess_text(sentences):
    ls_items=[]
    for item in sentences:
        text = removeStopWords(item.translate(str.maketrans('','',string.punctuation)))
        text, _ = re.subn('\s+', ' ', text)
        ls_items.append(text)
    return ls_items

In [7]:
clean_sentences=preprocess_text(sentences)

# Load pretrained Glove Wordembeding representation

In [8]:
word_embeddings = {}
f = open('/home/ghavimbm/notebook/test/glove_we/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

# Generate Vector of sentences based on wordembeding

In [9]:
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

# Similarity Matrix

In [10]:
sim_mat = np.zeros([len(sentences), len(sentences)])
for i in tqdm(range(len(sentences))):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

HBox(children=(IntProgress(value=0, max=3005), HTML(value='')))




# PageRank Algorithm

In [11]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [12]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

# Sumerized Text based on textRank algorithm, Select top 10 sentence

In [15]:
textRankSumery=[]
for i in range(10):
    textRankSumery.append(ranked_sentences[i][1])
    print(ranked_sentences[i][1]+"\n")

she evidently considered it proper to show an interest in the general conversation by smiling, but in spite of herself her eyes under their thick long lashes watched her cousin who was going to join the army, with such passionate girlish adoration that her smile could not for a single instant impose upon anyone, and it was clear that the kitten had settled down only to spring up with more energy and again play with her cousin as soon as they too could, like natasha and boris, escape from the drawing room.

halfway through supper prince andrew leaned his elbows on the table and, with a look of nervous agitation such as pierre had never before seen on his face, began to talk--as one who has long had something on his mind and suddenly determines to speak out.

pierre heard her say: "certainly he must be moved onto the bed; here it will be impossible..." the sick man was so surrounded by doctors, princesses, and servants that pierre could no longer see the reddish-yellow face with its gray

In [26]:
with open('./Data/textRankSumery.txt', 'w') as f:
    for item in textRankSumery:
        f.write("%s\n" % item)

# Sumerization based on TF-IDF

In [16]:
ls_senteces=[]
for i in range(0,len(sentences)):
    ls_senteces.append((sentences[i],clean_sentences[i]))

In [17]:
df_sentences=pd.DataFrame(ls_senteces)

In [18]:
df_sentences.columns=["sentence","clean_sentence"]

In [19]:
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 1), norm='l2', min_df=2,max_df=0.6)
vectorizer.fit(list(df_sentences["clean_sentence"]))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [20]:
df_sentences["tf_idf"]=df_sentences["clean_sentence"].apply(lambda x:vectorizer.transform([x]))

In [21]:
df_sentences["mean_tf_idf"]=df_sentences["tf_idf"].apply(np.mean)

In [22]:
df_sentences.sort_values("mean_tf_idf",ascending=False,inplace=True)

In [23]:
df_sentences.reset_index(drop=True,inplace=True)

In [27]:
with open('./Data/tf_idf_summery.txt', 'w') as f:
    for item in list(df_sentences[:10]["sentence"]):
        f.write("%s\n" % item)

In [28]:
list(df_sentences[:10]["sentence"])

['as soon as he had seen a visitor off he returned to one of those who were still in the drawing room, drew a chair toward him or her, and jauntily spreading out his legs and putting his hands on his knees with the air of a man who enjoys life and knows how to live, he swayed to and fro with dignity, offered surmises about the weather, or touched on questions of health, sometimes in russian and sometimes in very bad but self-confident french; then again, like a man weary but unflinching in the fulfillment of duty, he rose to see some visitors off and, stroking his scanty gray hairs over his bald patch, also asked them to dinner.',
 'pierre wished to make a remark, for the conversation interested him, but anna pavlovna, who had him under observation, interrupted: "the emperor alexander," said she, with the melancholy which always accompanied any reference of hers to the imperial family, "has declared that he will leave it to the french people themselves to choose their own form of gover