In [None]:
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import matplotlib.pyplot as plt
import pandas as pd
nltk.download('punkt')


In [None]:
#download necessary library from nltk 
nltk.download('stopwords')
nltk.download('punkt')

In [13]:
book_data = pd.read_csv('../Data/duke_books.csv')
book_data.head()

#see how many NA's are in the summary column 
book_data['Summary'].isna().sum()

#drop NA values for now
book_data = book_data.dropna(subset = ['Summary']).reset_index(drop = True)

#look at data head
book_data.head()

Unnamed: 0,Title,Location,Authors,Summary,Published,Language,System Details,Notes,Description,Description Details,Genre,OCLC,Other Identifiers,System ID
0,1998 vital statistics of the United States : v...,Perkins Public Documents/Maps,National Center for Health Statistics (U.S.),"This CD ROM contains chiefly tables, a guide t...","[Washington, D.C.?] : Dept. of Health and Huma...",English,System requirements for Windows: IBM-compatibl...,"Title from title screen.ISO 9660 format.""Shipp...",1 CD-ROM ; 4 3/4 in.,Dimensions: 4 3/4 in.Color characteristics: po...,"TablesStatistics, Vital",49537646.0,GPO Item Number: 0510,3072700
1,12th Education and Training in Optics and Phot...,,,Proceedings of SPIE present the original resea...,"[S.l.] : [s.n.], 9999.",,,Title from content provider.,1 online resource,,Electronic books,,,8891635
2,13th International Scientific Conference on Op...,,,Proceedings of SPIE present the original resea...,"[S.l.] : [s.n.], 9999.",,,Title from content provider.,1 online resource,,Electronic books,,,8891641
3,16th International Workshop on Physics of Semi...,,,Proceedings of SPIE present the original resea...,"[S.l.] : [s.n.], 9999.",,,Title from content provider.,1 online resource,,Electronic books,,,8891649
4,17th-18th century Burney Collection newspapers...,,"Burney, Charles, 1757-1817",Searchable full-text access to the British Lib...,"[Farmington Hills, Mich.] : Gale Cengage Learn...",English,,,1 online resource,Color characteristics: polychromeFile type: te...,SourcesIndexesOnline databasesElectronic refer...,182626961.0,LCCN: 2013238254,3975405


In [14]:
#many summaries are short and so we don't need to summarize them, lets make a flag column for summary being > 100 words 
def word_count(text): 
    words = text.split()
    return len(words)

#create word count column
book_data['word_count'] = book_data['Summary'].apply(word_count)

In [42]:
#filter for books with longer summaries
book_long = book_data[book_data['word_count'] >= 100] 

#drop everything but title and summary 
book_long = book_long.loc[:, ['Title', 'Summary']]

#generate full combinations of title and text
book_long['full_text'] = book_long.apply(lambda x: ' '.join([x['Title'],x['Summary']]),axis=1)

book_long['full_text'] = book_long['full_text'].astype(str)

#see how many observations we have
display(len(book_long))
display(book_long.head())



print(book_long.iloc[0][2])

2755

Unnamed: 0,Title,Summary,full_text
119,"African American newspapers, 1827-1998. - Duke...",Provides access to U.S. newspapers chronicling...,"African American newspapers, 1827-1998. - Duke..."
130,All the world's primates. - Duke University Li...,All the World's Primates is the comprehensive ...,All the world's primates. - Duke University Li...
137,The American bench. - Duke University Librarie...,Court profiles on both federal and state court...,The American bench. - Duke University Librarie...
140,American Law Institute library. - Duke Univers...,The American Law Institute library on HeinOnli...,American Law Institute library. - Duke Univers...
172,Archives of sexuality & gender. LGBTQ history ...,As part of the Archives of Sexuality & Gender ...,Archives of sexuality & gender. LGBTQ history ...


African American newspapers, 1827-1998. - Duke University Libraries Catalog Provides access to U.S. newspapers chronicling a century and a half of the African American experience. Includes historically significant papers from more than 35 states and features many rare 19th-century titles. Titles in Series 1 come from the Wisconsin Historical Society, Kansas State Historical Society and the Library of Congress, while titles in Series 2 come from the American Antiquarian Society, Center for Research Libraries, the Library of Congress, and New York Public Library. Covers life in the Antebellum South, growth of the Black church, the Jim Crow Era, the Great Migration, Harlem Renaissance, Civil Rights movement, and political and economic empowerment.


In [55]:
def extractive_summaries(full_text): 

    #tokenize sentences
    sentences = sent_tokenize(full_text)

    #strip alpha numeric characters and stopwords 
    sentences_processed = []
    for sentence in sentences:
        sentence_reduced = sentence.replace("[^a-zA-Z0-9_]", '')
        sentence_reduced = [word.lower() for word in sentence_reduced.split(' ') if word.lower() not in stopwords.words('english')]
        sentences_processed.append(' '.join(word for word in sentence_reduced))

    #create TFIDF feature vecs
    vectorizer = TfidfVectorizer()
    feature_vecs = vectorizer.fit_transform(sentences_processed)
    feature_vecs = feature_vecs.todense().tolist()

    # Create empty adjacency matrix
    adjacency_matrix = np.zeros((len(feature_vecs), len(feature_vecs)))
 
    # Populate the adjacency matrix using the similarity of all pairs of sentences
    for i in range(len(feature_vecs)):
        for j in range(len(feature_vecs)):
            if i == j: #ignore if both are the same sentence
                continue 
            adjacency_matrix[i][j] = 1 - cosine_distance(feature_vecs[1], feature_vecs[j])

    # Create the graph representing the document
    document_graph = nx.from_numpy_array(adjacency_matrix)

    # Apply PageRank algorithm to get centrality scores for each node/sentence
    scores = nx.pagerank(document_graph)
    scores_list = list(scores.values())

    # Sort and pick top sentences
    ranking_idx = np.argsort(scores_list)[::-1]
    ranked_sentences = [sentences[i] for i in ranking_idx]   

    summary = []
    top_n = 1
    for i in range(top_n):
        summary.append(ranked_sentences[i])

    summary = " ".join(summary)
    
    return summary



In [56]:

book_long['extractive_summary'] = book_long['full_text'].apply(extractive_summaries)



PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')