In [12]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
from pathlib import Path
!pip install -U transformers

WD = Path.cwd().parent



In [13]:
sections_frame = WD / 'data' / 'sections_frame.csv'

if not sections_frame.exists():
    with open (WD / 'bin' / 'chapter_urls.json', 'r') as f:
        chapter = json.load(f)
    with open (WD / 'bin' / 'summary_urls.json', 'r') as f:
        summary_urls = json.load(f)
        
    def getText(URL):
        r = requests.get(URL)
        soup = BeautifulSoup(r.text, 'html.parser')
        all_text = soup.get_text().lstrip('Learning Objectives').rstrip('PreviousNext')
        selection_text = re.sub('PreviousNext.+', '', all_text, flags=re.DOTALL)
        selection_text = re.sub('.+Learning Objectives', '', selection_text, flags=re.DOTALL)
        return(selection_text)
    
    sections_dict = {}
    
    sections_dict['section'] = [i for i in chapter_urls]
    sections_dict['text'] = [getText(chapter_urls[i]) for i in chapter_urls]

    sections_df = pd.DataFrame.from_dict(sections_dict)
    sections_df['chapter'] = sections_df['section'].apply(lambda x: x.split('-')[0]).astype(int)
    sections_df = sections_df[['chapter', 'section', 'text']]
    
    def getSummary(row):
        url = summary_urls[row.chapter]
        html_content = requests.get(url).text
        soup = BeautifulSoup(html_content, 'html.parser')
        texts = soup.find_all('p')
        text_list = [text.get_text() for text in texts]
        section_num = int(row['section'].split('-')[1])
        return text_list[section_num-1]
 
    sections_df['textbook_summary'] = sections_df.apply(lambda row: getSummary(row), axis = 1)
    
    sections_df.to_csv(sections_frame, index=False)

else:
    sections_df = pd.read_csv(sections_frame)

In [14]:
sections_df

Unnamed: 0,chapter,section,text,textbook_summary
0,1,01-1,"By the end of this section, you will be able t...",Economics seeks to solve the problem of scarci...
1,1,01-2,"By the end of this section, you will be able t...",Microeconomics and macroeconomics are two diff...
2,1,01-3,"By the end of this section, you will be able t...",Economists analyze problems differently than d...
3,1,01-4,"By the end of this section, you will be able t...","We can organize societies as traditional, comm..."
4,2,02-1,"By the end of this section, you will be able t...",Economists see the real world as one of scarci...
...,...,...,...,...
89,21,21-1,"By the end of this section, you will be able t...",There are three tools for restricting the flow...
90,21,21-2,"By the end of this section, you will be able t...","As international trade increases, it contribut..."
91,21,21-3,"By the end of this section, you will be able t...",In thinking about labor practices in low-incom...
92,21,21-4,"By the end of this section, you will be able t...",There are a number of arguments that support r...


## Get T5

In [15]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
#from rouge_score import rouge_scorer

model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary").to('cuda')
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")

text = sections_df['text'][0].encode('ascii', 'ignore').decode()
#scorer = rouge_scorer.RougeScorer(['rougeLsum'], use_stemmer=True)

In [14]:
def summarizeText(text, num_beams):
    inputs = tokenizer("summarize: " + text, return_tensors="pt", truncation=False).to('cuda')
    outputs = model.generate(
        inputs["input_ids"],
        max_length=200,
        min_length=50,
        repetition_penalty=.8,
        # length_penalty=2,
        # early_stopping=True,
        num_beams=num_beams,
        do_sample=True,
        # top_k=50, 
        # top_p=0.95, 
        num_return_sequences=10,
        # temperature = 0.8,
    )
    return outputs

def getRouge(textbook_summary, T5_summary):
    precision, recall, fmeasure = scorer.score(textbook_summary, T5_summary)['rougeLsum']
    return fmeasure

In [9]:
output=summarizeText(text)

In [11]:
for i in output:
    print(tokenizer.decode(i, skip_special_tokens=True))

In this chapter, students will learn how to analyze and interpret economic problems in order to make informed decisions. In order to understand economics, they must first understand the concept of "division and specialization" of labor. Adam Smith, the first scholar of economics in 1776, first proposed this concept in his book The World of Nations. In his book, Smith explains that people have different skills and interests so that they can better perform certain tasks than others. In other words, people can be better at certain jobs than others because they have different talents and interests. For instance, people who specialize in a certain task can learn to produce faster and with better quality than those who only produce a few things. If a business concentrates on one product, it will be more successful than if it produces a wide variety of products. If, on the other hand, it produces many different products, the cost of producing one product will be lower than that of
In this cha

In [18]:
section_num = 1
summaries_dict = {}
# for text in sections_df[sections_df['chapter'] == 1]:
#     outputs = summarizeText(text)
#     summaries = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
#     summaries_dict[section_num] = summaries
#     section_num +=1

for row in sections_df[sections_df['chapter'] ==  1].iterrows():                                                    
    outputs = summarizeText(row[1]['text'], num_beams = section_num)
    summaries = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
    summaries.append(row[1]['textbook_summary'])
    summaries_dict[section_num] = summaries 
    section_num += 1

In [20]:
pd.DataFrame.from_dict(summaries_dict).to_csv('../data/summaries_df.csv')

In [16]:
chapter1_df = sections_df[sections_df['chapter'] == 1]

In [17]:
for i in range(10):
    print('generating summary using', str(i+1), 'beams')
    summary_name = 'summary'+str(i+1)
    rouge_name = 'rougeF1'+str(i+1)
    chapter1_df[summary_name] = chapter1_df.apply(lambda row: summarizeText(row['text'], num_beams=i+1), axis=1)
    chapter1_df[rouge_name] = chapter1_df.apply(lambda row: getRouge(row['textbook_summary'], row[summary_name]), axis=1)

generating summary using 1 beams


TypeError: summarizeText() got an unexpected keyword argument 'beams'

In [None]:
chapter1_df.to_csv('../data/chapter1_summaries.csv')

In [None]:
chapter1_df

In [None]:
precision, recall, fmeasure = scorer.score(chapter1_df.iloc[0]['textbook_summary'], chapter1_df.iloc[0]['summary10'])['rougeLsum']

fmeasure