In [4]:
%%capture
from pathlib import Path
import pandas as pd
import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import json

In [2]:
DATA = Path.cwd().parent / 'data'
SUMM_FOLDER = DATA / 'summaries_finetune'
TEXT_FILES = SUMM_FOLDER / 'text_files_copy'
SOURCE_TEXTS = SUMM_FOLDER / 'source_texts_clean'

In [20]:
data = open(DATA / 'source_dict.txt', 'r')
source_dict = json.loads(data.read())
textbook_df = pd.DataFrame(list(source_dict.values()))
textbook_df.columns=['text']

In [21]:
summaries_df = pd.read_csv(SUMM_FOLDER / 'final_summaries_ai_aloe_fixed.csv', index_col=False)[['text']]
all_text_df = pd.concat([textbook_df, summaries_df]).reset_index()

Unnamed: 0,index,text
0,0,"By the end of this section, you will be able t..."
1,1,"By the end of this section, you will be able t..."
2,2,"By the end of this section, you will be able t..."
3,3,"By the end of this section, you will be able t..."
4,4,"By the end of this section, you will be able t..."
...,...,...
4779,4685,The results from many studies indicate that vi...
4780,4686,People are being advised to spend less time in...
4781,4687,We are thus in a situation where people are re...
4782,4688,"There are two types of cancers, melanoma and b..."


In [23]:
import nltk
from nltk.corpus import stopwords

def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

In [24]:
all_tokenized = all_text_df['text'].apply(lambda t: tokenize_text(t)).to_frame()
all_tagged = all_tokenized.apply(lambda r: TaggedDocument(words=r['text'], tags='text'), axis=1)

In [25]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0)
model_dbow.build_vocab([x for x in tqdm(all_tagged)])
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_tagged.values)]), total_examples=len(all_tagged.values), epochs=1)
# for i in range(len(text_tokenized)):
#     text_vector = model_dbow.infer_vector(text_tokenized.iloc[i]['text'])
#     source_vector = model_dbow.infer_vector(source_tokenized.iloc[i]['text'])
#     cos_similarities.append(1 - spatial.distance.cosine(text_vector, source_vector))
# df['doc2vec_cos'] = cos_similarities
# return df

100%|██████████| 4784/4784 [00:00<00:00, 1489426.24it/s]
100%|██████████| 4784/4784 [00:00<00:00, 4833907.57it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1312417.45it/s]
100%|██████████| 4784/4784 [00:00<00:00, 2107725.88it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1598593.88it/s]
100%|██████████| 4784/4784 [00:00<00:00, 3649609.01it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1631478.20it/s]
100%|██████████| 4784/4784 [00:00<00:00, 3708974.18it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1630550.17it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1630815.21it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1573151.73it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1612079.24it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1577976.59it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1630947.76it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1609622.20it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1649720.49it/s]
100%|██████████| 4784/4784 [00:00<00:00, 1588721.33it/s]
100%|██████████| 4784/4784 [00:

## Loading the model from file and using it for inference

In [37]:
%%capture
!pip install gradio
import gradio as gr
from scipy import spatial

In [55]:
import nltk
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from scipy import spatial

# Load the source_dict
data = open(DATA / 'source_dict.txt', 'r')
source_dict = json.loads(data.read())

# Load the model
model_path = '../bin/doc2vec_model'
model = Doc2Vec.load(model_path)

# A function to tokenize the text
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

# A function to get the score
def getSimilarity(summary, chapter):
    summary_embedding = model.infer_vector(tokenize_text(summary))
    section_embedding = model.infer_vector(tokenize_text(chapter))
    return 1 - spatial.distance.cosine(summary_embedding, section_embedding)


# Here it is in practice
section1summary = "Economics seeks to solve the problem of scarcity, which is when human wants for goods and services exceed the available supply. A modern economy displays a division of labor, in which people earn income by specializing in what they produce and then use that income to purchase the products they need or want. The division of labor allows individuals and firms to specialize and to produce more for several reasons: a) It allows the agents to focus on areas of advantage due to natural factors and skill levels; b) It encourages the agents to learn and invent; c) It allows agents to take advantage of economies of scale. Division and specialization of labor only work when individuals can purchase what they do not produce in markets. Learning about economics helps you understand the major problems facing the world today, prepares you to be a good citizen, and helps you become a well-rounded thinker." 
off_topic = "Sometimes football is not beautiful. Sometimes doing just enough, for just long enough, can lay the foundation, open the door. So it was as a counter-attack ended Australia’s agony and Denmark’s World Cup campaign. It took 60 minutes for Mat Leckie to score, an endless hour of mostly last-ditch defending and some positive moments, and a goal from Tunisia against France that meant the Socceroos would have to win or be out themselves.In the end it was Denmark’s second-half impotence that ended their tournament prematurely, their early brightness dissipating in the face of a Socceroos side which left it late but rallied when it had to and once again displayed a level of quality belying their inexperience."
print(getScore(section1summary, source_dict['01-1']))

0.6679088473320007


0.6583132743835449


In [None]:

# demo = gr.Interface(
#     fn=getScore,
#     inputs=[gr.Textbox(lines=2, placeholder="Summary..."), gr.Dropdown(label = "Chapter", choices = list(source_dict.keys())),],
#     outputs=[gr.Number(label = "Wording Score"), gr.Number(label="Content Score")],
#     title="Automatic Summary Scorer",
#     description="Automatic Summary Scorer for OpenStax Macroeconomics Textbook",
#     article="This is an app which provides two scores for summaries of chapters in the OpenStax textbook on Macroeconomics. The source text can be found at https://openstax.org/books/principles-macroeconomics-ap-courses-2e/pages/1-key-concepts-and-summary"
# )

# demo.launch()