In [5]:
import pandas as pd
# Load the workbook, keeping only the columns at positions 1 and 2
# (presumably the URL and the term -- confirm against the spreadsheet),
# then drop the first and the last row and label every missing cell
# as 'Unknown'.
df = (
    pd.read_excel("Articles with Rules.xlsx", usecols=[1, 2])
    .iloc[1:-1]
    .fillna('Unknown')
)

# The following section works with data scraped from healthline and eatingwell (note: the method can handle scraped data of fewer than 4096 tokens)

In [2]:
# Split the articles by source site, based on a substring of the 'URL' column.
# Rows whose URL mentions healthline:
df1 = df.loc[lambda frame: frame['URL'].str.contains('healthline')]
# Rows whose URL mentions eatingwell:
df2 = df.loc[lambda frame: frame['URL'].str.contains('eatingwell')]

In [3]:
# Scrape the page title and the article body for every healthline URL in df1.
import requests
from bs4 import BeautifulSoup
# set up a header to inform website to avoid 418 error (you will need to change this setting)
headers = {"User-Agent":'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
results_healthline = []
title_healthline = []
for url in df1['URL']:
    # Start from placeholders so that exactly one entry per URL is appended
    # to each list, whatever happens below; the lists must stay aligned with
    # df1['URL'] when the result frame is built. 'NaN' is the same
    # placeholder the eatingwell cell uses.
    article_text = 'NaN'
    title_text = 'NaN'
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30).text
    except requests.RequestException as err:
        print("failed to fetch", url, "-", err)
    else:
        soup = BeautifulSoup(response, 'html.parser')
        # find the section of the page that contains the article text
        article_section = soup.find('article', {'class': 'article-body css-d2znx6 undefined'})
        title_section = soup.find('title')
        # find() returns None when nothing matches (for example when the
        # site changes its CSS class names), so check before reading .text.
        if article_section is not None and title_section is not None:
            article_text = article_section.text.strip()
            title_text = title_section.text.strip()
        else:
            print("article or title not found for", url)

    results_healthline.append(article_text)
    title_healthline.append(title_text)

# set up dataframe to store information
df_results1 = pd.DataFrame({'url': df1['URL'], 'title': title_healthline,
                            'data': results_healthline})
df_results1

Unnamed: 0,url,title,data
9,https://www.healthline.com/health/pregnancy/no...,7 Soups to Replenish and Rejuvenate the Postpa...,Share on PinterestBefore welcoming a new child...
28,https://www.healthline.com/health/pregnancy/di...,Healthy Diet During Pregnancy,Share on PinterestIf you’re concerned about wh...
42,https://www.healthline.com/health/pregnancy/po...,11 Best Postnatal Vitamins of 2023,Share on PinterestWe include products we think...
43,https://www.healthline.com/health/pregnancy/nu...,Nutritional Needs During Pregnancy,"Share on PinterestAs you probably know, your b..."
44,https://www.healthline.com/nutrition/13-foods-...,13 Foods to Eat When You're Pregnant,"While you’re pregnant, you’ll want to eat extr..."
45,https://www.healthline.com/health/pregnancy/be...,Fruits to Eat During Pregnancy: Nutritious Opt...,Eating nutritious food is important for you an...
46,https://www.healthline.com/health/pregnancy/se...,"Second Trimester Diet: Daily Requirements, Cra...",Share on PinterestCopyright: Dean MitchellWe i...
47,https://www.healthline.com/health/postpartum-diet,Postpartum Diet Plan: Tips for Healthy Eating ...,Share on PinterestIt’s no secret that the food...
48,https://www.healthline.com/health-news/eating-...,Pregnancy Diet and Risk of Low Birth Weight,Share on PinterestBoris Jovanovic/Stocksy Unit...
49,https://www.healthline.com/nutrition/supplemen...,Supplements During Pregnancy: What’s Safe and ...,"Share on PinterestIf you’re pregnant, you may ..."


In [4]:
# Scrape the page title and the article body for every eatingwell URL in df2.
import requests
from bs4 import BeautifulSoup
# setup a header to inform website to avoid 418 error
headers = {"User-Agent":'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
results_eatingwell = []
title_eatingwell = []

for url in df2['URL']:
    # Placeholders used when the page cannot be fetched or parsed. Both
    # values are resolved before anything is appended, so each list receives
    # exactly one entry per URL and the two lists can never drift apart.
    article_text = "NaN"
    title_text = 'NaN'
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30).text
        soup = BeautifulSoup(response, 'html.parser')
        # find the section of the page that contains the article text
        article_section = soup.find('div', {'class': 'loc article-content'})
        title_section = soup.find('title')
        # find() returns None when nothing matches; only accept the page
        # when both the article and the title were found.
        if article_section is not None and title_section is not None:
            article_text = article_section.text.strip()
            title_text = title_section.text.strip()
        else:
            print("article or title not found for", url)
    except requests.RequestException as err:
        # Only network-level failures are tolerated here; anything else
        # (including KeyboardInterrupt) propagates instead of being hidden.
        print("failed to fetch", url, "-", err)
    results_eatingwell.append(article_text)
    title_eatingwell.append(title_text)
# set up dataframe to store information
df_results2 = pd.DataFrame({'url': df2['URL'], 'title': title_eatingwell,
                            'data': results_eatingwell})
df_results2

Unnamed: 0,url,title,data
33,https://www.eatingwell.com/article/290540/what...,What to Eat When You're Pregnant: First Trimester,"Welcome to the first trimester of pregnancy, c..."
34,https://www.eatingwell.com/article/7900675/top...,"Top 10 Pregnancy Superfoods, According to Diet...",We independently evaluate all recommended prod...
35,https://www.eatingwell.com/article/290541/what...,What to Eat When You're Pregnant: Second Trime...,Congratulations! You made it to the second tri...
36,https://www.eatingwell.com/article/290598/what...,What a Healthy Day of Pregnancy Eating Looks L...,"It's the third trimester, and you're in the ho..."
37,https://www.eatingwell.com/article/15660/healt...,,
38,https://www.eatingwell.com/article/290403/food...,Foods for Morning Sickness,It could be a faint whiff of cilantro in a sal...


In [None]:
# Stack the healthline and eatingwell results row-wise into a single frame
# and write it to disk without the index column.
df_final = pd.concat((df_results1, df_results2), axis='index')
df_final.to_csv('scraped_data.csv', index=False)

In [None]:
import openai

In [None]:
# Provide your API key through the OPENAI_API_KEY environment variable so the
# secret is never saved inside the notebook. When the variable is not set the
# key stays an empty string, as before.
import os
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
def reduce_long_blanks(content):
    """Collapse whitespace in scraped text and remove backslashes.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space
    and leading/trailing whitespace is dropped. Newlines are turned into a
    space rather than deleted, so the words on either side of a line break
    are not fused together.

    Args:
        content: The scraped text, as a string.

    Returns:
        The cleaned text on a single line; an empty string for empty or
        whitespace-only input.
    """
    # Remove backslashes first so that a lone backslash between two blanks
    # does not leave a double space behind.
    without_backslashes = content.replace('\\', '')
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, which is exactly the normalisation wanted here.
    return ' '.join(without_backslashes.split())

In [None]:
# Print the word-token count of each scraped article; the article text is
# read from the third column (position 2) of df_final.
for article in df_final.iloc[:, 2]:
    cleaned = reduce_long_blanks(article)
    tokens = nltk.word_tokenize(cleaned)
    num_tokens = len(tokens)
    print("number of tokens", num_tokens)

In [None]:
# Use this cell if you notice that the number of tokens exceeds 4096:
# long articles are summarized chunk by chunk, and the partial summaries are
# then summarized once more into the final summary.
summary = []
import nltk

# Maximum number of word tokens sent to the model in one request. Named
# MAX_CHUNK_TOKENS rather than `max` so the built-in max() is not shadowed.
# NOTE(review): nltk word tokens are not the model's own tokens, so this is
# only an approximation of the real prompt size -- confirm the margin.
MAX_CHUNK_TOKENS = 2750
# At most this many chunks of one article are summarized.
MAX_CHUNKS = 4
# Prompt prefix for each chunk position (first, second, third, last).
CHUNK_PROMPT_PREFIXES = [
    "Summarize this for me: ",
    "Here is the second part for summarization: ",
    "Here is the third part for summarization: ",
    "Here is the last part for summarization: ",
]


def _request_summary(prompt):
    """Send one prompt to the completion endpoint and return the stripped text.

    NOTE(review): the legacy Completion endpoint and 'text-davinci-003' may
    no longer be available in current openai releases -- confirm.
    """
    response = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        max_tokens=400,
        temperature=0.3,
        n=1,
        stop=None,
    )
    return response.choices[0].text.strip()


for i in range(len(df_final)):
    # get article title and content
    article_title = df_final.iloc[i, 1]
    article_content = df_final.iloc[i, 2]

    # reduce blanks
    reduced_content = reduce_long_blanks(article_content)
    # tokenize the content
    tokens = nltk.word_tokenize(reduced_content)

    if len(tokens) > MAX_CHUNK_TOKENS:
        # Everything past MAX_CHUNKS chunks is not summarized; say so
        # instead of dropping it silently.
        if len(tokens) > MAX_CHUNK_TOKENS * MAX_CHUNKS:
            print("warning: article", i, "has", len(tokens),
                  "tokens; only the first", MAX_CHUNK_TOKENS * MAX_CHUNKS,
                  "are summarized")
        # sub-summaries, one per chunk
        small_summary = []
        for count in range(MAX_CHUNKS):
            start = count * MAX_CHUNK_TOKENS
            chunk_tokens = tokens[start: start + MAX_CHUNK_TOKENS]
            if not chunk_tokens:
                # ran past the end of the article
                break
            scraped_data = "content is " + " ".join(chunk_tokens)
            # Every chunk is sent as its own, separate request.
            prompt = CHUNK_PROMPT_PREFIXES[count] + scraped_data
            if count == 0:
                prompt += ". I will send the next part later."
            small_summary.append(_request_summary(prompt))
        # summarize again for all short summaries
        small_summary_tostr = ' '.join(small_summary)
        all_prompt = "Summarize this for me: " + small_summary_tostr
        summary.append(_request_summary(all_prompt))
    else:
        # combine title and content
        scraped_data = "title is " + article_title + ", and content is " + reduced_content
        prompt = "Summarize this for me:" + scraped_data
        summary.append(_request_summary(prompt))

# create a dataframe for summaries
df_summary = pd.DataFrame(summary, columns=["Summary"])
# drop off index of original dataframe so both frames line up row by row
df_final = df_final.reset_index(drop=True)
# combine dataframes
df_gpt_summary = pd.concat([df_final, df_summary], ignore_index=True, axis=1)
df_gpt_summary.columns = ["Url", "Title", "Data", "Summary"]
# the raw article text is not exported
df_gpt_summary = df_gpt_summary.drop(columns="Data")
df_gpt_summary.to_excel('summary.xlsx')

# The following section works with data scraped from parents and romper (note: the method only handles scraped data of fewer than 4096 tokens)

In [None]:
# Split the articles by source site, based on a substring of the 'URL' column.
# Rows whose URL mentions parents:
df3 = df.loc[lambda frame: frame['URL'].str.contains('parents')]
# Rows whose URL mentions romper:
df4 = df.loc[lambda frame: frame['URL'].str.contains('romper')]

In [None]:
# Scrape the page title and the article body for every parents URL in df3.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like User-Agent header sent with every request.
headers = {"User-Agent":'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

results_parents = []
title_parents= []
for url in df3['URL']:
    # Placeholders recorded when the page cannot be fetched or the expected
    # elements are missing; one entry per URL keeps the lists aligned with
    # df3['URL'].
    article_text = "None"
    title_text = "None"
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, headers = headers, timeout = 30).text
    except requests.RequestException as err:
        # A single failing URL should not discard everything scraped so far.
        print("failed to fetch", url, "-", err)
    else:
        soup = BeautifulSoup(response, 'html.parser')

        # Adjusting the classes to match the website's HTML structure
        article_section = soup.find('div', {'class': 'loc article-content'})
        title_section = soup.find('title')

        if article_section and title_section:
            # Extracting the text content of the article
            article_text = article_section.text.strip()
            title_text = title_section.text.strip()

    results_parents.append(article_text)
    title_parents.append(title_text)

# Set up DataFrame to store information
df_results_parents = pd.DataFrame({
    'url': df3['URL'],
    'title': title_parents,
    'data': results_parents
})

In [59]:
# Scrape the page title and the article body for every romper URL in df4.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like User-Agent header sent with every request.
headers = {"User-Agent":'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

results_romper = []
title_romper = []
for url in df4['URL']:
    # Placeholders recorded when the page cannot be fetched or the expected
    # elements are missing; one entry per URL keeps the lists aligned with
    # df4['URL'].
    article_text = "None"
    title_text = "None"
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, headers = headers, timeout = 30).text
    except requests.RequestException as err:
        # A single failing URL should not discard everything scraped so far.
        print("failed to fetch", url, "-", err)
    else:
        soup = BeautifulSoup(response, 'html.parser')

        # Adjusting the classes to match the website's HTML structure
        article_section = soup.find('div', {'class': 'AOL Afg'})
        title_section = soup.find('title')

        if article_section and title_section:
            # Extracting the text content of the article
            article_text = article_section.text.strip()
            title_text = title_section.text.strip()

    results_romper.append(article_text)
    title_romper.append(title_text)

# Set up DataFrame to store information
df_results_romper = pd.DataFrame({
    'url': df4['URL'],
    'title': title_romper,
    'data': results_romper
})

In [None]:
# Stack the parents and romper results row-wise into a single frame and
# display it.
df_final2 = pd.concat((df_results_parents, df_results_romper), axis='index')
df_final2

In [None]:
# Report only the articles whose word-token count reaches 2750; the article
# text is read from the third column (position 2) of df_final2.
for article in df_final2.iloc[:, 2]:
    cleaned = reduce_long_blanks(article)
    tokens = nltk.word_tokenize(cleaned)
    num_tokens = len(tokens)
    if num_tokens < 2750:
        continue
    print("number of tokens", num_tokens)

In [75]:
# Summarize every article in df_final2 with a single request per article.
# No chunking is done here, so each article must be short enough to fit in
# one prompt.
import nltk
summary=[]
for i in range(len(df_final2)):
    # get article title and content
    article_title = df_final2.iloc[i,1]
    article_content = df_final2.iloc[i,2]

    # reduce blanks
    reduced_content = reduce_long_blanks(article_content)
    # combine title and content (the article is sent whole, so it is not
    # tokenized here)
    scraped_data = "title is " + article_title +", and content is " + reduced_content
    prompt = "Summarize this for me:" + scraped_data

    # get summary from GPT
    response = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        max_tokens=400,
        temperature=0.3,
        n=1,
        stop=None,
    )
    summary.append(response.choices[0].text.strip())

# create a dataframe for summaries
df_summary2 = pd.DataFrame(summary, columns=["Summary"])
# drop off index of original dataframe so both frames line up row by row
df_final2 = df_final2.reset_index(drop=True)
# combine dataframes
df_gpt_summary2 = pd.concat([df_final2, df_summary2], ignore_index=True, axis=1)
df_gpt_summary2.columns = ["Url", "Title", "Data", "Summary"]
# the raw article text is not exported
df_gpt_summary2 = df_gpt_summary2.drop(columns = "Data")
df_gpt_summary2.to_excel('summary2.xlsx')