# 1. Scraping university student newspapers

## Read the data needed for scraping

In [1]:
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import pandas as pd
import re
import os
import glob
import time

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Scraping function

In [3]:
def scrape_page(base_url, element_selector, page_num):
    url = base_url.format(page_num)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    a_elements = soup.select(element_selector)
    hrefs = [a_element['href'] for a_element in a_elements if 'href' in a_element.attrs]
    print("done scraping page", page_num)
    return hrefs

# def scrape_article(main_url, link, title_selector, author_selector, date_selector, text_selector):
#     if link.startswith('http'):
#         link = link
#     elif link.startswith('/'):
#         link = main_url + link
#     else:
#         link = main_url + '/' + link
#     response = requests.get(link, headers=headers)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     date_element = soup.select_one(date_selector)
#     if date_element:
#         date = date_element.text
#     else:
#         date = 'N/A'
#     title_element = soup.select_one(title_selector)
#     if title_element:
#         title = title_element.text
#     else:
#         title = 'N/A'
#     author_element = soup.select_one(author_selector)
#     if author_element:
#         author = author_element.text
#     else:
#         author = 'N/A'
#     text_elements = soup.select(text_selector)
#     text = ' '.join([text_element.text.strip() for text_element in text_elements])
#     print(f"collected {link} articles")
#     return {'title':title,'author':author,'date': date, 'text': text, 'link': link}

def scrape_article(main_url, link, title_selector, author_selector, date_selector, text_selector):
    if link.startswith('http'):
        link = link
    elif link.startswith('/'):
        link = main_url + link
    else:
        link = main_url + '/' + link
    try:
        response = requests.get(link, headers=headers, timeout=5)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error requesting {link}: {e}")
        return {'title':'N/A','author':'N/A','date': 'N/A', 'text': 'N/A', 'link': link}
    soup = BeautifulSoup(response.text, 'html.parser')
    date_element = soup.select_one(date_selector)
    if date_element:
        date = date_element.text
    else:
        date = 'N/A'
    title_element = soup.select_one(title_selector)
    if title_element:
        title = title_element.text
    else:
        title = 'N/A'
    author_element = soup.select_one(author_selector)
    if author_element:
        author = author_element.text
    else:
        author = 'N/A'
    text_elements = soup.select(text_selector)
    text = ' '.join([text_element.text.strip() for text_element in text_elements])
    print(f"collected {link} articles")
    return {'title':title,'author':author,'date': date, 'text': text, 'link': link}



def scrape_article_wrapper(args):
    main_url, href, title_selector, author_selector, date_selector, text_selector = args
    return scrape_article(main_url, href, title_selector, author_selector, date_selector, text_selector)

def scrape_articles(end, main_url, page_url_template, element_selector, title_selector, author_selector, date_selector, text_selector):
    all_hrefs = []
    for page_num in range(1, end+1):
        hrefs = scrape_page(page_url_template, element_selector, page_num)
        all_hrefs.extend(hrefs)
        print(len(all_hrefs))
    print("total articles found:", len(all_hrefs))

    if 'www.' in main_url:
        file_title = main_url.split('www.')[1].split('.')[0]
    else:
        file_title = main_url.split('//')[1].split('.')[0]

    # hrefs = pd.DataFrame(all_hrefs)
    # hrefs.to_csv(f'/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/scraping 30mar/{file_title}_hrefs.csv')

    all_data = []
    progress_counter = 0
    # if 'www.' in main_url:
    #     file_title = main_url.split('www.')[1].split('.')[0]
    # else:
    #     file_title = main_url.split('//')[1].split('.')[0]
    for i, data in enumerate(pool.imap_unordered(scrape_article_wrapper, [(main_url, href, title_selector, author_selector, date_selector, text_selector) for href in all_hrefs])):
    # for i, data in enumerate(pool.imap_unordered(scrape_article_wrapper, [(main_url, href, title_selector, author_selector, date_selector, text_selector) for href in all_hrefs[22000:]]), start = 22000):
        all_data.append(data)
        progress_counter += 1
        if progress_counter % 1000 == 0:
            df = pd.DataFrame(all_data)
            df.to_csv(f'/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/scraping 30mar/processing/{file_title}_{i+1}.csv', index=False)
            all_data = []
            print(f'processed {progress_counter} out of {len(all_hrefs)} articles ({progress_counter/len(all_hrefs)*100:.2f}%)')
    if all_data:
        df = pd.DataFrame(all_data)
        df.to_csv(f'/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/scraping 30mar/processing/{file_title}_{i+2}.csv', index=False)
        print(f'processed {len(all_hrefs)} out of {len(all_hrefs)} articles (100.00%)')

In [4]:
folder_path = '/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/scraping 30mar/'
scraping = pd.read_csv(folder_path+'us scraping.csv')
scraping

FileNotFoundError: ignored

In [None]:
scraping['page_url_template'][10]= 'https://commonwealthtimes.org/page/{}/?s'
# scraping['article_url'][36]= "[itemprop='name'] a"

In [None]:
for i in [33,37]:
    base_url = scraping['page_url_template'][i]
    link_selector = scraping['article_url'][i]
    page_num = int(scraping['page'][i])

    main_url = scraping['website'][i]
    date_selector = scraping['date'][i]
    text_selector = scraping['text'][i]
    title_selector = scraping['title'][i]
    author_selector = scraping['author'][i]

    if __name__ == '__main__':
        with Pool(20) as pool:
            scrape_articles(page_num, main_url, base_url, link_selector, title_selector, author_selector, date_selector, text_selector)

    # Set the path of the Google Drive folder containing the CSV files
    folder_path = '/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/scraping 30mar/processing/'

    dfs = []
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)
            dfs.append(df)
    result_df = pd.concat(dfs, axis=0, ignore_index=True)

    # Print the resulting dataframe
    print(result_df)

    if 'www.' in main_url:
        file_title = main_url.split('www.')[1].split('.')[0]
    else:
        file_title = main_url.split('//')[1].split('.')[0]

    result_df.to_csv(f'/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/scraping 30mar/{file_title}.csv')

    os.chdir(folder_path)

    for file in glob.glob("*.csv"):
        os.remove(file)


# 2. New York Times articles
API

In [None]:
api_key = "vfcT8nOErUx9l6UU3Ak1ORwLAGBkkGJ9"
base_url = "https://api.nytimes.com/svc/"
endpoint = "search/v2/articlesearch.json"
query = "politics"
begin_date = "2020-01-01"
end_date = "2023-05-31"
num_pages = 30  # Number of pages to retrieve (each page returns up to 10 articles)
articles = []

for page in range(num_pages):
    url = f"{base_url}{endpoint}?api-key={api_key}&q={query}&begin_date={begin_date}&end_date={end_date}&page={page + 1}"

    response = requests.get(url,headers=headers)
    data = response.json()

    for article in data["response"]["docs"]:
        headline = article["headline"]["main"] if "main" in article["headline"] else ""
        pub_date = article["pub_date"] if "pub_date" in article else ""
        author = article["byline"]["original"] if "byline" in article else ""
        link = article["web_url"] if "web_url" in article else ""

        response = requests.get(article['web_url'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the HTML element that contains the article text
        article_body = soup.find('section', {'name': 'articleBody'})
        # Extract the text from the article body element
        if article_body == None:
          text = ''
        else:
          text = article_body.get_text(separator=' ')

        articles.append({
            "Headline": headline,
            "Published Date": pub_date,
            "Author": author,
            "Text": text
        })
        time.sleep(5)

politics = pd.DataFrame(articles)
print(politics)

                                              Headline  \
0    What a New Oral History Reveals About Obama, a...   
1    Chris Christie Gets a Super PAC Ahead of His L...   
2    A Small Town’s Tragedy, Distorted by Trump’s M...   
3    Packing Figures and Charts, Roy Leads G.O.P. R...   
4    House Passes Debt Limit Bill in Bipartisan Vot...   
..                                                 ...   
295  In Blow to DeSantis, Florida Bills to Limit Pr...   
296  Jim Marchant, a Nevada Election Denier, Announ...   
297          Ajay Banga Confirmed as World Bank Leader   
298  Rep. Colin Allred of Texas Will Challenge Ted ...   
299  How U.S. Efforts to Guide Sudan to Democracy E...   

               Published Date  \
0    2023-05-31T13:00:24+0000   
1    2023-05-30T07:00:19+0000   
2    2023-05-29T09:28:18+0000   
3    2023-05-31T09:00:27+0000   
4    2023-05-31T09:00:12+0000   
..                        ...   
295  2023-05-03T17:25:14+0000   
296  2023-05-03T14:37:13+0000   
297  2

In [None]:
business['topic'] = 'business'
ent['topic'] = 'entertainment'
politics['topic'] = 'politics'
sports['topic'] = 'sports'
tech['topic'] = 'technology'

# business, entertainment, politics, sport, tech
dataframes = [business, ent, politics, sports, tech]

# Concatenate the DataFrames
combined_df = pd.concat(dataframes, ignore_index=True)

# Print the combined DataFrame
print(combined_df)

                                              Headline  \
0    Franchisers, Facing Challenges to Business Mod...   
1    Fox News Settled Its Suit, but Similar 2020 El...   
2    Fox Will Pay $787.5 Million to Settle Defamati...   
3    Can a Global Talent Agency Make Atlanta an Art...   
4    How Small Businesses Can Find Safety Before th...   
..                                                 ...   
895  Bing Newcomb, Whose E*Trade Transformed Stock ...   
896  TikTok Is Fined $15.9 Million Over Misusing Ki...   
897   Want an A in His Class? You Had Better Go Viral.   
898  Jury Says Tesla Must Pay Worker $3.2 Million O...   
899  Twitter Users Are Still Waiting for a Check-Ma...   

               Published Date                                   Author  \
0    2023-04-20T17:30:51+0000                        By Lydia DePillis   
1    2023-04-19T18:02:37+0000                           By Lora Kelley   
2    2023-04-18T23:54:45+0000  By Jeremy W. Peters and Katie Robertson   
3    20

In [None]:
path = '/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/validation/'

past = pd.read_csv(path+'nyt.csv')

politics['topic'] = 'politics'
sports['topic'] = 'sports'

# business, entertainment, politics, sport, tech
dataframes = [politics, sports]

# Concatenate the DataFrames
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df['source'] = 'nyt'

# Print the combined DataFrame
result = pf.concat([past, combined_df], ignore_index=True)
result

In [None]:
result.to_csv(path+'nyt.csv',index=False)

In [None]:
combined_df['source'] = 'nyt'
combined_df.to_csv('/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/validation/nyt.csv')

# 3. The Guardian articles

In [None]:
api_key = 'c6b98077-28c5-486e-89a9-c691a4986cc4'

base_url = "https://content.guardianapis.com/"
endpoint = "search"
section = "sport"
from_date = "2023-01-01"
to_date = "2023-05-31"
num_pages = 30
articles = []

for page in range(num_pages):
    url = f"{base_url}{endpoint}?api-key={api_key}&from-date={from_date}&to-date={to_date}&page={page+1}&show-fields=headline,publication,body,byline&section={section}"

    response = requests.get(url, headers=headers)
    data = response.json()

    for article in data["response"]["results"]:
        title = article["webTitle"]
        date = article["webPublicationDate"]
        if 'byline' in article['fields']:
            author = article['fields']['byline']
        else:
            author = "Unknown"

        text = article["fields"]["body"]

        articles.append({
            "Headline": title,
            "Published Date": date,
            "Author": author,
            "Text": text
        })

sports = pd.DataFrame(articles)
print(sports)

                                              Headline        Published Date  \
0    Evans suffers ‘shocking’ French Open first rou...  2023-05-28T18:44:04Z   
1    Rublev in action, Evans out, Tsitsipas through...  2023-05-28T18:01:52Z   
2    Saracens’ sunkissed win over Sale tempered by ...  2023-05-28T17:00:37Z   
3    Mark Cavendish wins final stage of Giro d’Ital...  2023-05-28T16:56:46Z   
4                           Giotto Bizzarrini obituary  2023-05-28T16:33:59Z   
..                                                 ...                   ...   
295  The bravest thing Anthony Joshua can do is ret...  2023-05-02T08:00:02Z   
296  Nothing plagues LeBron James like Stephen Curr...  2023-05-02T07:30:02Z   
297  Nick Kyrgios’s Tesla allegedly stolen from mot...  2023-05-02T07:12:43Z   
298  Sleeping pills, thrills and a new king: the in...  2023-05-02T07:00:01Z   
299  Kudermetova will remove Russian sponsorship to...  2023-05-01T22:47:26Z   

                                Author 

In [None]:
business['topic'] = 'business'
ent['topic'] = 'entertainment'
politics['topic'] = 'politics'
sports['topic'] = 'sports'
tech['topic'] = 'technology'

# business, entertainment, politics, sport, tech
dataframes = [business, ent, politics, sports, tech]

# Concatenate the DataFrames
combined_df = pd.concat(dataframes, ignore_index=True)

# Print the combined DataFrame
print(combined_df)

                                               Headline        Published Date  \
0     UK ministers discuss voluntary price limits fo...  2023-05-28T15:44:19Z   
1     Terrible news for Sunak and Hunt puts election...  2023-05-28T09:12:17Z   
2     Even Farage says Brexit has failed. Why won’t ...  2023-05-28T06:00:24Z   
3     What is the US debt ceiling and what would hap...  2023-05-28T02:12:14Z   
4     Rail strikes: Hopes of a resolution have been ...  2023-05-27T23:05:15Z   
...                                                 ...                   ...   
1495  Near 50% fall in Silvergate’s shares over FTX ...  2023-03-02T17:10:12Z   
1496  ‘They’re more concerned about profit’: Osha, D...  2023-03-02T07:00:05Z   
1497  Last night AI DJ saved my life? Testing Spotif...  2023-03-02T06:00:06Z   
1498  NSO Group co-founder emerges as new majority o...  2023-03-01T19:54:46Z   
1499  House committee advances legislation to ban Ti...  2023-03-01T18:01:16Z   

                           

In [None]:
combined_df['source'] = 'guardian'
combined_df.to_csv('/content/drive/MyDrive/EY 2022-2023 Junior/Spring 2023/SOCSC-UH 2213 Textual Analysis/textual analysis final project/data/validation/guardian.csv')