## Save articles as txt

This jupyter notebook scrapes the text of news articles off the web (URLs are contained in a csv file) and saves them as txt files.

### Setup

In [33]:
import pandas as pd
import numpy as no
import os
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

In [34]:
os.getcwd()

'/Users/albertomicheletti/Documents/Hackaton'

In [35]:
os.chdir("/Users/albertomicheletti/Documents/Hackaton")

In [36]:
articles_txt_path = os.getcwd() + "/articles_txt"

### Import dataset

In [37]:
articles = pd.read_csv(os.getcwd() + "/Heatwaves-V1_pandas.csv")

In [38]:
# # NOT THE RIGHT SOLUTION
# # Add a column with title.txt
# articles['Title txt'] = 'Article ' + articles['Title'].astype(str) + '.txt' 

In [39]:
# create an article ID column
articles['Article_ID'] = articles.index

# Change dtype to object  
articles['Article_ID'] = articles['Article_ID'].astype('object')   

# Convert values to strings
articles['Article_ID'] = articles['Article_ID'].apply(str) 

# Concatenate with string
articles['Article_ID'] = 'ART-' + articles['Article_ID']

In [40]:
#articles

In [41]:
# Only UK articles
articles_UK = articles.loc[articles["Country"] == "United Kingdom",:].reset_index().copy()

In [42]:
# More than 2 mil visitors
articles_UK_2mil = articles_UK.loc[articles_UK["Total Monthly Visitors"] > 2000000, :].reset_index().copy()

In [43]:
articles_UK_2mil.shape

(3585, 14)

### Add political leaning

In [44]:
articles_UK_2mil["Domain"].unique()

array(['telegraph.co.uk', 'mirror.co.uk', 'bbc.co.uk', 'yahoo.com',
       'player.fm', 'timeout.com', 'itv.com', 'theguardian.com',
       'aol.co.uk', 'bbc.com', 'dailymail.co.uk', 'express.co.uk',
       'liverpoolecho.co.uk', 'investing.com', 'huffingtonpost.co.uk',
       'metro.co.uk', 'dailystar.co.uk', 'sky.com',
       'manchestereveningnews.co.uk', 'lse.ac.uk', 'standard.co.uk',
       'independent.co.uk', 'thetimes.co.uk', 'ladbible.com',
       'euronews.com', 'tportal.hr', 'nature.com', 'phys.org',
       'youtube.com', 'wordpress.com', 'hurriyet.com.tr', 'cnbc.com',
       'pistonheads.com', 'mumsnet.com', 'nottingham.ac.uk',
       'theconversation.com', 'thesun.co.uk', 'thestudentroom.co.uk',
       'channel4.com', 'bmj.com', 'www.gov.uk', 'theweathernetwork.com',
       'worldcat.org', 'indy100.com', 'iflscience.com', 'marca.com',
       'cnn.com', 'zerohedge.com', 'autosport.com', 'radiotimes.com',
       'wpengine.com', 'halifax.co.uk', 'boredpanda.com'], dtype=objec

In [45]:
# fed this into Claude
# The prompt was "create a pandas dataframe with the news sources below as one column and their political leaning as the other column"
# then I asked: Can you write it as a dictionary

In [46]:
news_dict = {
  'telegraph.co.uk': 'Right',
  'mirror.co.uk': 'Left',
  'bbc.co.uk': 'Centre',
  'yahoo.com': 'No leaning',
  'player.fm': 'No leaning',
  'timeout.com': 'No leaning',
  'itv.com': 'Centre',
  'theguardian.com': 'Left/Centre-left',
  'aol.co.uk': 'No leaning',
  'bbc.com': 'Centre',
  'dailymail.co.uk': 'Right',
  'express.co.uk': 'Right',
  'liverpoolecho.co.uk': 'Centre',
  'investing.com': 'No leaning',
  'huffingtonpost.co.uk': 'Left/Centre-left',
  'metro.co.uk': 'Centre-left',
  'dailystar.co.uk': 'Right',
  'sky.com': 'Centre',
  'manchestereveningnews.co.uk': 'Centre',
  'lse.ac.uk': 'No leaning',
  'standard.co.uk': 'Centre-right',
  'independent.co.uk': 'Centre-left',
  'thetimes.co.uk': 'Centre-right',
  'ladbible.com': 'No leaning',
  'euronews.com': 'No leaning',
  'tportal.hr': 'No leaning',
  'nature.com': 'No leaning',
  'phys.org': 'No leaning',
  'youtube.com': 'No leaning',
  'wordpress.com': 'No leaning',
  'hurriyet.com.tr': 'No leaning',
  'cnbc.com': 'Centre',
  'pistonheads.com': 'No leaning',
  'mumsnet.com': 'No leaning',
  'nottingham.ac.uk': 'No leaning',
  'theconversation.com': 'Centre-left',
  'thesun.co.uk': 'Right',
  'thestudentroom.co.uk': 'No leaning',
  'channel4.com': 'No leaning',
  'bmj.com': 'No leaning',
  'www.gov.uk': 'No leaning',
  'theweathernetwork.com': 'No leaning',
  'worldcat.org': 'No leaning',
  'indy100.com': 'No leaning',
  'iflscience.com': 'No leaning',
  'marca.com': 'No leaning',
  'cnn.com': 'Centre',
  'zerohedge.com': 'No leaning',
  'autosport.com': 'No leaning',
  'radiotimes.com': 'No leaning',
  'wpengine.com': 'No leaning',
  'halifax.co.uk': 'No leaning',
  'boredpanda.com': 'No leaning'
}

In [47]:
def get_leaning(domain):
  return news_dict.get(domain, 'Unknown')

In [48]:
articles_UK_2mil['Leaning'] = articles_UK_2mil['Domain'].apply(get_leaning)

In [49]:
articles_UK_2mil['Leaning'].unique()

array(['Right', 'Left', 'Centre', 'No leaning', 'Left/Centre-left',
       'Centre-left', 'Centre-right'], dtype=object)

In [50]:
def simplify_leaning(leaning):
  if leaning == 'Centre':
    return 'Centre'
  elif leaning in ['Right','Centre-right']: 
    return 'Right'
  elif leaning in ['Left', 'Left/Centre-left', "Centre-left"]:
    return 'Left'
  else:
    return leaning
  
articles_UK_2mil['Leaning_simple'] = articles_UK_2mil['Leaning'].apply(simplify_leaning)

In [51]:
# Four category leaning
articles_UK_2mil['Leaning_simple'].unique()

array(['Right', 'Left', 'Centre', 'No leaning'], dtype=object)

In [52]:
# Balanced sample VERSION 1
# Get unique leanings  
leanings = articles_UK_2mil['Leaning_simple'].unique()

# Sample 10 articles per leaning
balanced_sample = pd.DataFrame()
for l in leanings:
    temp = articles_UK_2mil[articles_UK_2mil['Leaning_simple'] == l].sample(n=10, random_state=1)
    balanced_sample = pd.concat([balanced_sample, temp])

# Shuffle the rows    
balanced_sample = balanced_sample.sample(frac=1)#.reset_index(drop=True)

# Take first 10 rows
balanced_sample = balanced_sample.iloc[:10]

# Create a list
balanced_list = balanced_sample.index.to_list()

In [53]:
# Balanced sample VERSION 2
# Get unique leanings  
leanings = articles_UK_2mil['Leaning_simple'].unique()

# Sample 10 articles per leaning
balanced_sample = pd.DataFrame()
for l in leanings:
    temp = articles_UK_2mil[articles_UK_2mil['Leaning_simple'] == l].sample(n=2, random_state=12)
    balanced_sample = pd.concat([balanced_sample, temp])

# # Shuffle the rows    
# balanced_sample = balanced_sample.sample(frac=1)#.reset_index(drop=True)

# # Take first 10 rows
# balanced_sample = balanced_sample.iloc[:10]

# Create a list
balanced_list = balanced_sample.index.to_list()

In [54]:
balanced_list

[971, 1556, 1700, 3020, 2348, 1151, 1596, 677]

In [55]:
len(balanced_list)

8

In [56]:
balanced_sample

Unnamed: 0,level_0,index,Query Id,Query Name,Date,Title,Url,Domain,Sentiment,Language,Country,Full Text,Total Monthly Visitors,Article_ID,Leaning,Leaning_simple
971,5154,14149,2001017906,Heatwaves,02:33.0,BBC climate editor accused of hypocrisy as he ...,http://www.telegraph.co.uk/news/2023/07/19/jus...,telegraph.co.uk,negative,en,United Kingdom,"By Patrick Sawer, Senior News Reporter 19 July...",44570000.0,ART-14149,Right,Right
1556,6838,20105,2001017906,Heatwaves,06:00.0,List of countries impacted by Europe heatwave ...,http://ct.moreover.com/?a=51326477524&p=2p3&v=...,dailystar.co.uk,neutral,en,United Kingdom,"Due to licensing restrictions, this mention ca...",11500000.0,ART-20105,Right,Right
1700,7194,21541,2001017906,Heatwaves,00:00.0,Met Office gives update on whether UK will hit...,http://ct.moreover.com/?a=51326058462&p=2p3&v=...,independent.co.uk,neutral,en,United Kingdom,"Due to licensing restrictions, this mention ca...",41168000.0,ART-21541,Centre-left,Left
3020,11751,36699,2001017906,Heatwaves,00:00.0,El Nino could intensify record-breaking heat a...,http://ct.moreover.com/?a=51295880008&p=2p3&v=...,independent.co.uk,neutral,en,United Kingdom,"Due to licensing restrictions, this mention ca...",41168000.0,ART-36699,Centre-left,Left
2348,9356,28695,2001017906,Heatwaves,00:00.0,"BBCNEEU, Jul 16, 2023 01:00 PM GMT - BBC News",http://ct.moreover.com/?a=51311871996&p=2p3&v=...,bbc.com,positive,en,United Kingdom,"Due to licensing restrictions, this mention ca...",205970000.0,ART-28695,Centre,Centre
1151,5734,16397,2001017906,Heatwaves,01:00.0,"BBC1SE, Jul 19, 2023 04:01 AM BST - BBC News",http://ct.moreover.com/?a=51332039902&p=2p3&v=...,bbc.co.uk,neutral,en,United Kingdom,"Due to licensing restrictions, this mention ca...",530000000.0,ART-16397,Centre,Centre
1596,6944,20603,2001017906,Heatwaves,27:16.0,Brits travelling to Europe given updated trave...,https://uk.sports.yahoo.com/news/brits-travell...,yahoo.com,negative,en,United Kingdom,The UK Foreign Office has updated its travel a...,77000000000.0,ART-20603,No leaning,No leaning
677,4022,9926,2001017906,Heatwaves,44:55.0,Watch as wildfires continue to rage in Greece ...,http://ct.moreover.com/?a=51352047384&p=2p3&v=...,yahoo.com,neutral,en,United Kingdom,"Due to licensing restrictions, this mention ca...",77000000000.0,ART-9926,No leaning,No leaning


In [57]:
# Extract the Url column  
urls = balanced_sample['Url']

# Write the URLs to a text file  
with open('urls.txt', 'w') as f:
    for url in urls:
        f.write(url + '\n')
        
print('URLs written to urls.txt')

URLs written to urls.txt


### Subarticles

In [58]:
subarticles = articles_UK_2mil.iloc[balanced_list]

### Scraping proper

In [59]:
# import requests
# from bs4 import BeautifulSoup
# from datetime import datetime

# def save_article_to_txt(url, filename):
#     try:
#         # Set the timeout to 60 seconds (1 minute)
#         response = requests.get(url, timeout=60)
#         response.raise_for_status()
#         soup = BeautifulSoup(response.text, 'html.parser')
#         article_text = ''
#         for paragraph in soup.find_all('p'):
#             article_text += paragraph.get_text() + '\n'
        
#         folder_name = os.getcwd() + "/articles_txt"
#         os.makedirs(folder_name, exist_ok=True)  # Create the folder if it doesn't exist
#         file_path = os.path.join(folder_name, filename + ".txt")
        
#         with open(file_path, 'w', encoding='utf-8') as file:
#             file.write(article_text)
#         return 1  # Success flag: 1 indicates successful retrieval and save
#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching the webpage: {e}")
#         return 0  # Success flag: 0 indicates unsuccessful retrieval

In [60]:
# # Scrape multiple articles
# subarticles['Success'] = subarticles.apply(lambda row: save_article_to_txt(row['Url'], row['Article_ID']), axis=1)

In [61]:
import requests
from bs4 import BeautifulSoup
import os

def save_article_to_txt(url, filename, title, domain):
    try:
        # Set the timeout to 60 seconds (1 minute)
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        article_text = ''
        for paragraph in soup.find_all('p'):
            article_text += paragraph.get_text() + '\n'
        
        # Combine title, domain, and article text
        article_content = f"Title: {title}\nDomain: {domain}\n\n{article_text}"

        folder_name = os.getcwd() + "/articles_txt"
        os.makedirs(folder_name, exist_ok=True)  # Create the folder if it doesn't exist
        file_path = os.path.join(folder_name, filename + ".txt")
        
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(article_content)
        return 1  # Success flag: 1 indicates successful retrieval and save
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return 0  # Success flag: 0 indicates unsuccessful retrieval

In [62]:
# Scrape multiple articles
subarticles['Success'] = subarticles.apply(lambda row: save_article_to_txt(row['Url'], row['Article_ID'], row['Title'], row['Domain']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subarticles['Success'] = subarticles.apply(lambda row: save_article_to_txt(row['Url'], row['Article_ID'], row['Title'], row['Domain']), axis=1)


In [63]:
subarticles["Success"]

971     1
1556    1
1700    1
3020    1
2348    1
1151    1
1596    1
677     1
Name: Success, dtype: int64

In [64]:
import os
from os import path

folder = articles_txt_path

# Get list of .txt files in folder
files = [f for f in os.listdir(folder) if f.endswith('.txt')] 

with open('combined.txt', 'w') as outfile:
    for fname in files:
        filepath = path.join(folder, fname)
        with open(filepath) as infile:
            for line in infile:
                outfile.write(line)
        outfile.write("\n") # blank line between files
        
print("Files concatenated successfully!")

Files concatenated successfully!


### Various

In [71]:
articles.loc[8012, "Url"]

'https://www.dailymail.co.uk/wires/reuters/article-12323727/WHO-warns-dengue-risk-global-warming-pushes-cases-near-historic-highs.html'