---
## News Scraping by parallelization
    In this scrip I built a function to scrape news using the library "newspaper"  https://github.com/codelucas/newspaper/ and I apply it over the urls downloaded in Media-Cloud. 
    
---

In [7]:
import time
import pandas as pd
from newspaper import Article
import requests
from concurrent.futures import ThreadPoolExecutor ## parallelization

In [26]:
def fetch_articles_new(ids, urls):
    article_data = {
        'ID': [],
        'URL': [],
        'Body': [],
        'MetaData': [],
        'Authors': [],
        'Date': [],
        'Title': [],
        'Tags': [],
        'MetaKeywords': [],
        'Summary': [],
        'Error': [],  # New column for error messages
    }

    errors = []  # List to store error messages
    
    start_time = time.time()  # Record the start time
    
    for id_, url in zip(ids, urls):
        try:
            article = Article(url)
            article.download()
            article.parse()
            
            # Append the parsed article to the dictionary
            article_data['ID'].append(id_)
            article_data['URL'].append(url)
            article_data['Body'].append(article.text)
            article_data['MetaData'].append(article.meta_description)
            article_data['Authors'].append(article.authors)
            article_data['Date'].append(article.publish_date)
            article_data['Title'].append(article.title)
            article_data['Tags'].append(article.tags)
            article_data['MetaKeywords'].append(article.meta_keywords)
            
            article.nlp()
            article_data['Summary'].append(article.summary)
            
            # No error, so add an empty string to the 'Error' column
            article_data['Error'].append('')
            
            # Introduce a sleep to avoid being blocked
            time.sleep(3)  # Sleep for 3 seconds
            
        
        except requests.exceptions.RequestException as e:
            # Handle URL-related exceptions
            article_data['Error'].append(str(e))
            errors.append(str(e))  # Append error to the list
            
            # Append 'ID' and 'URL'
            article_data['ID'].append(id_)
            article_data['URL'].append(url)
            
            # Append placeholders for other columns
            for column in ['Body', 'MetaData', 'Authors', 'Date', 'Title', 'Tags', 'MetaKeywords', 'Summary']:
                article_data[column].append('NA')
                
        except Exception as e:
            # Handle other exceptions (e.g., parsing errors)
            article_data['Error'].append(str(e))
            errors.append(str(e))  # Append error to the list
            
            # Append 'ID' and 'URL'
            article_data['ID'].append(id_)
            article_data['URL'].append(url)
            
            # Append placeholders for other columns
            for column in ['Body', 'MetaData', 'Authors', 'Date', 'Title', 'Tags', 'MetaKeywords', 'Summary']:
                article_data[column].append('NA')
    
    end_time = time.time()  # Record the end time
    
    elapsed_time = end_time - start_time  # Calculate the elapsed time
    
    print(f"Time taken to fetch and parse articles: {elapsed_time:.2f} seconds")
    
    return pd.DataFrame(article_data), errors

In [144]:
pkl_url = pd.read_pickle("/Users/trinidadbosch/Desktop/SEDS/Tesis/Data/MA-Thesis/Media Cloud/Data/media_urls.pkl")

In [145]:
pkl_url.columns

Index(['ap_syndicated', 'collect_date', 'feeds', 'guid', 'language',
       'media_id', 'media_name', 'media_url', 'metadata',
       'processed_stories_id', 'publish_date', 'stories_id', 'story_tags',
       'title', 'url', 'word_count'],
      dtype='object')

In [146]:
pkl_url.publish_date

0         2021-02-01 03:11:18
1         2021-02-01 13:28:31
2         2021-02-01 13:17:36
3         2021-02-01 02:05:19
4         2021-02-01 14:07:26
                 ...         
192989    2023-11-01 17:21:46
192990    2023-10-30 19:46:44
192991    2023-11-01 18:00:00
192992    2023-10-31 18:00:00
192993    2023-11-01 09:00:00
Name: publish_date, Length: 192994, dtype: object

---
### Apply scraping function in paralell
    This process took ~20hrs
---

In [31]:
# Specify the number of chunks
num_chunks = 100

# Calculate the size of each chunk
chunk_size = len(pkl_url) // num_chunks

# Split the DataFrame into chunks
df_chunks = [pkl_url.iloc[i:i + chunk_size] for i in range(0, len(pkl_url), chunk_size)]

# Function to apply in parallel
def process_chunk(chunk):
    chunk_ids = chunk['stories_id'].tolist()
    chunk_urls = chunk['url'].tolist()
    
    return fetch_articles_new(chunk_ids, chunk_urls)

# Apply the function in parallel using ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    results = list(executor.map(process_chunk, df_chunks))

Time taken to fetch and parse articles: 3784.77 seconds
Time taken to fetch and parse articles: 4202.48 seconds
Time taken to fetch and parse articles: 5815.62 seconds
Time taken to fetch and parse articles: 5924.41 seconds
Time taken to fetch and parse articles: 6227.64 seconds
Time taken to fetch and parse articles: 6367.67 seconds
Time taken to fetch and parse articles: 6432.34 seconds
Time taken to fetch and parse articles: 6619.20 seconds
Time taken to fetch and parse articles: 5454.46 seconds
Time taken to fetch and parse articles: 3411.00 seconds
Time taken to fetch and parse articles: 3689.31 seconds
Time taken to fetch and parse articles: 7391.17 seconds
Time taken to fetch and parse articles: 6684.15 seconds
Time taken to fetch and parse articles: 6778.27 seconds
Time taken to fetch and parse articles: 6661.36 seconds
Time taken to fetch and parse articles: 6498.51 seconds
Time taken to fetch and parse articles: 4893.06 seconds
Time taken to fetch and parse articles: 4665.54 

Building prefix dict from /opt/anaconda3/lib/python3.8/site-packages/jieba/dict.txt ...
Dumping model to file cache /var/folders/vk/yr_rw4912x38st74np992k1r0000gn/T/jieba.cache
Loading model cost 3.11118221282959 seconds.
Prefix dict has been built succesfully.


Time taken to fetch and parse articles: 4385.26 seconds
Time taken to fetch and parse articles: 3492.28 seconds
Time taken to fetch and parse articles: 4274.16 seconds
Time taken to fetch and parse articles: 5542.20 seconds
Time taken to fetch and parse articles: 3882.53 seconds
Time taken to fetch and parse articles: 6708.33 seconds
Time taken to fetch and parse articles: 6249.68 seconds
Time taken to fetch and parse articles: 6659.23 seconds
Time taken to fetch and parse articles: 6318.34 seconds
Time taken to fetch and parse articles: 6664.81 seconds
Time taken to fetch and parse articles: 3367.04 seconds
Time taken to fetch and parse articles: 3321.14 seconds
Time taken to fetch and parse articles: 5339.08 seconds
Time taken to fetch and parse articles: 6585.64 seconds
Time taken to fetch and parse articles: 3748.51 seconds
Time taken to fetch and parse articles: 3830.97 seconds
Time taken to fetch and parse articles: 3487.06 seconds
Time taken to fetch and parse articles: 4997.63 

In [32]:
# Extract DataFrames from the tuple results
result_dataframes = [result[0] for result in results]

# Concatenate the DataFrames into a single DataFrame
final_result = pd.concat(result_dataframes, ignore_index=True)

# Display the final DataFrame
(final_result)

Unnamed: 0,ID,URL,Body,MetaData,Authors,Date,Title,Tags,MetaKeywords,Summary,Error
0,1838925040,https://news.mongabay.com/2021/02/eye-in-the-s...,An interdisciplinary team of zoologists and co...,Environmental science and conservation news,[Terna Gyuse],2021-02-01 08:11:18+00:00,Eye in the Sky: Tech makes satellite imagery i...,"{Wildlife, Animals, Jim Tan, Endangered Specie...",[],Satellite surveying still offers many advantag...,
1,1839017422,https://www.cnbc.com/2021/02/01/amazon-alphabe...,"Ali Ghodsi, co-founder and CEO of Databricks I...","Amazon is getting involved in the start-up, Da...",[Jordan Novet],2021-02-01 00:00:00,"Amazon, Alphabet and Salesforce are all invest...",{},"[Start-up, Venture capital, Microsoft Corp, Sa...","Ali Ghodsi, co-founder and CEO of Databricks I...",
2,1839009079,http://feeds.benzinga.com/~r/benzinga/~3/JWduJ...,One of the hottest names in the investing worl...,One of the hottest names in the investing worl...,[Chris Katje],,15 Big Ideas In 'Disruptive Innovation' Accord...,"{Government, Digital Wallets, Healthcare, Poli...",[],One of the hottest names in the investing worl...,
3,1838767296,https://eurweb.com/2021/01/31/black-creators-a...,*It’s day four at the 2021 Sundance Film Festi...,,[Olivia T.],2021-01-31 00:00:00,Black Creators At Sundance 2021,"{white wedding, sundance, Sophia Nahli Allison...",[],*It’s day four at the 2021 Sundance Film Festi...,
4,1839090241,http://feeds.benzinga.com/~r/benzinga/~3/y-hmu...,Loading... Loading...\n\nBill.com BILL shares ...,Bill.com (NYSE: BILL) shares are trading highe...,[Tanzeel Akhtar],,Why Bill.com's Stock Is Trading Higher Today -...,"{Government, Healthcare, Politics, Regulations...",[],Loading... Loading...Bill.com BILL shares are ...,
...,...,...,...,...,...,...,...,...,...,...,...
192989,2816117787,https://arstechnica.com/?p=1980413,"On Wednesday, the UK hosted an AI Safety Summi...","""Bletchley Declaration"" sums up first day of U...",[Benj Edwards],2023-11-01 21:21:46+00:00,“Catastrophic” AI harms among warnings in decl...,{},[],"The event included the signing of ""The Bletchl...",
192990,2816156060,https://www.cnbc.com/2023/10/31/stock-markets-...,Traders work on the floor of the New York Stoc...,After a rough week amid accelerating inflation...,[Yeo Boon Ping],2023-10-31 00:00:00,CNBC Daily Open: Markets’ bounce may be short-...,{},"[Technology, Autos, Morgan Stanley, Meta Platf...","This report is from today's CNBC Daily Open, o...",
192991,2818282751,https://doakio.com/blog/missteps-in-blockchain...,Introduction\n\nIn the world of blockchain tec...,,"[Rachele Augusto, Var Molongui_Authorship_Fron...",2023-11-01 22:00:00+00:00,Missteps in Blockchain Documentation: A Review,{Uncategorized},[],The Importance of Accurate Blockchain Document...,
192992,2818282763,https://doakio.com/blog/why-jargon-isnt-always...,Introduction\n\nWelcome to the world of techni...,,"[Rachele Augusto, Var Molongui_Authorship_Fron...",2023-10-31 22:00:00+00:00,Why Jargon Isn’t Always the Enemy: Technical v...,{Uncategorized},[],The Pitfalls of Over-Simplifying Technical Ter...,


In [33]:
# Save to CSV
final_result.to_csv('/Users/trinidadbosch/Desktop/SEDS/Tesis/Data/MA-Thesis/Media Cloud/Data/scraped_news.csv', index=False)

# Save to pickle
final_result.to_pickle('/Users/trinidadbosch/Desktop/SEDS/Tesis/Data/MA-Thesis/Media Cloud/Data/scraped_news.pkl')

In [43]:
# Identify non-empty cells in the specified column
error_rows = final_result[final_result['Body'] == 'NA']

In [161]:
error_rows

NameError: name 'error_rows' is not defined

---
## Scraping with BS
    Effort to get those urls that threw error with the previous function
---

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
urls = ['https://www.seattletimes.com/business/stellantis-foxconn-team-up-to-make-cars-more-connected/?utm_source=RSS&utm_medium=Referral&utm_campaign=RSS_all',
 'https://www.thestreet.com/press-releases/u-s-air-force-awards-booz-allen-950m-idiq-contract-15412250',
 'https://abcnews.go.com/International/wireStory/british-political-candidate-artificial-intelligence-draw-election-manifesto-101487427']
ids = ['1934724599', '1697865428','2717045621']

In [17]:
ids_

[1934724599,
 1697865428,
 2717045621,
 1834599827,
 2028472808,
 2596037950,
 1764666806,
 1503351474,
 802145235,
 956377683]

In [6]:
def bs_scrape(urls, ids_):
    article_data = {
        'ID': [],
        'URL': [],
        'Body': [],
        'Title': [],
        'Error': [],
    }

    errors = []

    start_time = time.time()

    for url, id_ in zip(urls, ids_):  # Use zip to iterate over both lists simultaneously
        try:
            # Adding a delay between requests to avoid being blocked
            time.sleep(5)
            
            response = requests.get(url)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            
            soup = BeautifulSoup(response.text, 'html.parser')
            title = (soup.find('title')).get_text()
            body = soup.find_all('p')

            article_data['Title'].append(title)

            body_text = ""
            for text in body:
                body_text += text.get_text() + "\n"

            article_data['Body'].append(body_text)
            article_data['ID'].append(id_)
            article_data['URL'].append(url)
            article_data['Error'].append(None)  # No error for this entry
            
        except requests.exceptions.RequestException as e:
            errors.append(f"Error processing URL {url}: {str(e)}")
            article_data['Title'].append(None)
            article_data['Body'].append(None)
            article_data['ID'].append(id_)
            article_data['URL'].append(url)
            article_data['Error'].append(str(e))

    end_time = time.time()
    elapsed_time = end_time - start_time

    print(f"Time taken to fetch and parse articles: {elapsed_time:.2f} seconds")

    if errors:
        print("Errors:")
        for error in errors:
            print(error)

    return pd.DataFrame(article_data)

In [11]:
import concurrent.futures
# Example usage:
urls = [
    'https://www.seattletimes.com/business/stellantis-foxconn-team-up-to-make-cars-more-connected/?utm_source=RSS&utm_medium=Referral&utm_campaign=RSS_all',
    'https://www.thestreet.com/press-releases/u-s-air-force-awards-booz-allen-950m-idiq-contract-15412250',
    'https://abcnews.go.com/International/wireStory/british-political-candidate-artificial-intelligence-draw-election-manifesto-101487427'
]
ids = ['1934724599', '1697865428', '2717045621']


Time taken to fetch and parse articles: 50.04 seconds
Errors:Time taken to fetch and parse articles: 50.05 seconds
Errors:
Error processing URL h: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Error processing URL t: Invalid URL 't': No schema supplied. Perhaps you meant http://t?
Error processing URL t: Invalid URL 't': No schema supplied. Perhaps you meant http://t?Time taken to fetch and parse articles: 50.05 seconds

Errors:
Error processing URL h: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Error processing URL t: Invalid URL 't': No schema supplied. Perhaps you meant http://t?
Error processing URL t: Invalid URL 't': No schema supplied. Perhaps you meant http://t?
Error processing URL p: Invalid URL 'p': No schema supplied. Perhaps you meant http://p?
Error processing URL s: Invalid URL 's': No schema supplied. Perhaps you meant http://s?
Error processing URL h: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Error processi

In [12]:
result_df

Unnamed: 0,0,1,2,3,4
0,ID,URL,Body,Title,Error
1,ID,URL,Body,Title,Error
2,ID,URL,Body,Title,Error


In [14]:
news = pd.read_csv('/Users/trinidadbosch/Desktop/SEDS/Tesis/Data/MA-Thesis/Media Cloud/Data/scraped_news.csv')
urls_raw = pd.read_pickle('/Users/trinidadbosch/Desktop/SEDS/Tesis/Data/MA-Thesis/Media Cloud/Data/media_urls.pkl')

In [15]:
nan_body_rows = news[news['Body'].isna()]
rows_work = nan_body_rows.drop_duplicates(subset='URL')

# Sample 100 rows
sampled_rows = rows_work.sample(n=10, random_state=42)  # Set a random_state for reproducibility

# Extract 'Body' and 'ID' columns from the sampled DataFrame
sampled_urls = sampled_rows['URL'].to_list()
sampled_ids = sampled_rows['ID'].to_list()

In [4]:
result_df = bs_scrape(urls, ids)

KeyboardInterrupt: 

In [17]:
# Using ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(bs_scrape, sampled_urls, sampled_ids))

# Convert the list of dictionaries to a DataFrame
result_df = pd.DataFrame.from_records(results)

print(result_df)

TypeError: 'int' object is not iterable