In this program we scraped out some key data from foxnews.com and did data analysis on its news, sentimental analysis, frequency analysis and many more.

### importing dependencies

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
url = 'https://www.foxnews.com'

In [3]:
# url = "https://www.foxnews.com/"

# Parsing response using html.parser
response = requests.get(url).text
soup = BeautifulSoup(response, 'html.parser')
print("Checking if everything is working")
soup.title 

Checking if everything is working


<title>Fox News - Breaking News Updates | Latest News Headlines | Photos &amp; News Videos</title>

# Working on scraping post titles

the foxnews has the 

In [4]:
#scrape the title from post lists
news_titles = []
urls = []

if url == "https://www.foxnews.com":
    titles = soup.find_all('h3', class_= 'title')

    for title in titles:
        anchor_tag = title.find('a')
        if anchor_tag == None:
            news_titles.append(None)
            continue
        news_titles.append(anchor_tag.text.strip())

    #scrape the url
    for link in titles:
        url = link.find('a')
        if url == None:
            urls.append(None)
            continue
        urls.append(url['href'])
    for i in range(len(urls)):
        if urls[i]!=None:
            if not urls[i].startswith('https:'):
                urls[i] = 'https:' + urls[i]
    
else:
    raise Exception("This is an error raised for cause of harcoded url")


In [5]:
print("news_titles")
print(news_titles[:5],len(news_titles))
print("urls")
print(urls[:5],len(news_titles))

news_titles
["GOP lawmaker eyes backup plan for AG Garland's arrest if DOJ doesn't follow through", 'Historian with ace record calling elections says if Trump or Biden has path to victory', "NBA legend whose silhouette is famously represented on the league's logo dead at 86", "Researchers decode earliest known written record of Jesus Christ's childhood", 'Moment MLB fan tased after running onto field and doing backflip in front of officer'] 173
urls
['https://www.foxnews.com/politics/anna-paulina-luna-vows-force-vote-garlands-arrest-doj-doesnt-follow-through-contempt', 'https://www.foxnews.com/politics/who-has-keys-white-house-historian-ace-record-calling-elections-weighs-trump-verdict', 'https://www.foxnews.com/sports/nba-legend-jerry-west-dead', 'https://www.foxnews.com/world/german-researchers-decode-earliest-known-written-record-jesus-childhood', 'https://www.foxnews.com/sports/reds-fan-tased-arrested-after-running-onto-field-doing-backflip-front-officer'] 173


# scraping news corpus and category

In [6]:
#declaring metadata variables
art_text = []
news_cat = []
author_names = []
time = []



#scrape the news corpus
for url in urls:
  try:
    response = requests.get(url).text
    f_texts = BeautifulSoup(response, 'html.parser')
    main_text = f_texts.find('div', class_='article-body')
    if main_text == None:
      art_text.append(None)
      continue
    article_text = main_text.get_text()
    print(article_text.strip())
    art_text.append(article_text.strip())
  except requests.exceptions.RequestException as e:
    # Handle the exception (e.g. log an error message, skip the URL, etc.)
    print(f'Error while requesting {url}: {e}')
    art_text.append(None)
    continue
len(art_text)

#find the news category
from urllib.parse import urlparse
for url in urls:
  url_parts = urlparse(url)
  print(url)
  if url == None:
    news_cat.append(None)
    continue
  path_parts = url_parts.path.split('/')
  category = path_parts[1] if len(path_parts) > 1 else ' '
  news_cat.append(category)

  #scraping the authors
for url in urls:
  if url == None:
    author_names.append(None)
    continue
  response = requests.get(url)

  soup = BeautifulSoup(response.text, 'html.parser')

  # Find the element containing the author name
  author_element = soup.find('div', class_='author-byline')
  if author_element:
      # Extract the author name from the element
      author_name = author_element.text.strip()
      author_names.append(author_name)
  else:
      # If the author element is not found, set the author name to an empty string
      author_name = ''
      author_names.append(author_name)

# Scrape the time and date
for url in urls:
  if url == None:
    time.append(None)
    continue
  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')
  news_dates = soup.find('time')
  if news_dates == None:
    time.append(None)
    continue
  time.append(news_dates.get_text())
print(time)


close      Video Merrick Garland is trying to blur the lines between legit DOJ criticism and coercion: Chris Landau Former Scalia and Thomas Law Clerk Chris Landau says Attorney General Merrick Garland is trying to make it seem like the DOJ is immune to criticism on 'The Ingraham Angle.' EXCLUSIVE: Rep. Anna Paulina Luna, R-Fla., is vowing to force a vote on Attorney General Merrick Garland's arrest if the Department of Justice (DOJ) fails to act on a criminal contempt resolution backed by House GOP leaders.    Luna told Fox News Digital on Tuesday evening that she would plan to force a vote on her "inherent contempt" resolution against Garland after the full House of Representatives weighs the Biden official's fate with a separate measure."As of right now, we fully intend to bring it," Luna revealed. "I don't really have much faith in the Department of Justice. And I don't think the American people do either. But we are trying to bring back a level playing field and show that, you kno

# Converting into dataframe

In [7]:
import pandas as pd
news_data = pd.DataFrame({
    "Title": news_titles,
    "Link": urls,
    "Corpus": art_text,
    "category": news_cat,
    "author": author_names,
    "timeanddate": time,
})
news_data.dropna(how='any', inplace=True)

# Cleaning corpus

In [8]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
# Setting up stopwords with additional words
stop_words = set(stopwords.words('english'))
stop_words.update(['close', 'fox','video','news', 'fox news', 'flash', 
                   'top','june','check','clicking', 'headlines', 
                   'also', 'march', 'get', 'one', 'said', 'new', 'two',
                     'would', "n't", 'first', 'images', 'according', 'years', 'like', 'told',
                     'notice', 'senior', 'huddle', 'accessibility', 'x', 'follow', 'includes', 'use',
                       'address', 'trouble', 'account', 'us', 'policy', 'free', 'incentive', 'content',
                         'click', 'continue', 'privacy', 'story', 'time', 'editor', 'charge', 'settings',
                           'special', 'financial', 'agreeing', 'please', 'getty', 'valid', 'select', 'let', 'gaydos',
                             'go', 'entering', 'terms', 'ryan', 'join', 'plus', 'email', 'fox', 'premium', 'subscribe',
                               'enter', 'news', 'articles',
                       'sports', 'know', 'pushing', 'newsletter', 'digital', 'via', 'coverage', 'access', 'app'])

nltk.download('punkt')
# Function to remove stop words and punctuation
def remove_stopwords_and_punctuation(text):
    if text is not None:  # Check for None type (better practice)
        tokens = word_tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word not in stop_words and word.isalpha()]  # Keep only alphanumeric tokens
        return ' '.join(filtered_tokens)

# Apply function to 'corpus' column
news_data['Corpus'] = news_data['Corpus'].apply(remove_stopwords_and_punctuation)

news_data["Corpus"][:5]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\achar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\achar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0    merrick garland trying blur lines legit doj cr...
1    historian called last presidential elections r...
2    reached maximum number log create reading jerr...
3    reached maximum number log create reading rese...
4    reds fan runs onto field backflip gets tased c...
Name: Corpus, dtype: object

# Cleaning author and timedate

In [9]:
import re

def clean_author(author):
    author = str(author).strip()
    author = re.sub(r'^By\s+', '', author, flags=re.IGNORECASE)  
    patterns_to_remove = [
        r'Fox news', r'FOXBusiness', r'Fox Weather', r'FOXWeather', r'FOX Weather',
        r'Source ReportFoxNews', r'Report Fox News', r'Sponsored by', r'Source FOXWeather', r'FOX Sports',
        r'Credible.*'  
    ]
    for pattern in patterns_to_remove:
        author = re.sub(pattern, '', author, flags=re.IGNORECASE)
    author = re.sub(r'CyberGuy Report|Source', '', author, flags=re.IGNORECASE) 
    return author.strip()


news_data['author'] = news_data['author'].apply(clean_author)


def clean_timeanddate(date_str):
    date_str = str(date_str).strip() 
    date_str = re.sub(r'\n', '', date_str)  
    date_str = re.sub(r'EST', 'EDT', date_str)  
    return date_str.strip()

news_data['timeanddate'] = news_data['timeanddate'].apply(clean_timeanddate)

# Check for any null values after cleaning and remove those rows
news_data.dropna(subset=['author', 'timeanddate'], inplace=True)



In [10]:
news_data

Unnamed: 0,Title,Link,Corpus,category,author,timeanddate
0,GOP lawmaker eyes backup plan for AG Garland's...,https://www.foxnews.com/politics/anna-paulina-...,merrick garland trying blur lines legit doj cr...,politics,Elizabeth Elkind,"June 12, 2024 10:10am EDT"
1,Historian with ace record calling elections sa...,https://www.foxnews.com/politics/who-has-keys-...,historian called last presidential elections r...,politics,Chris Pandolfo,"June 12, 2024 4:00am EDT"
2,NBA legend whose silhouette is famously repres...,https://www.foxnews.com/sports/nba-legend-jerr...,reached maximum number log create reading jerr...,sports,Ryan Gaydos,"June 12, 2024 9:52am EDT"
3,Researchers decode earliest known written reco...,https://www.foxnews.com/world/german-researche...,reached maximum number log create reading rese...,world,Anders Hagstrom,"June 12, 2024 7:27am EDT"
4,Moment MLB fan tased after running onto field ...,https://www.foxnews.com/sports/reds-fan-tased-...,reds fan runs onto field backflip gets tased c...,sports,Ryan Gaydos,"June 12, 2024 9:45am EDT"
...,...,...,...,...,...,...
164,Severe weather set to strike millions across M...,https://www.foxweather.com/weather-news/severe...,severe weather set strike millions across midw...,weather-news,Chris Oberholtz,"June 12, 2024 9:50am EDT"
169,EV charging cable thefts are on the rise in ye...,https://www.foxnews.com/us/ev-charging-cable-t...,biden admin force electric vehicle goals rober...,us,Associated Press,"June 12, 2024 10:21am EDT"
170,Port of Baltimore fully reopened after $100M c...,https://www.foxnews.com/us/port-baltimore-full...,ntsb drone footage shows aftermath francis sco...,us,Associated Press,"June 12, 2024 8:07am EDT"
171,41 confirmed dead after fire breaks out in Kuw...,https://www.foxnews.com/world/41-confirmed-dea...,shows suspected arsonist sets fire neighbor su...,world,Reuters,"June 12, 2024 8:03am EDT"


In [11]:
#sample code for reading writing
# news_data.to_csv('/work/fox_news.csv', index = 'False')
# news_data = pd.read_csv('fox_news.csv')

import pandas as pd
import os


# Load the existing CSV file if it exists
file_path = 'work\\fox_news.csv'
if os.path.exists(file_path):
    news_df = pd.read_csv(file_path)
else:
    
    news_data.to_csv(file_path, index = 'False')
    news_df = news_data




# Iterate over the new data and check for duplicates
for idx, new_data in news_data.iterrows():
    if new_data['Title'] not in news_df['Title'].values:
        # Append the new data
        news_df = pd.concat([news_df, pd.DataFrame([new_data])], ignore_index=True)
        print(f"New entry '{new_data['Title']}' added.")
    else:
        print(f"Title '{new_data['Title']}' already exists, skipping entry.")

# # Save the updated DataFrame back to the CSV file
# news_df.to_csv(file_path, index=False)

Title 'GOP lawmaker eyes backup plan for AG Garland's arrest if DOJ doesn't follow through' already exists, skipping entry.
Title 'Historian with ace record calling elections says if Trump or Biden has path to victory' already exists, skipping entry.
Title 'NBA legend whose silhouette is famously represented on the league's logo dead at 86' already exists, skipping entry.
Title 'Researchers decode earliest known written record of Jesus Christ's childhood' already exists, skipping entry.
Title 'Moment MLB fan tased after running onto field and doing backflip in front of officer' already exists, skipping entry.
Title 'Jet missing since 1971 finally found as investigators complete decades-old search' already exists, skipping entry.
Title 'Eight suspected terrorists with ISIS ties arrested in three major cities' already exists, skipping entry.
Title 'In-N-Out makes unavoidable decision after California boosts minimum wage' already exists, skipping entry.
Title '‘Worst mayor in America’ acc

In [15]:
# Save the updated DataFrame back to the CSV file
news_df.to_csv(file_path, index=False)

In [12]:
news_df

Unnamed: 0,Title,Link,Corpus,category,author,timeanddate
0,GOP lawmaker eyes backup plan for AG Garland's...,https://www.foxnews.com/politics/anna-paulina-...,merrick garland trying blur lines legit doj cr...,politics,Elizabeth Elkind,"June 12, 2024 10:10am EDT"
1,Historian with ace record calling elections sa...,https://www.foxnews.com/politics/who-has-keys-...,historian called last presidential elections r...,politics,Chris Pandolfo,"June 12, 2024 4:00am EDT"
2,NBA legend whose silhouette is famously repres...,https://www.foxnews.com/sports/nba-legend-jerr...,reached maximum number log create reading jerr...,sports,Ryan Gaydos,"June 12, 2024 9:52am EDT"
3,Researchers decode earliest known written reco...,https://www.foxnews.com/world/german-researche...,reached maximum number log create reading rese...,world,Anders Hagstrom,"June 12, 2024 7:27am EDT"
4,Moment MLB fan tased after running onto field ...,https://www.foxnews.com/sports/reds-fan-tased-...,reds fan runs onto field backflip gets tased c...,sports,Ryan Gaydos,"June 12, 2024 9:45am EDT"
...,...,...,...,...,...,...
164,Severe weather set to strike millions across M...,https://www.foxweather.com/weather-news/severe...,severe weather set strike millions across midw...,weather-news,Chris Oberholtz,"June 12, 2024 9:50am EDT"
169,EV charging cable thefts are on the rise in ye...,https://www.foxnews.com/us/ev-charging-cable-t...,biden admin force electric vehicle goals rober...,us,Associated Press,"June 12, 2024 10:21am EDT"
170,Port of Baltimore fully reopened after $100M c...,https://www.foxnews.com/us/port-baltimore-full...,ntsb drone footage shows aftermath francis sco...,us,Associated Press,"June 12, 2024 8:07am EDT"
171,41 confirmed dead after fire breaks out in Kuw...,https://www.foxnews.com/world/41-confirmed-dea...,shows suspected arsonist sets fire neighbor su...,world,Reuters,"June 12, 2024 8:03am EDT"


# Testing other tf-idf 

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [14]:
data = pd.read_csv('work\\fox_news.csv')

corpus_column = 'Corpus'

num_topics = 5
vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords)
nmf_model = NMF(n_components=num_topics, random_state=42)
for category in data['category'].unique():
    print(f"Category: {category}")
    category_data = data[data['category'] == category]
    corpus = category_data[corpus_column].values
    tfidf_matrix = vectorizer.fit_transform(corpus)
    nmf_model.fit(tfidf_matrix)
    feature_names = vectorizer.get_feature_names_out()
    top_words_per_topic = []
    for topic_idx, topic in enumerate(nmf_model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        top_words_per_topic.append(top_words)
    topic_names = [f"{category.capitalize()} Topic {i+1}" for i in range(num_topics)]
    for i, top_words in enumerate(top_words_per_topic):
        print(f"{topic_names[i]}: {', '.join(top_words)}")
    print()


Category: politics


TypeError: 'WordListCorpusReader' object is not iterable

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b5372552-bc21-445b-8f65-92a12c531f6c' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>