### Scraping News Data From https://www.noaa.gov/

In [1]:
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

class DataHarvester(ABC):
    """Abstract interface for objects that collect a news feed.

    Concrete subclasses must override :meth:`get_news_feed`; the class
    itself cannot be instantiated while that method is still abstract.
    """

    @abstractmethod
    def get_news_feed(self):
        """Collect the news feed; the base implementation returns ``None``."""
    

class ExtractData(DataHarvester):
    """Scrape NOAA news listing pages into pandas DataFrames.

    Without keywords the combined all-categories news listing is crawled;
    with keywords each keyword's ``topic-tags`` listing is crawled in turn.
    Every listing is paged through until an empty page, a non-200 response,
    a network error, or ``max_pages`` is reached.

    Args:
        keywords: Optional iterable of topic keywords. A single string is
            treated as one keyword. ``None`` or an empty iterable selects
            the all-categories listing.
        max_pages: Maximum number of pages requested per listing.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl indefinitely.
    """

    # Listing used when no keywords are supplied.
    _ALL_NEWS_URL = "https://www.noaa.gov/content/Weather+Climate+Ocean-&-Coasts+Fisheries+Satellites+Research+Marine-&-Aviation+Charting+Across-NOAA+Sanctuaries/news"
    # Prefix of the per-keyword listing; the keyword slug is appended.
    _TOPIC_URL = "https://www.noaa.gov/topic-tags/"
    # Placeholder stored when an article carries no tags of a given kind.
    _MISSING = 'missing'

    def __init__(self, keywords=None, max_pages=20, timeout=30):
        # A bare string would otherwise be iterated character by character.
        if isinstance(keywords, str):
            keywords = [keywords]
        self.keywords = keywords if keywords else None
        self.max_pages = max_pages
        self.timeout = timeout

    def get_news_feed(self):
        """Crawl the configured listing(s).

        Returns:
            A list with one DataFrame per successfully scraped page (empty
            if nothing could be collected).
        """
        if self.keywords is None:
            return self._harvest_pages(self._ALL_NEWS_URL)

        frames = []
        for keyword in self.keywords:
            # Topic URLs use hyphens where the tag text has spaces.
            slug = keyword.replace(' ', '-')
            frames.extend(self._harvest_pages(self._TOPIC_URL + slug))
        return frames

    def _harvest_pages(self, base_url):
        """Page through ``base_url`` and return the non-empty page frames."""
        frames = []
        for page in range(self.max_pages):
            url = f"{base_url}?page={page}"
            print("working on :")
            print(url)
            try:
                response = requests.get(url, timeout=self.timeout)
            except requests.RequestException as exc:
                # Keep what was collected so far instead of losing it to a crash.
                print("Problem with Connection.....")
                print("error : ", exc)
                break

            if response.status_code != 200:
                print("Problem with Connection.....")
                print("code : ", response.status_code)
                break

            data = self.extract_data(BeautifulSoup(response.text, 'html.parser'))
            if data.empty:
                # An empty page marks the end of the listing.
                print("No more data available....")
                break

            frames.append(data)
            print("Data Collected!...")
        return frames

    @staticmethod
    def _text_or_nan(tag):
        """Return ``tag.text`` unless the tag is absent or blank, else NaN."""
        if tag is None:
            return np.nan
        text = tag.text
        return text if text and text.strip() else np.nan

    @classmethod
    def _read_tag_group(cls, article, css_class, default_label):
        """Return ``(label, values)`` for one tag group of an article.

        Starts from the default label and the 'missing' placeholder so the
        result is always defined, even when the article has no
        ``tags-wrapper`` div or the group has no span / ``field__items``.
        """
        label, values = default_label, [cls._MISSING]
        for wrapper in article.find_all("div", {"class": "tags-wrapper"}):
            groups = wrapper.find_all("div", {"class": css_class})
            if not groups:
                label, values = default_label, [cls._MISSING]
                continue
            for group in groups:
                if group.span is not None:
                    label = group.span.text
                for items in group.find_all("div", {"class": "field__items"}):
                    values = [link.text for link in items.find_all('a')]
        return label, values

    def extract_data(self, soup=None):
        """Turn one parsed listing page into a DataFrame.

        Args:
            soup: Parsed page (``BeautifulSoup``). ``None`` yields an empty
                DataFrame.

        Returns:
            DataFrame with one row per ``content-wrapper`` div: ``name``,
            ``date`` and one comma-joined column per tag group (column names
            come from the page, defaulting to ``'Focus areas:'`` and
            ``'Topics:'``).
        """
        if soup is None:
            return pd.DataFrame([])

        rows = []
        for article in soup.find_all("div", {"class": "content-wrapper"}):
            focus_label, focus_values = self._read_tag_group(
                article, "tags focus-areas", 'Focus areas:')
            topic_label, topic_values = self._read_tag_group(
                article, "tags topic-tags", 'Topics:')

            row = {}
            row['name'] = self._text_or_nan(article.a)
            row['date'] = self._text_or_nan(article.time)
            row[focus_label] = ','.join(focus_values)
            row[topic_label] = ','.join(topic_values)
            rows.append(row)

        return pd.DataFrame(rows)

### Extracting data without keywords

In [2]:
# Crawl the all-categories news listing (no keyword filter).
obj = ExtractData()
data = obj.get_news_feed()
if data:
    # One DataFrame per scraped page: stack them and save the result.
    df = pd.concat(data, ignore_index=True)
    df.to_csv('./dataset_without_kw.csv', sep=',')
else:
    print("No data")

working on :
https://www.noaa.gov/content/Weather+Climate+Ocean-&-Coasts+Fisheries+Satellites+Research+Marine-&-Aviation+Charting+Across-NOAA+Sanctuaries/news?page=0
Data Collected!...
working on :
https://www.noaa.gov/content/Weather+Climate+Ocean-&-Coasts+Fisheries+Satellites+Research+Marine-&-Aviation+Charting+Across-NOAA+Sanctuaries/news?page=1
Data Collected!...
working on :
https://www.noaa.gov/content/Weather+Climate+Ocean-&-Coasts+Fisheries+Satellites+Research+Marine-&-Aviation+Charting+Across-NOAA+Sanctuaries/news?page=2
Data Collected!...
working on :
https://www.noaa.gov/content/Weather+Climate+Ocean-&-Coasts+Fisheries+Satellites+Research+Marine-&-Aviation+Charting+Across-NOAA+Sanctuaries/news?page=3
Data Collected!...
working on :
https://www.noaa.gov/content/Weather+Climate+Ocean-&-Coasts+Fisheries+Satellites+Research+Marine-&-Aviation+Charting+Across-NOAA+Sanctuaries/news?page=4
Data Collected!...
working on :
https://www.noaa.gov/content/Weather+Climate+Ocean-&-Coasts+Fi

### Extracting data with keywords

In [11]:
# Topic tags to crawl; each one maps to its own topic-tags listing.
keywords = [
    'weather',
    'research',
    'ocean',
    'climate',
    'holidays',
    'historic climate record',
    'winter',
    'fall',
    'forecasts',
    'Weather-ready Nation',
    'NOAA 50th Anniversary',
    'heritage',
    'Oral histories',
    'forecasting',
    'science and technology',
    'research cruise',
    'climate analyses and statistics',
    'wildfires',
    'earth science',
]

obj = ExtractData(keywords)
data = obj.get_news_feed()
if data:
    # One DataFrame per scraped page: stack them and save the result.
    kw_df = pd.concat(data, ignore_index=True)
    kw_df.to_csv('./dataset_with_kw.csv', sep=',')
else:
    print("No data")

working on :
https://www.noaa.gov/topic-tags/weather?page=0
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/weather?page=1
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/weather?page=2
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/weather?page=3
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/weather?page=4
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/weather?page=5
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/weather?page=6
No more data available....
working on :
https://www.noaa.gov/topic-tags/research?page=0
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/research?page=1
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/research?page=2
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/research?page=3
Data Collected!...
working on :
https://www.noaa.gov/topic-tags/research?page=4
Data Collected!...
working on :
https://www.noaa.gov/topic

In [12]:
# Show the keyword-filtered table as the cell's output.
# NOTE(review): the captured output has an "Unnamed: 0" column, so this kw_df
# was presumably re-read from the saved CSV (written with its index) — confirm.
kw_df

Unnamed: 0,name,date,Focus areas:,Topics:
0,Raytheon Intelligence and Space to lead new ce...,"April 26, 2021",Research,weather
1,"NOAA report highlights 2020 climate, weather, ...","April 14, 2021",Research,"research,ocean,weather,climate"
2,Weather education for a Weather-Ready Nation,"December 22, 2020",Education,weather
3,What are your chances for a white Christmas th...,"December 7, 2020","Climate,Weather","holidays,weather,historic climate record,winter"
4,Thanksgiving Weekend forecast: Rain and snowfa...,"November 24, 2020",Weather,"fall,holidays,weather,forecasts"
...,...,...,...,...
716,Science in service to the planet we call home,"April 16, 2018",Across NOAA,"Earth Day,earth science,research"
717,NOAA Science Report highlights 2017 research a...,"March 9, 2018","Across NOAA,Research","research,earth science"
718,Science is kid's play: Online games for school...,"December 22, 2017","Across NOAA,Education","students,education,earth science,games"
719,National Science Teaching Association Conferen...,,missing,missing


### List of Keywords for Search

In [None]:
# Reference list of topic tags that can be passed to ExtractData as keywords.
# The literal is not assigned to a name, so this cell has no lasting effect.
# NOTE(review): 'La Nina' and 'ENSO' each appear three times, and 'missing' is
# the placeholder extract_data writes for untagged articles rather than a real
# topic tag — presumably this list was pasted from the unique-topic cell's
# output; confirm and de-duplicate before using it as search input.
['weather', 'research', 'ocean', 'climate', 'holidays', 'historic climate record', 'winter', 'fall', 'forecasts',
 'Weather-ready Nation', 'NOAA 50th Anniversary', 'heritage', 'Oral histories', 'forecasting', 'science and technology',
 'research cruise', 'climate analyses and statistics', 'wildfires', 'hurricanes', 'preparedness', 'spring',
 'flooding', 'weather safety', 'lightning', 'technology & innovation', 'fisheries', 'geostationary satellite (GOES)', 'water use',
 'AGU meeting', 'extreme events', 'climate change', 'social science research', 'agriculture & farming', 'unmanned aircraft', 'atmosphere',
 'summer', 'climate outlooks', 'infographic', 'missing', 'modeling', 'Antarctica', 'South Pole', 'severe weather', 'tornadoes', 'water',
 'weather conditions', 'students', 'space weather', 'satellites', 'El Nino', 'La Nina', 'ENSO', 'greenhouse gases', 'coral', 'Great Lakes',
 'tsunamis', 'coral reefs', 'marine life', 'uncrewed systems', 'unmanned systems', 'podcast', 'unmanned vehicles', 'sea turtles', 'conservation',
 'air quality', 'COVID-19', 'human health', 'ocean exploration', 'strategic plans', 'climate system', 'Arctic', 'sea ice', 'unmanned underwater vehicles',
 'fish', 'ozone', 'Sea Grant', 'grants', 'Shark Week', 'heat island', 'heat waves', 'citizen science', 'water quality', 'NOAA ships',
 'measuring and modeling climate', 'Earth Day', 'earth science', 'Earth observations', 'whales', 'climate data', 'climate science',
 'global warming', 'extreme weather', 'history', 'marine mammals', 'strandings', 'hurricane hunters',
 'endangered species', 'surveys', 'aerial imagery', 'carbon capture and storage', 'La Nina', 'ENSO', 'safety', 'beach safety', 'wind energy',
 'charts and maps', 'partnerships', 'nautical charts and maps', 'satellite data', 'ocean facts', 'World Ocean Day', 'harmful algal blooms (HABs)',
 'economic data', 'carbon dioxide', 'published research', 'measurements and observations', 'coasts', 'Story maps', 'tides and currents', 'marine debris',
 'coastal economies', 'Office of Education', 'La Nina', 'ENSO', 'Climate Normals', 'temperature rankings', 'precipitation', 'paleoclimatology', 'drought',
 'global average temperatures', 'ocean acidification', 'seafood', 'winter storm']

In [6]:
# Collect every distinct topic tag in kw_df, in order of first appearance.
data = kw_df['Topics:'].str.split(',')
chk = []
seen = set()  # O(1) membership test; chk keeps the first-seen order
# Rows whose 'Topics:' value is NaN split to NaN and cannot be iterated.
for tags in data.dropna():
    for tag in tags:
        # Strip BEFORE the membership test: checking the raw tag while
        # appending the stripped one lets whitespace-padded tags in as duplicates.
        tag = tag.strip()
        if tag not in seen:
            seen.add(tag)
            chk.append(tag)
print(chk)

['weather', 'research', 'ocean', 'climate', 'holidays', 'historic climate record', 'winter', 'fall', 'forecasts', 'Weather-ready Nation', 'NOAA 50th Anniversary', 'heritage', 'Oral histories', 'forecasting', 'science and technology', 'research cruise', 'climate analyses and statistics', 'wildfires', 'education', 'hurricanes', 'preparedness', 'spring', 'flooding', 'weather safety', 'lightning', 'technology & innovation', 'fisheries', 'geostationary satellite (GOES)', 'water use', 'AGU meeting', 'extreme events', 'climate change', 'social science research', 'agriculture & farming', 'unmanned aircraft', 'atmosphere', 'summer', 'climate outlooks', 'infographic', 'missing', 'modeling', 'Antarctica', 'South Pole', 'severe weather', 'tornadoes', 'water', 'weather conditions', 'students', 'space weather', 'satellites', 'El Nino', 'La Nina', 'ENSO', 'greenhouse gases', 'coral', 'Great Lakes', 'tsunamis', 'coral reefs', 'marine life', 'uncrewed systems', 'unmanned systems', 'podcast', 'unmanned 