### Import necessary libraries

In [10]:
from bs4 import BeautifulSoup
import requests
import re

### Set the URL to scrape

In [11]:
# NOTE: despite its name, `url` holds the HTTP response object, not the address string.
# The timeout keeps this cell from hanging indefinitely if the server never answers.
url = requests.get('https://cimsec.org/lead-the-fight-against-climate-change-and-transnational-crime-in-the-indian-ocean/', timeout=30)
# Stop here on a 4xx/5xx reply instead of silently parsing an error page as the article.
url.raise_for_status()
# Parse the returned HTML with Python's built-in parser.
soup = BeautifulSoup(url.text, "html.parser")

### Scrape the content and preprocess the data

In [14]:
def clean_text(text):
    """Normalise raw text so it can be split into countable words.

    The input is lowercased, then every character that is not an ASCII
    letter, a digit or whitespace is deleted. Characters are removed, not
    replaced by a space, so "Rules-Based" becomes "rulesbased" and "U.S."
    becomes "us". Whitespace (including newlines) is left untouched.

    Parameters
    ----------
    text : str
        The text to normalise.

    Returns
    -------
    str
        The lowercased text with punctuation and other symbols stripped.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

def text_to_word(soup, tag="div", class_="entry-content"):
    """Collect the cleaned words found in the article containers of a page.

    Every element matching ``tag`` and ``class_`` is converted to plain
    text, normalised with ``clean_text`` and split on whitespace.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        The parsed HTML document.
    tag : str, optional
        Name of the HTML tag that wraps the article body. Sites use different
        markup, so override this per site. Defaults to ``"div"``.
    class_ : str, optional
        CSS class of the wrapping element. Defaults to ``"entry-content"``.

    Returns
    -------
    list of str
        The words of all matching elements, in document order. The list is
        empty when nothing matches.
    """
    # NOTE(review): if matching elements are nested inside one another, the
    # inner text is presumably collected twice -- confirm against the page markup.
    text_wordlist = []
    for container in soup.find_all(tag, class_=class_):
        # Separate text fragments with newlines so words from adjacent tags do not fuse.
        text = container.get_text(separator='\n')
        text_wordlist.extend(clean_text(text).split())
    return text_wordlist

# Tokenise the article that was fetched and parsed in the cells above.
text_wordlist = text_to_word(soup)

# Show only a preview: printing every token floods the cell output.
print(text_wordlist[:100])

['notes', 'to', 'the', 'new', 'cno', 'series', 'by', 'commander', 'amila', 'prasanga', 'sri', 'lankan', 'navy', 'there', 'are', 'vital', 'indian', 'ocean', 'insights', 'regarding', 'island', 'states', 'strategic', 'vulnerabilities', 'related', 'to', 'transnational', 'crime', 'caused', 'by', 'climate', 'change', 'and', 'the', 'usefulness', 'of', 'us', 'naval', 'operations', 'that', 'merit', 'the', 'next', 'cnos', 'chief', 'of', 'naval', 'operations', 'attention', 'these', 'insights', 'align', 'with', 'the', 'commitment', 'to', 'ensuring', 'the', 'us', 'navy', 'remains', 'the', 'preeminent', 'global', 'fighting', 'force', 'and', 'a', 'trusted', 'defender', 'of', 'rulesbased', 'order', 'island', 'states', 'in', 'the', 'indian', 'ocean', 'region', 'face', 'unique', 'geopolitical', 'and', 'environmental', 'challenges', 'their', 'limited', 'landmass', 'vulnerability', 'to', 'rising', 'sea', 'levels', 'and', 'dependence', 'on', 'maritime', 'resources', 'create', 'a', 'delicate', 'equilibrium'

### Count how often each word appears in the text

In [15]:
from collections import Counter

# Tally how many times each cleaned token occurs in the article.
word_count = Counter()
word_count.update(text_wordlist)

### Using pandas to form a dataframe

In [16]:
import pandas as pd

In [17]:
# One row per distinct word: materialise the (word, count) pairs and label the two columns.
df = pd.DataFrame(
    data=list(word_count.items()),
    columns=['word', 'count'],
)

# Peek at the first rows; words appear in the order they were first seen, not by frequency.
df.head()

Unnamed: 0,word,count
0,notes,1
1,to,16
2,the,32
3,new,1
4,cno,2


####
#### I searched for "fish crime", "ocean crime", "fish crime ship", etc. on Google.
#### Sometimes the results contain irrelevant articles, such as "stolen fish from restaurants".
#### My first thought for an algorithm that detects relevant information is to use a naive Bayes model, like a spam-email detector.
####
#### A technical question: can we automatically collect the top 20 results from a search engine, instead of fetching them one by one as my code does?
#### If we can, that would save us a lot of time!
####


In [None]:
# For the search engines... here are some examples to start with:

# Google News: https://news.google.com/home?hl=en-CA&gl=CA&ceid=CA:en
# Yahoo News: https://ca.news.yahoo.com/

#### North America
# Global News: https://globalnews.ca/
# CNN: https://www.cnn.com/
# CBC News: https://www.cbc.ca/news
# NBC News: https://www.nbcnews.com/
# Maritime Crimes: https://maritimescrimes.com/

#### Europe
# BBC News: https://www.bbc.com/news

#### South Korea (These were easy to find, just because I'm Korean haha)
# Korea Herald: https://www.koreaherald.com/
# Korea Times: https://www.koreatimes.co.kr/