In [1]:
!pip install -U spacy
!python -m spacy download en_core_web_md

Requirement already up-to-date: spacy in /usr/local/lib/python3.6/dist-packages (2.3.2)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [2]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from collections import Counter
import string

import spacy
import en_core_web_md
nlp = en_core_web_md.load(parse=True, tag=True, entity=True)

import nltk
nltk.download('stopwords')
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stop_words = set(stopword_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
demo_sites = ['https://inshorts.com/en/read/science',
              'https://inshorts.com/en/read/world',
              'https://inshorts.com/en/read/technology']

In [4]:
def generate_dataset(demo_sites):
    news_data = []

    for url in demo_sites:
        category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')       
        news_articles = [{'headline': headline.find('span', attrs={"itemprop": "headline"}).string,
                          'article': article.find('div', attrs={"itemprop": "articleBody"}).string}
                           for headline, article in 
                             zip(soup.find_all('div', class_=["news-card-title news-right-box"]),
                                 soup.find_all('div', class_=["news-card-content news-right-box"]))
                        ]
        news_data.extend(news_articles)
        
    df =  pd.DataFrame(news_data)
    df = df[['headline', 'article']]
    return df

In [5]:
news_df = generate_dataset(demo_sites)
news_df.head(10)

Unnamed: 0,headline,article
0,Doctors find roundworm in Japanese woman's ton...,After a 25-year-old woman in Tokyo visited the...
1,"Moderna COVID-19 vaccine safe, induces immune ...",Moderna's experimental COVID-19 vaccine showed...
2,Moderna to start final-stage COVID-19 vaccine ...,Moderna on Tuesday said it plans to start its ...
3,Researchers make medical wearable using a penc...,The University of Missouri researchers have us...
4,SpaceX seeks to fly its prototype Mars rocket ...,Elon Musk's SpaceX is looking to fly its 'Star...
5,UAE postpones launch of Mars orbiter citing we...,The UAE has postponed the launch of its Mars m...
6,NASA delays launch of $10B James Webb Space Te...,NASA has again delayed the launch of its next-...
7,"No, we did not change the zodiac: NASA after r...",After reports of NASA adding a new zodiac sign...
8,Metal-eating bacteria discovered as scientist ...,Microbiologists from California Institute of T...
9,Scientists recover 'severely' damaged human lu...,"Researchers have recovered donated lungs, deem..."


In [6]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    text_strip = soup.get_text()
    return text_strip

def remove_special_characters(text, remove_digit=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digit else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

def stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatizer(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered = [token for token in tokens if not token in stop_words]
    filtered_text = ' '.join(filtered)
    return filtered_text


In [7]:
def normalize_corpus(corpus, strip_html=True, text_lower_case=True,
                     lemmatization=True, remove_special_chars=True,
                     remove_stopword=True, remove_digits=True):

    normalized_text = []
    for text in corpus:
        # strip HTML
        if strip_html:
           text = strip_html_tags(text)
        # lowercase the text
        if text_lower_case:
           text = text.lower()
        # remove extra newlines
        text = re.sub(r'[\r|\n|\r\n]+', ' ', text)
        # lemmatize text
        if lemmatization:
            text = lemmatizer(text)
        # remove special characters and\or digits    
        if remove_special_chars:
            # isolate special characters by inserting spaces    
            pattern = re.compile(r'([{.(-)!}])')
            text = pattern.sub(" \\1 ", text)
            text = remove_special_characters(text, remove_digit=remove_digits)
        # remove extra whitespace
        text = re.sub(' +', ' ', text)
        # remove stopwords
        if remove_stopword:
           text = remove_stopwords(text)
        
        normalized_text.append(text)

    return normalized_text

In [8]:
# combining headline and article text
news_df['complete_text'] = news_df["headline"].map(str)+ '. ' + news_df["article"]

# create a basic pre-processed corpus
corpus = normalize_corpus(news_df['complete_text'], remove_digits=False)

# Selecting a sample news article
sentence = str(corpus[0:25])
sentence_nlp = nlp(sentence)

# POS tagging with Spacy 
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type']).head(10)

Unnamed: 0,Word,POS tag,Tag type
0,[,-LRB-,PUNCT
1,',``,PUNCT
2,doctor,NN,NOUN
3,find,VB,VERB
4,roundworm,JJ,ADJ
5,japanese,JJ,ADJ
6,woman,NN,NOUN
7,tonsil,NN,NOUN
8,eat,VBP,VERB
9,sashimi,NN,NOUN


In [9]:
c = Counter(([word.pos_ for word in sentence_nlp]))
sbase = sum(c.values())
for el, cnt in c.items():
    print(el, cnt)

PUNCT 77
NOUN 470
VERB 185
ADJ 140
PROPN 232
ADP 11
NUM 96
ADV 40
X 2
PART 4
CCONJ 2
PRON 4
INTJ 2
SCONJ 3


In [10]:
Noun = []
Pronoun = []
Verb = []
Adverb = []
Conjunction = []
Interjection = []
Adjective = []
Preposition = []

for word in sentence_nlp:
    if word.pos_ == "NOUN":
        Noun.append(word)
    elif word.pos_ == "PRON":
        Pronoun.append(word)
    elif word.pos_ == "VERB":
        Verb.append(word)
    elif word.pos_ == "ADV":
        Adverb.append(word)
    elif word.pos_ == "INTJ":
        Interjection.append(word)
    elif word.pos_ == "ADJ":
        Adjective.append(word)
    elif word.pos_ == "ADP":
        Preposition.append(word)
    elif word.pos_ == "SCONJ":
        Conjunction.append(word)

data = [['Noun', Noun, len(Noun)], ['Pronoun', Pronoun, len(Pronoun)],
        ['Verb', Verb, len(Verb)], ['Adverb', Adverb, len(Adverb)],
        ['Conjunction', Conjunction, len(Conjunction)],
        ['Interjection', Interjection, len(Interjection)],
        ['Adjective', Adjective, len(Adjective)],['Preposition', Preposition, len(Preposition)]]

final_df = pd.DataFrame(data, columns = ['Part of Speech', 'Words', 'Word_Count'])
final_df

Unnamed: 0,Part of Speech,Words,Word_Count
0,Noun,"[doctor, woman, tonsil, sashimi, pain, throat,...",470
1,Pronoun,"[us, someone, us, mine]",4
2,Verb,"[find, eat, 25year, visit, find, consume, shed...",185
3,Adverb,"[early, almost, next, eventually, previously, ...",40
4,Conjunction,"[upon, whether, near]",3
5,Interjection,"[no, see]",2
6,Adjective,"[roundworm, japanese, old, uncommon, assorted,...",140
7,Preposition,"[inside, bioelectronic, aboard, journey, aroun...",11
