### Web data retrieval

In [0]:
import requests


import matplotlib.pyplot as plt
import os

%matplotlib inline

In [0]:
url = 'https://inshorts.com/en/read/technology'

In [0]:
news_data = []

In [4]:
news_category = url.split('/')[-1]
news_category

'technology'

Download HTML data

In [5]:
# Fail fast instead of hanging on an unresponsive server, and stop on an HTTP
# error status rather than parsing an error page in the cells below.
data = requests.get(url, timeout=30)
data.raise_for_status()
# Preview only the beginning of the payload; the complete page is very long.
data.content[:500]

b'<!doctype html>\n<html lang="en">\n\n<head>\n  <meta charset="utf-8" />\n  <style>\n    /* The Modal (background) */\n    .modal_contact {\n        display: none; /* Hidden by default */\n        position: fixed; /* Stay in place */\n        z-index: 8; /* Sit on top */\n        left: 0;\n        top: 0;\n        width: 100%; /* Full width */\n        height: 100%;\n        overflow: auto; /* Enable scroll if needed */\n        background-color: rgb(0,0,0); /* Fallback color */\n        background-color: rgba(0,0,0,0.4); /* Black w/ opacity */\n    }\n\n    /* Modal Content/Box */\n    .modal-content {\n        background-color: #fefefe;\n        margin: 15% auto;\n        padding: 20px !important;\n        padding-top: 0 !important;\n        /* border: 1px solid #888; */\n        text-align: center;\n        position: relative;\n        border-radius: 6px;\n    }\n\n    /* The Close Button */\n    .close {\n      left: 90%;\n      color: #aaa;\n      float: right;\n      font-size: 

### Data Cleaning

In [0]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(data.content, 'html.parser')

Read all the articles. For each article, we will read:

1. Headline
2. Article body
3. Category

This is done by reading the text between specific HTML tags. Which tags to use depends on the structure of the actual web page.

In [0]:
# Headline cards and content cards are collected separately and paired by
# position; each pair yields one record tagged with the page's category.
news_articles = [
    dict(news_headline=title_box.find('span', attrs={'itemprop': 'headline'}).string,
         news_article=body_box.find('div', attrs={'itemprop': 'articleBody'}).string,
         news_category=news_category)
    for title_box, body_box in zip(
        soup.find_all('div', class_=['news-card-title news-right-box']),
        soup.find_all('div', class_=['news-card-content news-right-box']))
]

In [8]:
# Add the scraped articles to the running list and inspect the result.
# NOTE: extend() appends in place, so re-running this cell adds the same
# articles to news_data a second time.
news_data.extend(news_articles)
news_data

[{'news_article': 'In a post on OnePlus forums, CEO Pete Lau announced that the OnePlus TV will be launched this September in India. "Our goal is for OnePlus to set the standard for future smart TV products, because we focus on every last detail," added Pete Lau. Users can now register on the "Notify Me" page on Amazon to get updates.',
  'news_category': 'technology',
  'news_headline': 'OnePlus TV set to launch in September, India to get it first'},
 {'news_article': 'Facebook\'s contractor in India for content moderation reviewers, technology outsourcing company Genpact, has raised the minimum wages offered to the reviewers. This comes months after the reviewers had described their work as "underpaid", "stressful" and sometimes "traumatic". However, the recent hikes were reportedly applied only to new recruits and were not applicable to existing workers.',
  'news_category': 'technology',
  'news_headline': 'Facebook India content reviewers get hike after low pay complain'},
 {'news

In [0]:
# Build a DataFrame from the collected records; `columns` fixes the column order
import pandas as pd
df = pd.DataFrame(news_data, columns=['news_headline', 'news_article', 'news_category'])

In [10]:
df.head()

Unnamed: 0,news_headline,news_article,news_category
0,"OnePlus TV set to launch in September, India t...","In a post on OnePlus forums, CEO Pete Lau anno...",technology
1,Facebook India content reviewers get hike afte...,Facebook's contractor in India for content mod...,technology
2,PewDiePie marries girlfriend after 8 years of ...,"Felix Kjellberg, also known as PewDiePie, the ...",technology
3,Microsoft contractors hear chats mistakenly re...,Third-party Microsoft contractors listened to ...,technology
4,Twitter faces outage across several countries ...,Twitter on Wednesday faced outage across sever...,technology


### Function to extract data from inshorts.com

The function will:

1. Take a list of URLs as input
2. Get content for each URL
3. Extract news article headline, body and category

In [0]:
# Pages to scrape. The last path segment of each URL (technology, sports,
# world) is what datasetPrepare uses as the category label of its articles.
urls_list = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

In [0]:
def datasetPrepare(urls_list, timeout=30):
    """Scrape headline, body and category of every article on the given pages.

    Parameters
    ----------
    urls_list : iterable of str
        Page URLs to download. The last path segment of each URL is used as
        the category label of every article found on that page.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``news_headline``,
        ``news_article`` and ``news_category`` (in that order). When no
        article was found the frame is empty but still has these columns.

    Raises
    ------
    requests.exceptions.RequestException
        If a page cannot be downloaded in time or answers with an HTTP error
        status.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in urls_list:
        news_category = url.split('/')[-1]
        data = requests.get(url, timeout=timeout)
        # Do not silently parse an error page as if it held zero articles.
        data.raise_for_status()
        soup = BeautifulSoup(data.content, 'html.parser')
        # Headline cards and content cards are matched up by position.
        headline_boxes = soup.find_all('div', class_=["news-card-title news-right-box"])
        article_boxes = soup.find_all('div', class_=["news-card-content news-right-box"])
        news_data.extend(
            {'news_headline': headline.find('span', attrs={"itemprop": "headline"}).string,
             'news_article': article.find('div', attrs={"itemprop": "articleBody"}).string,
             'news_category': news_category}
            for headline, article in zip(headline_boxes, article_boxes)
        )
    # Passing `columns` keeps the schema stable even when nothing was scraped;
    # selecting the columns afterwards would raise KeyError on an empty frame.
    return pd.DataFrame(news_data, columns=columns)

In [0]:
news_df = datasetPrepare(urls_list)

In [14]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 3 columns):
news_headline    74 non-null object
news_article     74 non-null object
news_category    74 non-null object
dtypes: object(3)
memory usage: 1.8+ KB


In [15]:
news_df.head()

Unnamed: 0,news_headline,news_article,news_category
0,"OnePlus TV set to launch in September, India t...","In a post on OnePlus forums, CEO Pete Lau anno...",technology
1,Facebook India content reviewers get hike afte...,Facebook's contractor in India for content mod...,technology
2,PewDiePie marries girlfriend after 8 years of ...,"Felix Kjellberg, also known as PewDiePie, the ...",technology
3,Microsoft contractors hear chats mistakenly re...,Third-party Microsoft contractors listened to ...,technology
4,Twitter faces outage across several countries ...,Twitter on Wednesday faced outage across sever...,technology


In [16]:
#Articles count by category
news_df.news_category.value_counts()

technology    25
sports        25
world         24
Name: news_category, dtype: int64

### Text Wrangling and Pre-processing

In [0]:
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
import unicodedata

In [0]:
# NOTE(review): 'en' is a model shortcut name rather than a full package name;
# presumably it only resolves on installs where that shortcut has been linked —
# confirm against the spaCy version in use.
nlp = spacy.load('en')

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

# Keep the negations 'no' and 'not' in the text: taking them out of the
# stopword list means remove_stopwords will not filter them.
stopword_list.remove('no')
stopword_list.remove('not')

## Remove HTML tags

In [21]:
def strip_html_tags(text):
    """Return the text content of ``text`` with all HTML markup removed."""
    return BeautifulSoup(text, "html.parser").get_text()

strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

## Remove accented characters

In [22]:
def remove_accented_chars(text):
    """Strip accents from ``text``.

    The string is NFKD-normalized, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

## Remove special characters

In [0]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, optional
        When True, digits are deleted as well (default False).

    Returns
    -------
    str
        ``text`` without the unwanted characters; whitespace is left untouched.
    """
    # The upper-case range must be `A-Z`: `A-z` also spans the ASCII
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [24]:
remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)

'Well this was fun What do you think '

## Text lemmatization

In [0]:
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its lemma, space-joined.

    Tokens whose lemma is the placeholder ``'-PRON-'`` keep their original
    text instead.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
    return ' '.join(lemmas)

In [26]:
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crashed yesterday , ours crash daily'

## Text stemming

In [27]:
def simple_stemmer(text):
    """Porter-stem each whitespace-separated word of ``text`` and rejoin them with single spaces."""
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

## Remove stopwords

In [28]:
def remove_stopwords(text, is_lower_case=False):
    """Drop every token of ``text`` that appears in ``stopword_list``.

    The text is split with the module-level ``tokenizer``. When
    ``is_lower_case`` is False each token is lower-cased for the lookup only;
    the tokens that are kept retain their original casing. The surviving
    tokens are joined with single spaces.
    """
    kept = []
    for raw in tokenizer.tokenize(text):
        token = raw.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

## Building a text normalizer

In [0]:
def normalize_corpus(corpus, html_stripping=True, accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Run the text-cleaning pipeline over every document of ``corpus``.

    The steps are applied in this order; each flag switches one step on or off:

    1. ``html_stripping``        - strip HTML markup (``strip_html_tags``)
    2. ``accented_char_removal`` - strip accents (``remove_accented_chars``)
    3. ``text_lower_case``       - lower-case the text
    4. (always)                  - replace runs of line breaks with one space
    5. ``text_lemmatization``    - lemmatize (``lemmatize_text``)
    6. ``special_char_removal``  - remove special characters; ``remove_digits``
       additionally removes digits and only has an effect within this step
    7. (always)                  - collapse repeated spaces
    8. ``stopword_removal``      - remove stopwords (``remove_stopwords``)

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.

    Returns
    -------
    list of str
        The normalized documents, in input order.
    """
    # Compiled once, outside the loop. The newline class must not contain '|':
    # inside [...] it is a literal pipe, not alternation, so the previous
    # pattern also turned every '|' of the text into a space.
    newline_pattern = re.compile(r'[\r\n]+')
    # Inside this class `(-)` is a range that covers just '(' and ')', so
    # hyphens are not isolated by this pattern.
    special_char_pattern = re.compile(r'([{.(-)!}])')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace line breaks with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # pad the matched characters with spaces so the words around them
            # stay separated once the characters themselves are removed
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

## Pre-process and normalize news articles

In [0]:
# Combine headline and article body into one document per row, separated by '. '
news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]

In [31]:
# Normalize every combined document with the default pipeline settings
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = list(news_df['clean_text'])
# Show the raw and the cleaned text of the second article side by side
news_df.iloc[1][['full_text', 'clean_text']].to_dict()

{'clean_text': 'facebook india content reviewer get hike low pay complain facebook contractor india content moderation reviewer technology outsource company genpact raise minimum wage offer reviewer come month reviewer describe work underpaid stressful sometimes traumatic however recent hike reportedly apply new recruit not applicable exist worker',
 'full_text': 'Facebook India content reviewers get hike after low pay complain. Facebook\'s contractor in India for content moderation reviewers, technology outsourcing company Genpact, has raised the minimum wages offered to the reviewers. This comes months after the reviewers had described their work as "underpaid", "stressful" and sometimes "traumatic". However, the recent hikes were reportedly applied only to new recruits and were not applicable to existing workers.'}

# Save the news articles

In [0]:
news_df.to_csv('news.csv', index=False, encoding='utf-8')

# Tagging Parts of Speech

In [0]:
news_df = pd.read_csv('news.csv')

In [0]:
# Lighter normalization: lower-casing, lemmatization and special-character
# removal are switched off; the remaining steps keep their defaults.
corpus = normalize_corpus(news_df['full_text'], text_lower_case=False, 
                          text_lemmatization=False, special_char_removal=False)

# Parse the headline of the second article with the spaCy pipeline
sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)

In [35]:
# One row per token of the parsed headline: the token with its tag_ and pos_ attributes
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

Unnamed: 0,Word,POS tag,Tag type
0,Facebook,NNP,PROPN
1,India,NNP,PROPN
2,content,NN,NOUN
3,reviewers,NNS,NOUN
4,get,VBP,VERB
5,hike,NN,NOUN
6,after,IN,ADP
7,low,JJ,ADJ
8,pay,NN,NOUN
9,complain,NN,NOUN


In [0]:
from spacy import displacy

In [37]:
displacy.render(sentence_nlp, style="dep", jupyter=True)

# Named Entity Recognition

In [0]:
sentence = str(news_df.iloc[1].full_text)

In [0]:
sentence_nlp = nlp(sentence)

In [40]:
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

[(India, 'GPE'), (India, 'GPE'), (Genpact, 'PERSON'), (months, 'DATE')]


In [41]:
displacy.render(sentence_nlp, style='ent', jupyter=True)