## Text processing walkthrough based on the blog post "A Practitioner's Guide to Natural Language Processing (Part I)"

https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

# Show up to 99 rows/columns before pandas truncates a displayed frame.
# Use the canonical option names: the dotted spellings ("display.max.rows")
# only worked because pandas treats the key as a regex, where "." matches "_".
pd.set_option("display.max_rows", 99)
pd.set_option("display.max_columns", 99)


### Create a function to fetch the news articles and parse them

In [2]:
# Category landing pages to scrape; the final path segment of each URL is
# later used as the category label.
seed_url = [
    "https://inshorts.com/en/read/technology",
    "https://inshorts.com/en/read/sports",
    "https://inshorts.com/en/read/world",
]

def build_dataset(seed_url, timeout=30):
    """Download each category page and collect its news cards in a DataFrame.

    Parameters
    ----------
    seed_url : iterable of str
        Page URLs to fetch. The text after the last "/" of each URL is used
        as the ``news_category`` value for every row scraped from that page.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per news card with columns ``news_headline``,
        ``news_article`` and ``news_category``.

    Raises
    ------
    requests.HTTPError
        If a page responds with a 4xx/5xx status.
    requests.Timeout
        If a page does not respond within ``timeout`` seconds.
    """
    news_data = []
    for link in seed_url:
        news_category = link.split("/")[-1]

        # Bound the wait and fail loudly on an error response; otherwise an
        # error page would parse "successfully" and silently contribute no rows.
        data = requests.get(link, timeout=timeout)
        data.raise_for_status()
        cleaned_data = BeautifulSoup(data.content, 'html.parser')

        # Parse the HTML to create structured content.
        # NOTE(review): headline and article divs are paired purely by their
        # position on the page; this presumes both lists line up one-to-one.
        headlines = cleaned_data.find_all('div', class_=["news-card-title"])
        articles = cleaned_data.find_all('div', class_=["news-card-content"])
        for headline, article in zip(headlines, articles):
            news_data.append({
                'news_headline': headline.find('span', attrs={"itemprop": "headline"}).string,
                'news_article': article.find('div', attrs={"itemprop": "articleBody"}).string,
                'news_category': news_category,
            })

    df = pd.DataFrame(news_data, columns=['news_headline', 'news_article', 'news_category'])
    return df

### Call `build_dataset` to create the DataFrame

In [3]:
# Scrape every seed page, then preview the first ten rows of the result.
news_df = build_dataset(seed_url=seed_url)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,What's the story behind egg pic that broke Kyl...,Instagram account that broke Kylie Jenner's re...,technology
1,Facebook Messenger now lets users unsend mess...,"Facebook Messenger has globally rolled out ""Re...",technology
2,Even my kids said it was cringey: YouTube CEO ...,YouTube CEO Susan Wojcicki has revealed even h...,technology
3,I hated being only woman in the room: Randi Zu...,Facebook Co-founder Mark Zuckerberg's elder si...,technology
4,I'm going to shoot up a school: US boy tells S...,"A middle school student in Indiana, US got a l...",technology
5,Supermodel Tyra Banks to open tech-based attra...,American supermodel-turned-entrepreneur Tyra B...,technology
6,WhatsApp to ban political parties if misused a...,WhatsApp's Communications head Carl Woog warne...,technology
7,Facebook agrees to share user chat details wit...,Facebook has agreed to share 'Messenger' chat ...,technology
8,Flickr deletes photos of free accounts going o...,Photo and video-hosting platform Flickr on Tue...,technology
9,Reddit to raise up to $300mn at about $3bn val...,Online discussions platform Reddit may raise $...,technology


In [4]:
# Number of scraped stories per category
# (pass normalize=True to value_counts for proportions instead of counts).
news_df["news_category"].value_counts()

sports        25
world         25
technology    24
Name: news_category, dtype: int64

In [5]:
# get the necessary nltk tokenizers and unicodedata and contraction maps
import nltk
import re
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import unicodedata
from contractions import CONTRACTION_MAP


In [6]:
# Shared NLP resources used by the pre-processing functions below.

# NLTK's English stopword list, minus the negations 'no' and 'not' so that
# they are NOT stripped from the text during stopword removal.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('no')
stopwords.remove('not')

# Tokenizer used to split text into tokens before stopword filtering.
tokenizer = ToktokTokenizer()

# spaCy English pipeline used for lemmatization and POS tagging.
nlp = spacy.load('en', parse=True, tag=True, entity=True)

### The text pre-processing functions

In [7]:
def clean_html(data):
    """Parse `data` as HTML and return only its text content."""
    return BeautifulSoup(data, "html.parser").get_text()

def remove_accented_characters(text):
    """Return `text` reduced to plain ASCII.

    The string is NFKD-normalized so accented letters split into a base
    letter plus combining marks; encoding to ASCII with errors ignored then
    drops the marks (and any other non-ASCII character).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# This method is for expanding contractions, read up on how the regex and match works
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in `text` with their expanded forms.

    Parameters
    ----------
    text : str
        Input text.
    contraction_mapping : dict of str -> str
        Maps a contraction (e.g. "can't") to its expansion (e.g. "cannot").
        Matching against the text is case-insensitive.

    Returns
    -------
    str
        `text` with every matched contraction expanded, keeping the first
        character of the original match (so its capitalization survives),
        and with all remaining apostrophes removed.
    """
    if not contraction_mapping:
        # No keys would compile to an empty pattern that matches everywhere
        # and breaks on `match[0]`; only the apostrophe stripping applies.
        return re.sub("'", "", text)

    # Longest keys first so "can't've" is tried before its prefix "can't";
    # escape each key so it is matched literally rather than as regex syntax.
    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    # The pattern ignores case, so a match may differ in case from every key
    # (e.g. "i'd" in the text vs. an "I'd" key); keep a case-folded fallback.
    lowercase_lookup = {key.lower(): value
                        for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowercase_lookup.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion (e.g. mapped to an empty string): keep the text.
            return match
        # Keep the original first character to preserve capitalization.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def remove_special_chars(text, delete_num=False):
    """Delete every character that is not an ASCII letter or whitespace.

    Digits are kept as well unless `delete_num` is true.
    """
    if delete_num:
        disallowed = r'[^A-Za-z\s]'
    else:
        disallowed = r'[^A-Za-z0-9\s]'
    return re.sub(disallowed, "", text)

def simple_stemmer(text):
    """Porter-stem each whitespace-separated word of `text`.

    Returns the stems joined by single spaces.
    """
    stemmer = nltk.stem.PorterStemmer()
    stems = (stemmer.stem(token) for token in text.split())
    return " ".join(stems)

def simple_lemmatization(text):
    """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

    Tokens whose lemma is the "-PRON-" placeholder keep their original text;
    all other tokens are replaced by their lemma. The results are joined by
    single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == "-PRON-":
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

def remove_stopwords(text, to_lower=False):
    """Drop tokens of `text` that appear in the module-level `stopwords` list.

    `to_lower` does not lowercase the output. When true, tokens are compared
    against the stopword list exactly as written; when false (default), each
    token is lowercased for the comparison only, so "The" is filtered too.
    The surviving tokens keep their original case and are joined by spaces.
    """
    stripped_tokens = (token.strip() for token in tokenizer.tokenize(text))
    if to_lower:
        kept = [token for token in stripped_tokens if token not in stopwords]
    else:
        kept = [token for token in stripped_tokens
                if token.lower() not in stopwords]
    return " ".join(kept)

### Now combine all the methods into the pipeline

In [8]:
def text_preprocessing(corpus, html_cleaner=True, remove_acccented_chars = True, contractions_expansion=True,
                      rmv_spl_chars = True, to_lower_case=True, smpl_stemmer=True, smpl_lemmatization=True,
                      rmv_stopwords=True):
    """Normalize every document in `corpus` and return the results as a list.

    Each boolean flag switches one step on or off. Enabled steps always run
    in this order: HTML stripping, accent removal, contraction expansion,
    special-character removal, lowercasing, stemming, lemmatization and
    stopword removal. Note that stemming and lemmatization are both enabled
    by default, so the lemmatizer receives already-stemmed text.
    """
    def _normalize(doc):
        # Apply the enabled steps to a single document, in pipeline order.
        if html_cleaner:
            doc = clean_html(doc)
        if remove_acccented_chars:
            doc = remove_accented_characters(doc)
        if contractions_expansion:
            doc = expand_contractions(doc)
        if rmv_spl_chars:
            doc = remove_special_chars(doc)
        if to_lower_case:
            doc = doc.lower()
        if smpl_stemmer:
            doc = simple_stemmer(doc)
        if smpl_lemmatization:
            doc = simple_lemmatization(doc)
        if rmv_stopwords:
            doc = remove_stopwords(doc)
        return doc

    return [_normalize(doc) for doc in corpus]
        
    
    

In [9]:
# Build one text field per story (headline + article body) and normalize it.
# build_dataset reads both fields with BeautifulSoup's `.string`, which is None
# for a tag with nested markup; fill those gaps so the concatenation does not
# become NaN and crash the string-based pre-processing steps.
news_df['full_text'] = (news_df['news_headline'].fillna('')
                        + ". "
                        + news_df['news_article'].fillna(''))
news_df['clean_text'] = text_preprocessing(news_df['full_text'])

# save the news_df to the csv file
news_df.to_csv('news.csv', index=False, encoding='utf-8')

# Show one sample of raw vs. cleaned text. This has to be the cell's last
# expression, otherwise the notebook does not render it.
news_df[["full_text", "clean_text"]].iloc[7].to_dict()

### Understanding Language and Structure

In [24]:
# Headline of the eighth scraped story, used as the tagging example.
sentence = str(news_df.iloc[7]["news_headline"])

# POS-tag the headline with spaCy and tabulate each token's tag_ and pos_.
# (Prefix the nlp(...) call with %timeit to benchmark spaCy's tagging.)
sentence_nlp = nlp(sentence)
spacy_pos_tagged = [(token, token.tag_, token.pos_) for token in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

Unnamed: 0,Word,POS tag,Tag type
0,Facebook,NNP,PROPN
1,agrees,VBZ,VERB
2,to,TO,PART
3,share,VB,VERB
4,user,NN,NOUN
5,chat,NN,NOUN
6,details,NNS,NOUN
7,with,IN,ADP
8,Delhi,NNP,PROPN
9,Police,NNP,PROPN


In [25]:
# POS-tag the same headline with NLTK, after splitting it on whitespace.
# (Prefix the nltk.pos_tag(...) call with %timeit to benchmark NLTK's tagging.)
nltk_pos_tagged = nltk.pos_tag(sentence.split())
pd.DataFrame(data=nltk_pos_tagged, columns=['Word', 'POS tag'])


Unnamed: 0,Word,POS tag
0,Facebook,NNP
1,agrees,VBZ
2,to,TO
3,share,NN
4,user,JJ
5,chat,NN
6,details,NNS
7,with,IN
8,Delhi,NNP
9,Police,NNP


In [26]:
# Tag a simple example sentence with spaCy and show (token, tag_, pos_) triples.
sent = "The quick brown fox jumped over the lazy dog"
sent_nlp = nlp(sent)
spacy_pos_tagged = [(token, token.tag_, token.pos_) for token in sent_nlp]
spacy_pos_tagged

[(The, 'DT', 'DET'),
 (quick, 'JJ', 'ADJ'),
 (brown, 'JJ', 'ADJ'),
 (fox, 'NN', 'NOUN'),
 (jumped, 'VBD', 'VERB'),
 (over, 'IN', 'ADP'),
 (the, 'DT', 'DET'),
 (lazy, 'JJ', 'ADJ'),
 (dog, 'NN', 'NOUN')]

In [6]:
import re
from nltk.util import ngrams

# Lowercase a sample sentence, split it on single spaces (dropping empty
# pieces), and build its bigrams with nltk.util.ngrams.
s = "Abhik is testing something".lower()
tokens = [piece for piece in s.split(" ") if piece]
print(tokens)

output = list(ngrams(tokens, 2))
print(output)

['abhik', 'is', 'testing', 'something']
[('abhik', 'is'), ('is', 'testing'), ('testing', 'something')]


In [11]:
# Turn each bigram tuple into a single space-separated string.
pt = [" ".join(pair[:2]) for pair in output]
pt

['abhik is', 'is testing', 'testing something']

### Call the functions to test each functionality

In [None]:
# check the clean_html function (print the result so the check is visible)
cleaned_html = clean_html("<html><head>Hello how are you. </head><p>I am here to check the text</p></html>")
print(cleaned_html)

# check the remove accented characters function
cleaned_accented = remove_accented_characters('Sómě Áccěntěd těxt')
print(cleaned_accented)

# check the expand contractions
print(expand_contractions("Y'all can't expand contractions I'd think"))

# strip special characters: digits are kept by default ...
print(remove_special_chars("Hello! How are you ? #865875"))
# ... and removed as well when delete_num=True
print(remove_special_chars("Hello! How are you ? #865875", True))

# call the porter stemmer
text = "Hello! How are you running jumping sleeping"
print(simple_stemmer(text))

# call the simple Lemmatizer
print(simple_lemmatization(text))

# check the remove_stopwords function
print(remove_stopwords("I am going to go to market to eat some lunch"))