In [1]:
import unicodedata
import re
import json
import os
from requests import get
from bs4 import BeautifulSoup
import acquire
from time import strftime

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
from acquire import parse_blog
from acquire import get_article_text
from acquire import get_codeup_blogs
from acquire import get_inshorts_articles
from acquire import prep_text

## Use the 'get_article_text' function from acquire and store the result in a variable.

In [None]:
original = get_article_text()

In [None]:
# Lowercase everything in the text.
article = original.lower()

In [None]:
print(article)

## Remove Accented Characters

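Convert accented characters to their closest ASCII equivalents and drop any characters that have none.
1. 'unicodedata.normalize' with the 'NFKD' form decomposes each accented character into a base character plus separate combining marks, removing inconsistencies in unicode character encoding.
2. '.encode' converts the resulting string to the ASCII character set; the 'ignore' option drops anything that is not ASCII (including the combining marks).
3. '.decode' turns the resulting bytes object back into a string.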

In [None]:
# NFKD decomposition followed by an ASCII round-trip: non-ASCII code points
# are discarded by the 'ignore' error handler during encoding.
article = (
    unicodedata.normalize('NFKD', article)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)

In [None]:
print(article)

## Remove Special Characters

In [None]:
# Remove anything that isn't a-z, a number, single quote, or whitespace.
article = re.sub(r"[^a-z0-9'\s]", '', article)

In [None]:
print(article)

## Tokenization
##### Use nltk to tokenize the strings.

In [None]:
tokenizer = nltk.tokenize.ToktokTokenizer()

In [None]:
# Tokenize the cleaned text (not the raw `original`) and keep the result so
# the tokenization step feeds the cells that follow.
article = tokenizer.tokenize(article, return_str=True)
article

## Stemming and Lemmatization
### Stemming
Reducing words to their root stem. The root stem may not always be an official word found in a dictionary.

In [None]:
# Build the Porter stemmer once; `ps` is reused by later cells.
ps = nltk.porter.PorterStemmer()

# Quick check on three inflections of the same verb.
tuple(ps.stem(form) for form in ('call', 'called', 'calling'))

In [None]:
# Apply stemming transformation to all the words in the article.
stems = [ps.stem(word) for word in article.split()]

In [None]:
# Join each word in 'stems' with a space.
article_stemmed = ' '.join(stems)

In [None]:
print(article_stemmed)

In [None]:
pd.Series(stems).value_counts().head(5)

### Lemmatization
The base form of a lemmatized word is the root word (lemma). Lemmas will always be present in dictionaries.

In [None]:
# WordNet lemmatizer; `wnl` is reused by later cells.
wnl = nltk.stem.WordNetLemmatizer()

# Print the stem and the lemma side by side for a handful of sample words.
for word in ['study', 'studies', 'come', 'coming', 'eat', 'eatery', 'eating', 'eaters']:
    print('stem:', ps.stem(word), '-- lemma:', wnl.lemmatize(word))

In [None]:
lemmas = [wnl.lemmatize(word) for word in article.split()]
article_lemmatized = ' '.join(lemmas)

In [None]:
print(article_lemmatized)

## Removing Stopwords
**stopword:** words that have little to no significance while constructing meaningful features from text.
* Articles, conjunctions, and prepositions are some examples of stopwords.

In [None]:
stopword_list = stopwords.words('english')
#stopword_list.remove('no')
#stopword_list.remove('not')

In [None]:
stopword_list

In [None]:
words = article.split()

In [None]:
# Look words up in a set: membership in the stopword list would be a linear
# scan for every word in the article.
stopword_set = set(stopword_list)
filtered_words = [w for w in words if w not in stopword_set]

print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
print('---')

article_without_stopwords = ' '.join(filtered_words)

print(article_without_stopwords)

In [None]:
def basic_clean(string):
    '''
    Normalize a string for text processing.

    Steps, in order:
    1. Lowercase the text.
    2. Decompose unicode characters (NFKD) and drop everything that is not ASCII.
    3. Remove every character that is not a-z, a digit, a single quote, or whitespace.

    Parameters
    ----------
    string : str
        The raw text to clean.

    Returns
    -------
    str
        The lowercased, ASCII-only text with special characters removed.
    '''
    # Lowercase BEFORE applying the a-z whitelist; otherwise every uppercase
    # letter is stripped out by the regex instead of being kept.
    string = string.lower()
    string = (
        unicodedata.normalize('NFKD', string)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )
    # Remove anything that isn't a-z, a number, single quote, or whitespace.
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    return string

In [None]:
def tokenize(string):
    '''
    Run nltk's ToktokTokenizer over `string` and return the tokenized text
    as a single string (return_str=True).
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    tokenized = toktok.tokenize(string, return_str=True)
    return tokenized

In [None]:
def stem(string):
    '''
    Apply Porter stemming to every whitespace-separated word in `string`
    and return the stems joined back together with single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, string.split()))

In [None]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word in `string` with nltk's
    WordNetLemmatizer and return the lemmas joined with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [None]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    Drop English stopwords from `string`.

    `exclude_words` are taken out of the stopword set (so they stay in the
    text); `extra_words` are added to it (so they are removed as well).
    Prints how many words were removed and returns the remaining words
    joined with single spaces.
    '''
    # Start from nltk's English list, then apply the caller's adjustments.
    stops = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

    tokens = string.split()
    kept = [token for token in tokens if token not in stops]

    print('Removed {} stopwords'.format(len(tokens) - len(kept)))
    print('---')
    return ' '.join(kept)

In [None]:
codeup_df = get_codeup_blogs(cached=True)

In [None]:
codeup_df

In [None]:
# Create a dataframe with the column 'content' dropped and run it
# through the newly created function to see if it performs as 
# expected.

# b = codeup_df.drop(columns='content',inplace = True)

In [None]:
prep_text(codeup_df, 'original')

In [2]:
news_df = get_inshorts_articles()

In [3]:
news_df

Unnamed: 0,title,author,original,date,category
0,"Omicron BA.2 found in 57 countries, doesn't se...",Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...,2022-02-02,science
1,"Indian scientists develop self-disinfecting, w...",Ridham Gambhir,The Ministry of Science and Technology on Frid...,2022-02-04,science
2,"Astronaut spends continuous 300 days in space,...",Ridham Gambhir,NASA has revealed that its astronaut Mark Vand...,2022-02-04,science
3,9.9 crore-year-old flowers found perfectly pre...,Ankush Verma,Two 9.9 crore-year-old flowers have been found...,2022-02-02,science
4,Picture of Mars crater that looks like a tree ...,Pragya Swastik,The European Space Agency (ESA) has released a...,2022-02-02,science
...,...,...,...,...,...
120,Kangana has been a very supportive & endearing...,Udit Gupta,"Nawazuddin Siddiqui, who has wrapped up Kangan...",2022-02-04,entertainment
121,"Riteish Deshmukh, Genelia to star in comedy fi...",Udit Gupta,Riteish Deshmukh and his actress-wife Genelia ...,2022-02-04,entertainment
122,2022 will be a busy year: Disha Patani on upco...,Ramanpreet Singh Virdi,Actress Disha Patani has said 2022 will be a b...,2022-02-04,entertainment
123,Dharma Productions in talks to remake their ol...,Udit Gupta,Karan Johar's Dharma Productions is currently ...,2022-02-04,entertainment


In [4]:
prep_text(news_df, 'original')

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"Omicron BA.2 found in 57 countries, doesn't se...",Omicron BA.2 variant has been found in 57 coun...,omicron ba2 variant found 57 countries said tu...,omicron ba2 variant ha found 57 countri said t...,omicron ba2 variant ha found 57 country said t...
1,"Indian scientists develop self-disinfecting, w...",The Ministry of Science and Technology on Frid...,ministry science technology friday announced t...,ministri scienc technolog friday announc team ...,ministry science technology friday announced t...
2,"Astronaut spends continuous 300 days in space,...",NASA has revealed that its astronaut Mark Vand...,nasa revealed astronaut mark vande hei lived s...,nasa ha reveal astronaut mark vand hei ha live...,nasa ha revealed astronaut mark vande hei ha l...
3,9.9 crore-year-old flowers found perfectly pre...,Two 9.9 crore-year-old flowers have been found...,two 99 croreyearold flowers found perfectly pr...,two 99 croreyearold flower found perfectli pre...,two 99 croreyearold flower found perfectly pre...
4,Picture of Mars crater that looks like a tree ...,The European Space Agency (ESA) has released a...,european space agency esa released picture mar...,european space agenc esa ha releas pictur mar ...,european space agency esa ha released picture ...
...,...,...,...,...,...
120,Kangana has been a very supportive & endearing...,"Nawazuddin Siddiqui, who has wrapped up Kangan...",nawazuddin siddiqui wrapped kangana ranaut ' f...,nawazuddin siddiqui ha wrap kangana ranaut ' f...,nawazuddin siddiqui ha wrapped kangana ranaut ...
121,"Riteish Deshmukh, Genelia to star in comedy fi...",Riteish Deshmukh and his actress-wife Genelia ...,riteish deshmukh actresswife genelia deshmukh ...,riteish deshmukh hi actresswif genelia deshmuk...,riteish deshmukh actresswife genelia deshmukh ...
122,2022 will be a busy year: Disha Patani on upco...,Actress Disha Patani has said 2022 will be a b...,actress disha patani said 2022 busy year ek vi...,actress disha patani ha said 2022 busi year ek...,actress disha patani ha said 2022 busy year ek...
123,Dharma Productions in talks to remake their ol...,Karan Johar's Dharma Productions is currently ...,karan johar ' dharma productions currently tal...,karan johar ' dharma product current talk rema...,karan johar ' dharma production currently talk...


In [5]:
news_df

Unnamed: 0,title,author,original,date,category,clean,stemmed,lemmatized
0,"Omicron BA.2 found in 57 countries, doesn't se...",Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...,2022-02-02,science,omicron ba2 variant found 57 countries said tu...,omicron ba2 variant ha found 57 countri said t...,omicron ba2 variant ha found 57 country said t...
1,"Indian scientists develop self-disinfecting, w...",Ridham Gambhir,The Ministry of Science and Technology on Frid...,2022-02-04,science,ministry science technology friday announced t...,ministri scienc technolog friday announc team ...,ministry science technology friday announced t...
2,"Astronaut spends continuous 300 days in space,...",Ridham Gambhir,NASA has revealed that its astronaut Mark Vand...,2022-02-04,science,nasa revealed astronaut mark vande hei lived s...,nasa ha reveal astronaut mark vand hei ha live...,nasa ha revealed astronaut mark vande hei ha l...
3,9.9 crore-year-old flowers found perfectly pre...,Ankush Verma,Two 9.9 crore-year-old flowers have been found...,2022-02-02,science,two 99 croreyearold flowers found perfectly pr...,two 99 croreyearold flower found perfectli pre...,two 99 croreyearold flower found perfectly pre...
4,Picture of Mars crater that looks like a tree ...,Pragya Swastik,The European Space Agency (ESA) has released a...,2022-02-02,science,european space agency esa released picture mar...,european space agenc esa ha releas pictur mar ...,european space agency esa ha released picture ...
...,...,...,...,...,...,...,...,...
120,Kangana has been a very supportive & endearing...,Udit Gupta,"Nawazuddin Siddiqui, who has wrapped up Kangan...",2022-02-04,entertainment,nawazuddin siddiqui wrapped kangana ranaut ' f...,nawazuddin siddiqui ha wrap kangana ranaut ' f...,nawazuddin siddiqui ha wrapped kangana ranaut ...
121,"Riteish Deshmukh, Genelia to star in comedy fi...",Udit Gupta,Riteish Deshmukh and his actress-wife Genelia ...,2022-02-04,entertainment,riteish deshmukh actresswife genelia deshmukh ...,riteish deshmukh hi actresswif genelia deshmuk...,riteish deshmukh actresswife genelia deshmukh ...
122,2022 will be a busy year: Disha Patani on upco...,Ramanpreet Singh Virdi,Actress Disha Patani has said 2022 will be a b...,2022-02-04,entertainment,actress disha patani said 2022 busy year ek vi...,actress disha patani ha said 2022 busi year ek...,actress disha patani ha said 2022 busy year ek...
123,Dharma Productions in talks to remake their ol...,Udit Gupta,Karan Johar's Dharma Productions is currently ...,2022-02-04,entertainment,karan johar ' dharma productions currently tal...,karan johar ' dharma product current talk rema...,karan johar ' dharma production currently talk...


## Create a separate dataframe for each category.

In [15]:
news_df.category.unique()

array(['science', 'business', 'sports', 'technology', 'entertainment'],
      dtype=object)

In [13]:
news_df[news_df.category == 'science']

Unnamed: 0,title,author,original,date,category,clean,stemmed,lemmatized
0,"Omicron BA.2 found in 57 countries, doesn't se...",Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...,2022-02-02,science,omicron ba2 variant found 57 countries said tu...,omicron ba2 variant ha found 57 countri said t...,omicron ba2 variant ha found 57 country said t...
1,"Indian scientists develop self-disinfecting, w...",Ridham Gambhir,The Ministry of Science and Technology on Frid...,2022-02-04,science,ministry science technology friday announced t...,ministri scienc technolog friday announc team ...,ministry science technology friday announced t...
2,"Astronaut spends continuous 300 days in space,...",Ridham Gambhir,NASA has revealed that its astronaut Mark Vand...,2022-02-04,science,nasa revealed astronaut mark vande hei lived s...,nasa ha reveal astronaut mark vand hei ha live...,nasa ha revealed astronaut mark vande hei ha l...
3,9.9 crore-year-old flowers found perfectly pre...,Ankush Verma,Two 9.9 crore-year-old flowers have been found...,2022-02-02,science,two 99 croreyearold flowers found perfectly pr...,two 99 croreyearold flower found perfectli pre...,two 99 croreyearold flower found perfectly pre...
4,Picture of Mars crater that looks like a tree ...,Pragya Swastik,The European Space Agency (ESA) has released a...,2022-02-02,science,european space agency esa released picture mar...,european space agenc esa ha releas pictur mar ...,european space agency esa ha released picture ...
5,Satellite images show the world's longest 768-...,Pragya Swastik,Satellite images have captured the 768-km-long...,2022-02-03,science,satellite images captured 768kmlong lightning ...,satellit imag captur 768kmlong lightn strike s...,satellite image captured 768kmlong lightning s...
6,Astronaut shares pics of clouds taken from spa...,Daisy Mowke,"Astronaut Kayla Barron, who is currently aboar...",2022-02-03,science,astronaut kayla barron currently aboard intern...,astronaut kayla barron current aboard intern s...,astronaut kayla barron currently aboard intern...
7,New 'highly virulent' variant of HIV discovere...,Ankush Verma,"Scientists have discovered a new ""highly virul...",2022-02-04,science,scientists discovered new highly virulent vari...,scientist discov new highli virul variant hiv ...,scientist discovered new highly virulent varia...
8,NASA to retire International Space Station by ...,Ridham Gambhir,NASA has announced that the International Spac...,2022-02-04,science,nasa announced international space station con...,nasa ha announc intern space station continu w...,nasa ha announced international space station ...
9,Scientists develop insect-sized flying robots ...,Aishwarya Awasthi,A University of Bristol team has developed sel...,2022-02-03,science,university bristol team developed selfdriving ...,univers bristol team ha develop selfdriv insec...,university bristol team ha developed selfdrivi...


In [18]:
# Keep each category's rows in a dict keyed by category name so a frame can
# be looked up by its label, e.g. dfs_by_category['science'].
dfs_by_category = {
    cat: news_df[news_df.category == cat]
    for cat in news_df.category.unique()
}

In [22]:
# One filtered frame per category, in the order returned by `unique()`.
list_of_dfs = [
    news_df[news_df.category == label]
    for label in news_df.category.unique()
]

In [27]:
# Select by category label rather than by list position, which depends on
# the order in which categories first appear in news_df.
science = news_df[news_df.category == 'science']

In [29]:
# Select by category label rather than by list position, which depends on
# the order in which categories first appear in news_df.
business = news_df[news_df.category == 'business']

In [31]:
# Select by category label rather than by list position, which depends on
# the order in which categories first appear in news_df.
sports = news_df[news_df.category == 'sports']

In [33]:
# Select by category label rather than by list position, which depends on
# the order in which categories first appear in news_df.
technology = news_df[news_df.category == 'technology']

In [35]:
# Select by category label rather than by list position, which depends on
# the order in which categories first appear in news_df.
entertainment = news_df[news_df.category == 'entertainment']