In [1]:
import unicodedata
import re
import json
import os
from requests import get
from bs4 import BeautifulSoup
import acquire

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords


import pandas as pd
from acquire import get_codeup_blog_urls
from acquire import parse_blog
from acquire import get_article_text
from acquire import get_codeup_blogs
from acquire import get_inshorts

## From acquire, use 'get_article_text' function and store results into variable.

In [2]:
original = get_article_text()

In [3]:
# Lowercase everything in the text.
article = original.lower()

In [None]:
print(article)

## Remove Accented Characters

Convert invalid characters into ASCII characters.
1. 'unicodedata.normalize' will remove inconsistencies in unicode character encoding.
2. '.encode' will convert the resulting string to the ASCII character set. 
3. '.decode' turns the resulting bytes object back into a string.

In [4]:
article = unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')

In [None]:
print(article)

## Remove Special Characters

In [5]:
# Remove anything that isn't a-z, a number, single quote, or whitespace.
article = re.sub(r"[^a-z0-9'\s]", '', article)

In [None]:
print(article)

## Tokenization
##### Use nltk to tokenize the strings.

In [6]:
tokenizer = nltk.tokenize.ToktokTokenizer()

In [7]:
tokenizer.tokenize(original, return_str=True)

'What are the Math and Stats Principles You Need for Data Science ? \nOct 21 , 2020 &#124; Data Science\n\n\nComing into our Data Science program , you will need to know some math and stats. However , many of our applicants actually learn in the application process – you don ’ t need to be an expert before applying ! Data science is a very accessible field to anyone dedicated to learning new skills , and we can work with any applicant to help them learn what they need to know. But what “skills ” do we mean , exactly ? Just what exactly are the data science math and stats principles you need to know ? \nWhat are the main math principles you need to know to get into Codeup ’ s Data Science program ? \n\n\nAlgebra\nDo you know PEMDAS and can you solve for x ? You will need to be or become comfortable with the following : \n\nVariables ( x , y , n , etc. ) \nFormulas , functions , and variable manipulations ( e.g. x^2 = x + 6 , solve for x ) .\nOrder of evaluation : PEMDAS : parentheses , 

## Stemming and Lemmatization
### Stemming
Reducing words to its root stem. The root stem may not always be an official word found in a dictionary.

In [None]:
# Create the nltk stemmer object, then use it
ps = nltk.porter.PorterStemmer()

ps.stem('call'), ps.stem('called'), ps.stem('calling')

In [None]:
# Apply stemming transformation to all the words in the article.
stems = [ps.stem(word) for word in article.split()]

In [None]:
# Join each word in 'stems' with a space.
article_stemmed = ' '.join(stems)

In [None]:
print(article_stemmed)

In [None]:
pd.Series(stems).value_counts().head(5)

## Lemmatization
The base form of a lemmatized word is the root word(lemma). Lemmas will always be present in dictionaries.

In [None]:
# Create lemmatizer object
wnl = nltk.stem.WordNetLemmatizer()

for word in 'study studies come coming eat eatery eating eaters'.split():
    print('stem:', ps.stem(word), '-- lemma:', wnl.lemmatize(word))

In [None]:
lemmas = [wnl.lemmatize(word) for word in article.split()]
article_lemmatized = ' '.join(lemmas)

In [None]:
print(article_lemmatized)

## Removing Stopwords
**stopword:** words that have little to no significance while constructing meaningful features from text.
* Articles, conjunctions, and prepositions are some examples of stopwords.

In [None]:
stopword_list = stopwords.words('english')
#stopword_list.remove('no')
#stopword_list.remove('not')

In [None]:
stopword_list

In [None]:
words = article.split()

In [None]:
filtered_words = [w for w in words if w not in stopword_list]

print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
print('---')

article_without_stopwords = ' '.join(filtered_words)

print(article_without_stopwords)

In [23]:
def basic_clean(string):
    # Lowercase everything in the text.
    lower = string.lower()
    lower = unicodedata.normalize('NFKD', lower)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    # Remove anything that isn't a-z, a number, single quote, or whitespace.
    cleaned = re.sub(r"[^a-z0-9'\s]", '', lower)
    return cleaned

In [15]:
def tokenize(string):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(string, return_str=True)

In [16]:
def stem(string):
    # Create the nltk stemmer object, then use it
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    article_stemmed = ' '.join(stems)
    return article_stemmed

In [17]:
def lemmatize(string):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    article_lemmatized = ' '.join(lemmas)
    return article_lemmatized

In [18]:
def remove_stopwords(string):
    stopword_list = stopwords.words('english')
    words = string.split()
    filtered_words = [w for w in words if w not in stopword_list]
    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('---')
    string_without_stopwords = ' '.join(filtered_words)
    return string_without_stopwords

In [8]:
urls = get_codeup_blog_urls()



  soup = BeautifulSoup(response.text)


In [9]:
urls

[<a class="more-link" href="https://codeup.com/dallas-newsletter/codeup-dallas-open-house/">read more</a>,
 <a class="more-link" href="https://codeup.com/codeup-news/codeups-placement-team-continues-setting-records/">read more</a>,
 <a class="more-link" href="https://codeup.com/it-training/it-certifications-101/">read more</a>,
 <a class="more-link" href="https://codeup.com/cybersecurity/a-rise-in-cyber-attacks-means-opportunities-for-veterans-in-san-antonio/">read more</a>,
 <a class="more-link" href="https://codeup.com/codeup-news/use-your-gi-bill-benefits-to-land-a-job-in-tech/">read more</a>,
 <a class="more-link" href="https://codeup.com/tips-for-prospective-students/which-program-is-right-for-me-cyber-security-or-systems-engineering/">read more</a>,
 <a class="more-link" href="https://codeup.com/it-training/what-the-heck-is-system-engineering/">read more</a>,
 <a class="more-link" href="https://codeup.com/alumni-stories/from-speech-pathology-to-business-intelligence/">read more</

In [83]:
codeup_df = get_codeup_blogs(urls)



  blog = BeautifulSoup(response.text)


In [84]:
codeup_df

Unnamed: 0,title,date & source,content
0,Codeup Dallas Open House,"Nov 30, 2021 | Dallas Newsletter, Events",\nCome join us for the re-opening of our Dalla...
1,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021 | Codeup News, Employers",\n\n\n\n\n\nOur Placement Team is simply defin...
2,"IT Certifications 101: Why They Matter, and Wh...","Nov 18, 2021 | IT Training, Tips for Prospecti...","\n\n\n\n\n\nAWS, Google, Azure, Red Hat, CompT..."
3,A rise in cyber attacks means opportunities fo...,"Nov 17, 2021 | Cybersecurity","\nIn the last few months, the US has experienc..."
4,Use your GI Bill® benefits to Land a Job in Tech,"Nov 4, 2021 | Codeup News, Tips for Prospectiv...",\n\n\n\n\n\nAs the end of military service get...
5,Which program is right for me: Cyber Security ...,"Oct 28, 2021 | IT Training, Tips for Prospecti...",\n\n\n\n\n\nWhat IT Career should I choose?\nI...
6,What the Heck is System Engineering?,"Oct 21, 2021 | IT Training, Tips for Prospecti...",\n\n\n\n\n\nCodeup offers a 13-week training p...
7,From Speech Pathology to Business Intelligence,"Oct 18, 2021 | Alumni Stories",\n\n\n\n\n\nBy: Alicia Gonzalez\nBefore Codeup...
8,Boris – Behind the Billboards,"Oct 3, 2021 | Behind the Billboards",\n\n\n
9,Is Codeup the Best Bootcamp in San Antonio…or ...,"Sep 16, 2021 | Codeup News, Featured",\n\n\n\n\n\nLooking for the best data science ...


In [77]:
def prep_text(df):
    if 'content' in df.columns:
        df.content = df.content.str.replace('\n',' ')
        df.content = df.content.str.strip()
        df['clean'] = df.content.apply(basic_clean)
        df['stemmed'] = df.content.apply(stem)
        df['lemmatized'] = df.content.apply(lemmatize)
        return df
    else:
        print("Dataframe does not have required column 'content'.")

In [None]:
# Create a a dataframe with the column 'content' dropped and run it
# through the newly created function to see if it performs as 
# expected.

# b = codeup_df.drop(columns='content',inplace = True)

In [85]:
prep_text(codeup_df)

Unnamed: 0,title,date & source,content,clean,stemmed,lemmatized
0,Codeup Dallas Open House,"Nov 30, 2021 | Dallas Newsletter, Events",Come join us for the re-opening of our Dallas ...,come join us for the reopening of our dallas c...,come join us for the re-open of our dalla camp...,Come join u for the re-opening of our Dallas C...
1,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021 | Codeup News, Employers",Our Placement Team is simply defined as a grou...,our placement team is simply defined as a grou...,our placement team is simpli defin as a group ...,Our Placement Team is simply defined a a group...
2,"IT Certifications 101: Why They Matter, and Wh...","Nov 18, 2021 | IT Training, Tips for Prospecti...","AWS, Google, Azure, Red Hat, CompTIA…these are...",aws google azure red hat comptiathese are big ...,"aws, google, azure, red hat, comptia…thes are ...","AWS, Google, Azure, Red Hat, CompTIA…these are..."
3,A rise in cyber attacks means opportunities fo...,"Nov 17, 2021 | Cybersecurity","In the last few months, the US has experienced...",in the last few months the us has experienced ...,"in the last few months, the us ha experienc do...","In the last few months, the US ha experienced ..."
4,Use your GI Bill® benefits to Land a Job in Tech,"Nov 4, 2021 | Codeup News, Tips for Prospectiv...","As the end of military service gets closer, ma...",as the end of military service gets closer man...,"as the end of militari servic get closer, mani...","As the end of military service get closer, man..."
5,Which program is right for me: Cyber Security ...,"Oct 28, 2021 | IT Training, Tips for Prospecti...",What IT Career should I choose? If you’re thin...,what it career should i choose if youre thinki...,what it career should i choose? if you’r think...,What IT Career should I choose? If you’re thin...
6,What the Heck is System Engineering?,"Oct 21, 2021 | IT Training, Tips for Prospecti...",Codeup offers a 13-week training program: Syst...,codeup offers a 13week training program system...,codeup offer a 13-week train program: system e...,Codeup offer a 13-week training program: Syste...
7,From Speech Pathology to Business Intelligence,"Oct 18, 2021 | Alumni Stories","By: Alicia Gonzalez Before Codeup, I was a hom...",by alicia gonzalez before codeup i was a home ...,"by: alicia gonzalez befor codeup, i wa a home ...","By: Alicia Gonzalez Before Codeup, I wa a home..."
8,Boris – Behind the Billboards,"Oct 3, 2021 | Behind the Billboards",,,,
9,Is Codeup the Best Bootcamp in San Antonio…or ...,"Sep 16, 2021 | Codeup News, Featured",Looking for the best data science bootcamp in ...,looking for the best data science bootcamp in ...,look for the best data scienc bootcamp in the ...,Looking for the best data science bootcamp in ...


In [56]:
codeup_df.content = codeup_df.content.str.replace('\n',' ')

In [57]:
codeup_df.content = codeup_df.content.str.strip()

In [59]:
codeup_df['clean'] = codeup_df.content.apply(basic_clean)

In [62]:
codeup_df['stemmed'] = codeup_df.content.apply(stem)

In [67]:
codeup_df['lemmatized'] = codeup_df.content.apply(lemmatize)

In [86]:
codeup_df

Unnamed: 0,title,date & source,content,clean,stemmed,lemmatized
0,Codeup Dallas Open House,"Nov 30, 2021 | Dallas Newsletter, Events",Come join us for the re-opening of our Dallas ...,come join us for the reopening of our dallas c...,come join us for the re-open of our dalla camp...,Come join u for the re-opening of our Dallas C...
1,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021 | Codeup News, Employers",Our Placement Team is simply defined as a grou...,our placement team is simply defined as a grou...,our placement team is simpli defin as a group ...,Our Placement Team is simply defined a a group...
2,"IT Certifications 101: Why They Matter, and Wh...","Nov 18, 2021 | IT Training, Tips for Prospecti...","AWS, Google, Azure, Red Hat, CompTIA…these are...",aws google azure red hat comptiathese are big ...,"aws, google, azure, red hat, comptia…thes are ...","AWS, Google, Azure, Red Hat, CompTIA…these are..."
3,A rise in cyber attacks means opportunities fo...,"Nov 17, 2021 | Cybersecurity","In the last few months, the US has experienced...",in the last few months the us has experienced ...,"in the last few months, the us ha experienc do...","In the last few months, the US ha experienced ..."
4,Use your GI Bill® benefits to Land a Job in Tech,"Nov 4, 2021 | Codeup News, Tips for Prospectiv...","As the end of military service gets closer, ma...",as the end of military service gets closer man...,"as the end of militari servic get closer, mani...","As the end of military service get closer, man..."
5,Which program is right for me: Cyber Security ...,"Oct 28, 2021 | IT Training, Tips for Prospecti...",What IT Career should I choose? If you’re thin...,what it career should i choose if youre thinki...,what it career should i choose? if you’r think...,What IT Career should I choose? If you’re thin...
6,What the Heck is System Engineering?,"Oct 21, 2021 | IT Training, Tips for Prospecti...",Codeup offers a 13-week training program: Syst...,codeup offers a 13week training program system...,codeup offer a 13-week train program: system e...,Codeup offer a 13-week training program: Syste...
7,From Speech Pathology to Business Intelligence,"Oct 18, 2021 | Alumni Stories","By: Alicia Gonzalez Before Codeup, I was a hom...",by alicia gonzalez before codeup i was a home ...,"by: alicia gonzalez befor codeup, i wa a home ...","By: Alicia Gonzalez Before Codeup, I wa a home..."
8,Boris – Behind the Billboards,"Oct 3, 2021 | Behind the Billboards",,,,
9,Is Codeup the Best Bootcamp in San Antonio…or ...,"Sep 16, 2021 | Codeup News, Featured",Looking for the best data science bootcamp in ...,looking for the best data science bootcamp in ...,look for the best data scienc bootcamp in the ...,Looking for the best data science bootcamp in ...


In [11]:
news_df = get_inshorts()



  soup = BeautifulSoup(response.text)


In [87]:
news_df

Unnamed: 0,title,category,author,content
0,"Omicron BA.2 found in 57 countries, doesn't se...",science,Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...
1,Picture of Mars crater that looks like a tree ...,science,Pragya Swastik,The European Space Agency (ESA) has released a...
2,Satellite images show the world's longest 768-...,science,Pragya Swastik,Satellite images have captured the 768-km-long...
3,9.9 crore-year-old flowers found perfectly pre...,science,Ankush Verma,Two 9.9 crore-year-old flowers have been found...
4,Astronaut shares pics of clouds taken from spa...,science,Daisy Mowke,"Astronaut Kayla Barron, who is currently aboar..."
...,...,...,...,...
120,"Told Deepika won't click pic with you, I'll do...",entertainment,Kriti Kambiri,Actor Dhairya Karwa revealed that he went to D...
121,"Troll asks Sara 'Why are your shayris bad?', s...",entertainment,Mahima Kharbanda,"Taking to Instagram, actress Sara Ali Khan sha..."
122,"Brotherhood created with Tom, Tobey over diffi...",entertainment,Kriti Kambiri,Actor Andrew Garfield revealed that a brotherh...
123,"Javed confirms Farhan-Shibani's wedding, will ...",entertainment,Kriti Kambiri,Lyricist Javed Akhtar has confirmed actor Farh...


In [88]:
prep_text(news_df)

Unnamed: 0,title,category,author,content,clean,stemmed,lemmatized
0,"Omicron BA.2 found in 57 countries, doesn't se...",science,Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...,omicron ba2 variant has been found in 57 count...,omicron ba.2 variant ha been found in 57 count...,Omicron BA.2 variant ha been found in 57 count...
1,Picture of Mars crater that looks like a tree ...,science,Pragya Swastik,The European Space Agency (ESA) has released a...,the european space agency esa has released a p...,the european space agenc (esa) ha releas a pic...,The European Space Agency (ESA) ha released a ...
2,Satellite images show the world's longest 768-...,science,Pragya Swastik,Satellite images have captured the 768-km-long...,satellite images have captured the 768kmlong l...,satellit imag have captur the 768-km-long ligh...,Satellite image have captured the 768-km-long ...
3,9.9 crore-year-old flowers found perfectly pre...,science,Ankush Verma,Two 9.9 crore-year-old flowers have been found...,two 99 croreyearold flowers have been found pe...,two 9.9 crore-year-old flower have been found ...,Two 9.9 crore-year-old flower have been found ...
4,Astronaut shares pics of clouds taken from spa...,science,Daisy Mowke,"Astronaut Kayla Barron, who is currently aboar...",astronaut kayla barron who is currently aboard...,"astronaut kayla barron, who is current aboard ...","Astronaut Kayla Barron, who is currently aboar..."
...,...,...,...,...,...,...,...
120,"Told Deepika won't click pic with you, I'll do...",entertainment,Kriti Kambiri,Actor Dhairya Karwa revealed that he went to D...,actor dhairya karwa revealed that he went to d...,actor dhairya karwa reveal that he went to dee...,Actor Dhairya Karwa revealed that he went to D...
121,"Troll asks Sara 'Why are your shayris bad?', s...",entertainment,Mahima Kharbanda,"Taking to Instagram, actress Sara Ali Khan sha...",taking to instagram actress sara ali khan shar...,"take to instagram, actress sara ali khan share...","Taking to Instagram, actress Sara Ali Khan sha..."
122,"Brotherhood created with Tom, Tobey over diffi...",entertainment,Kriti Kambiri,Actor Andrew Garfield revealed that a brotherh...,actor andrew garfield revealed that a brotherh...,actor andrew garfield reveal that a brotherhoo...,Actor Andrew Garfield revealed that a brotherh...
123,"Javed confirms Farhan-Shibani's wedding, will ...",entertainment,Kriti Kambiri,Lyricist Javed Akhtar has confirmed actor Farh...,lyricist javed akhtar has confirmed actor farh...,lyricist jave akhtar ha confirm actor farhan a...,Lyricist Javed Akhtar ha confirmed actor Farha...


In [89]:
news_df

Unnamed: 0,title,category,author,content,clean,stemmed,lemmatized
0,"Omicron BA.2 found in 57 countries, doesn't se...",science,Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...,omicron ba2 variant has been found in 57 count...,omicron ba.2 variant ha been found in 57 count...,Omicron BA.2 variant ha been found in 57 count...
1,Picture of Mars crater that looks like a tree ...,science,Pragya Swastik,The European Space Agency (ESA) has released a...,the european space agency esa has released a p...,the european space agenc (esa) ha releas a pic...,The European Space Agency (ESA) ha released a ...
2,Satellite images show the world's longest 768-...,science,Pragya Swastik,Satellite images have captured the 768-km-long...,satellite images have captured the 768kmlong l...,satellit imag have captur the 768-km-long ligh...,Satellite image have captured the 768-km-long ...
3,9.9 crore-year-old flowers found perfectly pre...,science,Ankush Verma,Two 9.9 crore-year-old flowers have been found...,two 99 croreyearold flowers have been found pe...,two 9.9 crore-year-old flower have been found ...,Two 9.9 crore-year-old flower have been found ...
4,Astronaut shares pics of clouds taken from spa...,science,Daisy Mowke,"Astronaut Kayla Barron, who is currently aboar...",astronaut kayla barron who is currently aboard...,"astronaut kayla barron, who is current aboard ...","Astronaut Kayla Barron, who is currently aboar..."
...,...,...,...,...,...,...,...
120,"Told Deepika won't click pic with you, I'll do...",entertainment,Kriti Kambiri,Actor Dhairya Karwa revealed that he went to D...,actor dhairya karwa revealed that he went to d...,actor dhairya karwa reveal that he went to dee...,Actor Dhairya Karwa revealed that he went to D...
121,"Troll asks Sara 'Why are your shayris bad?', s...",entertainment,Mahima Kharbanda,"Taking to Instagram, actress Sara Ali Khan sha...",taking to instagram actress sara ali khan shar...,"take to instagram, actress sara ali khan share...","Taking to Instagram, actress Sara Ali Khan sha..."
122,"Brotherhood created with Tom, Tobey over diffi...",entertainment,Kriti Kambiri,Actor Andrew Garfield revealed that a brotherh...,actor andrew garfield revealed that a brotherh...,actor andrew garfield reveal that a brotherhoo...,Actor Andrew Garfield revealed that a brotherh...
123,"Javed confirms Farhan-Shibani's wedding, will ...",entertainment,Kriti Kambiri,Lyricist Javed Akhtar has confirmed actor Farh...,lyricist javed akhtar has confirmed actor farh...,lyricist jave akhtar ha confirm actor farhan a...,Lyricist Javed Akhtar ha confirmed actor Farha...
