In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire
import prepare

In [2]:
import nltk

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bradleygauvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bradleygauvin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/bradleygauvin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
nltk.corpus.stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
headers = {'User-Agent': 'Codeup Data Science - Kalpana'}
urls = ['https://codeup.com/data-science/jobs-after-a-coding-bootcamp-part-1-data-science/',
        'https://codeup.com/data-science/math-in-data-science/',
        'https://codeup.com/data-science/transition-into-data-science/',
        'https://codeup.com/data-science/data-science-career/',
        'https://codeup.com/data-science/data-science-without-a-degree/']
topics = ["business","sports","technology","entertainment"]

In [8]:
blogs = acquire.get_blogs(urls,headers)

articles = acquire.get_news(topics,headers)

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
    - Lowercase everything
    - Normalize unicode characters
    - Replace anything that is not a letter, number, whitespace or a single quote.
2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.
3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.
4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.
5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [13]:
def basic_clean(article:str):
    """ Performs basic cleaning of text string, article, by switching all letters to lowecase, normalizing unicode characters, 
    and replacing everything that is not a letter, number, whitespace, or single quote."""
    # Convert text to lowercase
    article = article.lower()
    
    # Remove accented characteries. Normalize removes inconsistencies in unicode character encoding.
    # Encode converts string to ASCII and decode returns the bytes into string.
    article = unicodedata.normalize('NFKD', article)    .encode('ascii', 'ignore')    .decode('utf-8', 'ignore')

    # remove anything that is not a through z, a number, a single quote, or whitespace
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    
    return article

def tokenize(article:str):
    """ Takes in a string, article, and tokenizes all words """
    
    tokenizer = nltk.tokenize.ToktokTokenizer()

    return tokenizer.tokenize(article, return_str=True)

def stem(article: str):
    """ Takes in a string, article, and returns text after applying stemming using Porter method """
    
    ps = nltk.porter.PorterStemmer()

    stems = [ps.stem(word) for word in article.split()]
    article_stemmed = ' '.join(stems)
    
    return article_stemmed

def lemmatize(article: str):
    """ Accepts string as argument, article, and returns text after applying lemmatization to each word """
    
    wnl = nltk.stem.WordNetLemmatizer()
        
    lemmas = [wnl.lemmatize(word) for word in article.split()]
    article_lemmatized = ' '.join(lemmas)

    return article_lemmatized

def remove_stopwords(article: str, extra_words: list, exclude_words: list):
    """ Accepts string (article) as argument and returns text after removing all the stopwords.
    extra_words: any additional stop words to include (these words will be removed from the article)
    exclude_words: any words we do not want to remove. These words are removed from the stopwords list and will remain in article """
    
    stopword_list = stopwords.words('english')

    [stopword_list.append(word_to_add) for word_to_add in extra_words if word_to_add not in stopword_list]
    [stopword_list.remove(to_remove) for to_remove in exclude_words if to_remove in stopword_list]

    words = article.split()
    filtered_words = [w for w in words if w not in stopword_list]

    # print('Removed {} stopwords'.format(len(words) - len(filtered_words)))

    article_without_stopwords = ' '.join(filtered_words)
    
    return article_without_stopwords

In [9]:
blogs.head()

Unnamed: 0,url,title,content
0,https://codeup.com/data-science/jobs-after-a-c...,What Jobs Can You Get After a Coding Bootcamp?,\nIf you are interested in embarking on a care...
1,https://codeup.com/data-science/math-in-data-s...,What are the Math and Stats Principles You Nee...,"\nComing into our Data Science program, you wi..."
2,https://codeup.com/data-science/transition-int...,What is the Transition into Data Science Like?...,\nAlumni Katy Salts and Brandi Reger joined us...
3,https://codeup.com/data-science/data-science-c...,What Data Science Career is For You? - Codeup ...,\nIf you’re struggling to see yourself as a da...
4,https://codeup.com/data-science/data-science-w...,From Slacker to Data Scientist: Journey to Dat...,\nButterflies in my belly; my stomach is tied ...


In [10]:
articles.head()

Unnamed: 0,title,content,category
0,India's GDP grows at 13.5% in first quarter of...,India's GDP grew at 13.5% in the first quarter...,business
1,Musk cites whistleblower's claims in new notic...,Tesla CEO Elon Musk's legal team has filed ano...,business
2,Musk seeks to delay Twitter trial to Nov amid ...,Tesla CEO Elon Musk is seeking to delay the tr...,business
3,2 top executives at Snap quit hours after repo...,Two senior advertising executives at Snap quit...,business
4,Viral video shows Amazon parcels thrown out of...,A video from Guwahati railway station has gone...,business


6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [14]:
news_df = prepare.create_prepared_news_df()

Importing from csv


In [15]:
news_df.lemmatized[3]

'two senior advertising executive snap quit hour report emerged stating company planning cut 20 workforce earlier report verge citing source said snap planning lay 20 6400 employee reported weak q2 result'

In [16]:
df = acquire.get_blog_articles(True)

In [17]:
df = pd.DataFrame(df)

In [19]:
df.head()

Unnamed: 0,url,title,date_published,original
0,https://codeup.com/data-science/recession-proo...,"Is a Career in Tech Recession-Proof?\nAug 12, ...","Aug 12, 2022","Given the current economic climate, many econo..."
1,https://codeup.com/codeup-news/codeup-x-comic-...,Codeup X Superhero Car Show & Comic Con\nAug 1...,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...
2,https://codeup.com/featured/series-part-3-web-...,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...
3,https://codeup.com/codeup-news/codeup-dallas-c...,"Codeup’s New Dallas Campus\nJul 25, 2022 | Cod...","Jul 25, 2022",Codeup’s Dallas campus has a new location! For...
4,https://codeup.com/codeup-news/codeup-tv-comme...,"Codeup TV Commercial\nJul 20, 2022 | Codeup Ne...","Jul 20, 2022",Codeup has officially made its TV debut! Our c...


In [20]:
original = df.original[0]
article = original.lower()
print(prepare.basic_clean(article))

given the current economic climate many economists are considering the us to be entering a recession this can cause confusion fear and uncertainty especially as it pertains to job security
to ease some of those feelings below youll find some careers in tech that tend to hold up better than others amid a recession in the event of a recession companies will likely shift to digital strategies making these careers in tech valuable and highly coveted
 


programmerdeveloper
no matter the programming language youve mastered having the knowledge alone makes you extremely valuable the coding skills you possess as a programmer or developer are indemand for companies looking to build or enhance their websites and enhance their consumer experience according to the us bureau of labor statistics jobs in software development are expected to grow 22 by 2030 this is much faster than the average career

 


cloud administrator
more businesses are transitioning to cloud servers and this increase sparks 

In [21]:
tokenizer = nltk.tokenize.ToktokTokenizer()

print(tokenizer.tokenize(original, return_str=True))

Given the current economic climate , many economists are considering the U.S. to be entering a recession. This can cause confusion , fear , and uncertainty , especially as it pertains to job security.
To ease some of those feelings , below you ’ ll find some careers in tech that tend to hold up better than others amid a recession. In the event of a recession , companies will likely shift to digital strategies , making these careers in tech valuable and highly coveted.
 


Programmer/Developer
No matter the programming language you ’ ve mastered , having the knowledge alone makes you extremely valuable. The coding skills you possess as a programmer or developer are in-demand for companies looking to build or enhance their websites , and enhance their consumer experience. According to the U.S. Bureau of Labor Statistics , jobs in software development are expected to grow 22 % by 2030. This is much faster than the average career.

 


Cloud Administrator
More businesses are transitioning 

In [22]:
stems = prepare.stem(original)
stems

'given the current econom climate, mani economist are consid the u.s. to be enter a recession. thi can caus confusion, fear, and uncertainty, especi as it pertain to job security. to eas some of those feelings, below you’ll find some career in tech that tend to hold up better than other amid a recession. in the event of a recession, compani will like shift to digit strategies, make these career in tech valuabl and highli coveted. programmer/develop no matter the program languag you’v mastered, have the knowledg alon make you extrem valuable. the code skill you possess as a programm or develop are in-demand for compani look to build or enhanc their websites, and enhanc their consum experience. accord to the u.s. bureau of labor statistics, job in softwar develop are expect to grow 22% by 2030. thi is much faster than the averag career. cloud administr more busi are transit to cloud server and thi increas spark the need for cloud administr to maintain and updat the cloud infrastructure. 

In [23]:
prepare.lemmatize(original)
original

'Given the current economic climate, many economists are considering the U.S. to be entering a recession. This can cause confusion, fear, and uncertainty, especially as it pertains to job security.\nTo ease some of those feelings, below you’ll find some careers in tech that tend to hold up better than others amid a recession. In the event of a recession, companies will likely shift to digital strategies, making these careers in tech valuable and highly coveted.\n\xa0\n\n\nProgrammer/Developer\nNo matter the programming language you’ve mastered, having the knowledge alone makes you extremely valuable. The coding skills you possess as a programmer or developer are in-demand for companies looking to build or enhance their websites, and enhance their consumer experience. According to the U.S. Bureau of Labor Statistics, jobs in software development are expected to grow 22% by 2030. This is much faster than the average career.\n\n\xa0\n\n\nCloud Administrator\nMore businesses are transition

In [24]:
prepare.remove_stopwords(original, extra_words = ['Taryn', 'Month','chat'], exclude_words= ['for','We'])

'Given current economic climate, many economists considering U.S. entering recession. This cause confusion, fear, uncertainty, especially pertains job security. To ease feelings, you’ll find careers tech tend hold better others amid recession. In event recession, companies likely shift digital strategies, making careers tech valuable highly coveted. Programmer/Developer No matter programming language you’ve mastered, knowledge alone makes extremely valuable. The coding skills possess programmer developer in-demand for companies looking build enhance websites, enhance consumer experience. According U.S. Bureau Labor Statistics, jobs software development expected grow 22% 2030. This much faster average career. Cloud Administrator More businesses transitioning cloud servers increase sparks need for cloud administrators maintain update cloud infrastructure. Amid recession, likely efforts regarding security heightened company strategies shift digitally, managing secure server responsibility

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.
8. For each dataframe, produce the following columns:
    - title to hold the title
    - original to hold the original article/post content
    - clean to hold the normalized and tokenized original with the stopwords removed.
    - stemmed to hold the stemmed version of the cleaned data.
    - lemmatized to hold the lemmatized version of the cleaned data.

In [11]:
codeup_df = prepare.create_prepared_blog_df()

In [12]:
codeup_df.head()

Unnamed: 0,url,title,date_published,original,clean,stemmed,lemmatized
0,https://codeup.com/data-science/recession-proo...,"Is a Career in Tech Recession-Proof?\nAug 12, ...","Aug 12, 2022","Given the current economic climate, many econo...",given current economic climate many economists...,given current econom climat mani economist con...,given current economic climate many economist ...
1,https://codeup.com/codeup-news/codeup-x-comic-...,Codeup X Superhero Car Show & Comic Con\nAug 1...,"Aug 10, 2022",Codeup had a blast at the San Antonio Superher...,codeup blast san antonio superhero car show co...,codeup blast san antonio superhero car show co...,codeup blast san antonio superhero car show co...
2,https://codeup.com/featured/series-part-3-web-...,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",If you’re considering a career in web developm...,youre considering career web development dont ...,consid career web develop dont know expect con...,youre considering career web development dont ...
3,https://codeup.com/codeup-news/codeup-dallas-c...,"Codeup’s New Dallas Campus\nJul 25, 2022 | Cod...","Jul 25, 2022",Codeup’s Dallas campus has a new location! For...,codeups dallas campus new location two years c...,codeup dalla campu new locat two year codeup o...,codeups dallas campus new location two year co...
4,https://codeup.com/codeup-news/codeup-tv-comme...,"Codeup TV Commercial\nJul 20, 2022 | Codeup Ne...","Jul 20, 2022",Codeup has officially made its TV debut! Our c...,codeup officially made tv debut community stud...,codeup offici made tv debut commun student sta...,codeup officially made tv debut community stud...


9. Ask yourself:

    - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
        - Lemmatized
    - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
        - Lemmatized, may want to consider stemm.
    - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
        - Stemmed