In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire
import prepare

### 1.) Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

 - Lowercase everything
 - normalize unicode characters
 - replace anything that is not a letter, number, whitespace, or a single quote

In [2]:
def basic_clean(stringcheese):
    stringcheese = stringcheese.lower()
    stringcheese = unicodedata.normalize('NFKD', stringcheese)\
        .encode('ascii','ignore')\
        .decode('utf-8')
    stringcheese = re.sub(r"[^a-z0-9'\s]", '', stringcheese)
    return stringcheese

In [3]:
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [4]:
clean_original = basic_clean(original)
clean_original

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [5]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r"[^\w\s']", '', string).lower()
    return string

In [6]:
clean_original = basic_clean(original)
clean_original

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 2.) Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [7]:
def tokenize(stringcheese):
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()

    # Use the tokenizer
    stringcheese = tokenizer.tokenize(stringcheese, return_str=True)

    return stringcheese

In [8]:
tokenize(clean_original)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [9]:
tokenized = tokenize(clean_original)

### 3.) Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [10]:
def stem(stringcheese):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in stringcheese.split()]
    stems = ' '.join(stems)
    return stems

In [11]:
stemmed = stem(tokenized)
stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### 4.) Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [12]:
def lemmatize(stringcheese):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in stringcheese.split()]
    lemmas = ' '.join(lemmas)
    return lemmas

In [13]:
lemmatized = lemmatize(stemmed)
lemmatized

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written a erdo or erdo either by mistak or out of typograph necess"

### 5.) Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [14]:
def remove_stopwords(words, extra_words = [], exclude_words = []):
    stopword_list = stopwords.words('english')
    
    stopword_list = set(stopword_list) - set(exclude_words)
    stopword_list = stopword_list.union(set(extra_words))
    
    words = words.split()
    filtered_words = [word for word in words if word not in stopword_list]
    article_without_stopwords = ' '.join(filtered_words)
    return article_without_stopwords

In [15]:
remove_stopwords(lemmatized)

"paul erdo georg polya influenti hungarian mathematician contribut lot field erdo ' name contain hungarian letter ' ' ' ' doubl acut accent often incorrectli written erdo erdo either mistak typograph necess"

### 6.) Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [16]:
categories = ["business", "sports", "technology", "entertainment", "science", "world"]
news_df = acquire.get_all_news_articles(categories)

In [17]:
news_df

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business
1,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business
4,Govt may lower import duty on EVs if Tesla man...,The government is open to consider reducing im...,business
...,...,...,...
143,UN panel uses $600mn in Iraqi funds to pay Kuw...,A UN commission on Tuesday used $600 million i...,world
144,1st person charged under Hong Kong national se...,The first person to be tried under the nationa...,world
145,"Former US Senator Barbara Boxer assaulted, rob...",Former US Senator Barbara Boxer was assaulted ...,world
146,"Gunmen on motorbikes attack village in Niger, ...",Armed men on motorbikes have killed at least 1...,world


In [18]:
news_df.content[0]

'Reliance Industries has said in a statement that over 98% of its workers have received at least one dose of COVID-19 vaccine so far. The billionaire Mukesh Ambani-led conglomerate had over 2.36 lakh employees, of March 31. Besides Reliance, Hindustan Unilever has also given at least one shot to 90% of employees, while Infosys inoculated 59% employees and TCS 70%.'

### 7.) Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [19]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/', 
        'https://codeup.com/data-science-myths/', 
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/', 
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/', 
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

In [20]:
codeup_df = pd.DataFrame(acquire.get_blog_articles(urls))

In [21]:
codeup_df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


### 8.) For each dataframe, produce the following columns:
 - title to hold the title
 - original to hold the original article/post content
 - clean to hold the normalized and tokenized original with the stopwords removed.
 - stemmed to hold the stemmed version of the cleaned data.
 - lemmatized to hold the lemmatized version of the cleaned data.

In [22]:
def prep_article_data(df, content, extra_words=[], exclude_words=[]):
    df['title'] = df.title
    df['original']= df.content   
    df['clean'] = df[content].apply(basic_clean)\
                    .apply(tokenize)\
                    .apply(remove_stopwords)
    
    df['stemmed']= df[content].apply(basic_clean)\
                    .apply(tokenize)\
                    .apply(stem)\
                    .apply(remove_stopwords,extra_words = extra_words, exclude_words = exclude_words)
    
    df['lemmatized'] = df[content].apply(basic_clean)\
                    .apply(tokenize)\
                    .apply(lemmatize)\
                    .apply(remove_stopwords,extra_words = extra_words, exclude_words = exclude_words)
    return df[['title', 'original','clean','stemmed','lemmatized']] 

In [23]:
prep_article_data(news_df, 'content', extra_words=[], exclude_words=[])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,reliance industries said statement 98 workers ...,relianc industri ha said statement 98 worker r...,reliance industry ha said statement 98 worker ...
1,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,amazon monday denied speculations looking acce...,amazon monday deni specul wa look accept bitco...,amazon monday denied speculation wa looking ac...
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,tesla ' billionaire ceo elon musk criticised a...,tesla ' billionair ceo elon musk criticis appl...,tesla ' billionaire ceo elon musk criticised a...
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,tesla ceo world ' secondrichest person elon mu...,tesla ceo world ' secondrichest person elon mu...,tesla ceo world ' secondrichest person elon mu...
4,Govt may lower import duty on EVs if Tesla man...,The government is open to consider reducing im...,government open consider reducing import duty ...,govern open consid reduc import duti offer inc...,government open consider reducing import duty ...
...,...,...,...,...,...
143,UN panel uses $600mn in Iraqi funds to pay Kuw...,A UN commission on Tuesday used $600 million i...,un commission tuesday used 600 million collect...,un commiss tuesday use 600 million collect ira...,un commission tuesday used 600 million collect...
144,1st person charged under Hong Kong national se...,The first person to be tried under the nationa...,first person tried national security law hong ...,first person tri nation secur law hong kong wa...,first person tried national security law hong ...
145,"Former US Senator Barbara Boxer assaulted, rob...",Former US Senator Barbara Boxer was assaulted ...,former us senator barbara boxer assaulted robb...,former us senat barbara boxer wa assault rob o...,former u senator barbara boxer wa assaulted ro...
146,"Gunmen on motorbikes attack village in Niger, ...",Armed men on motorbikes have killed at least 1...,armed men motorbikes killed least 14 civilians...,arm men motorbik kill least 14 civilian wound ...,armed men motorbike killed least 14 civilian w...


In [24]:
prep_article_data(codeup_df, 'content', extra_words=[], exclude_words=[])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,rumors true time arrived codeup officially ope...,rumor true time ha arriv codeup ha offici open...,rumor true time ha arrived codeup ha officiall...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,dimitri antoniou maggie giust data science big...,dimitri antoni maggi giust data scienc big dat...,dimitri antoniou maggie giust data science big...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",dimitri antoniou week ago codeup launched imme...,dimitri antoni week ago codeup launch immers d...,dimitri antoniou week ago codeup launched imme...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair third biannual san antonio te...,sa tech job fair third biannual san antonio te...,sa tech job fair third biannual san antonio te...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps closing model danger prog...,competitor bootcamp close model danger program...,competitor bootcamps closing model danger prog...


### 9.) Ask yourself:
 - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - I would lemmatize it so that the words that are returned are real words. The dataset is small, so I don't see a waste of resources doing this method over stemming.
<br>
   
 - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - I would still lemmatize it...25MB isn't too bad of a size. It could be worse.
 - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    - Stemmed, I'll work with what I get before I have to pay lots of $ 

### Testing the Prepare Functions

---

In [25]:
article = news_df.content[0]

In [26]:
news_df_clean = prepare.basic_clean(article)
news_df_clean

'reliance industries has said in a statement that over 98 of its workers have received at least one dose of covid19 vaccine so far the billionaire mukesh ambaniled conglomerate had over 236 lakh employees of march 31 besides reliance hindustan unilever has also given at least one shot to 90 of employees while infosys inoculated 59 employees and tcs 70'

In [27]:
prepare.prep_article_data(codeup_df, 'content', extra_words=[], exclude_words=[])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,rumors true time arrived codeup officially ope...,rumor true time arriv codeup offici open appli...,rumor true time arrived codeup officially open...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,dimitri antoniou maggie giust data science big...,dimitri antoni maggi giust data scienc big dat...,dimitri antoniou maggie giust data science big...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",dimitri antoniou week ago codeup launched imme...,dimitri antoni week ago codeup launch immers d...,dimitri antoniou week ago codeup launched imme...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair third biannual san antonio te...,sa tech job fair third biannual san antonio te...,sa tech job fair third biannual san antonio te...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps closing model danger prog...,competitor bootcamp close model danger program...,competitor bootcamps closing model danger prog...
