In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

### 1.) Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

 - Lowercase everything
 - normalize unicode characters
 - replace anything that is not a letter, number, whitespace, or a single quote

In [2]:
def basic_clean(stringcheese):
    stringcheese = stringcheese.lower()
    stringcheese = unicodedata.normalize('NFKD', stringcheese)\
        .encode('ascii','ignore')\
        .decode('utf-8')
    stringcheese = re.sub(r"[^a-z0-9'\s]", '', stringcheese)
    return stringcheese

In [3]:
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [4]:
clean_original = basic_clean(original)
clean_original

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 2.) Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [5]:
def tokenize(stringcheese):
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()

    # Use the tokenizer
    stringcheese = tokenizer.tokenize(stringcheese, return_str=True)

    return stringcheese

In [6]:
tokenize(clean_original)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [7]:
tokenized = tokenize(clean_original)

### 3.) Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [8]:
def stem(stringcheese):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in stringcheese.split()]
    stems = ' '.join(stems)
    return stems

In [9]:
stemmed = stem(tokenized)
stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### 4.) Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [10]:
def lemmatize(stringcheese):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in stringcheese.split()]
    lemmas = ' '.join(lemmas)
    return lemmas

In [19]:
lemmatized = lemmatize(stemmed)
lemmatized

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written a erdo or erdo either by mistak or out of typograph necess"

### 5.) Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [22]:
def remove_stopwords(words, extra_words = [], exclude_words = []):
    stopword_list = stopwords.words('english')
    
    stopword_list = set(stopword_list) - set(exclude_words)
    stopword_list = stopword_list.union(set(extra_words))
    
    words = words.split()
    filtered_words = [word for word in words if word not in stopword_list]
    article_without_stopwords = ' '.join(filtered_words)
    return article_without_stopwords

In [23]:
remove_stopwords(lemmatized)

"paul erdo georg polya influenti hungarian mathematician contribut lot field erdo ' name contain hungarian letter ' ' ' ' doubl acut accent often incorrectli written erdo erdo either mistak typograph necess"

### 6.) Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [14]:
categories = ["business", "sports", "technology", "entertainment", "science", "world"]
news_df = acquire.get_all_news_articles(categories)

In [15]:
news_df

Unnamed: 0,title,content,category
0,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business
1,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business
3,Govt paid Infosys ₹164.5 crore for new Income ...,The government paid ₹164.5 crore to Infosys to...,business
4,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,business
...,...,...,...
142,Ugandan govt spends $30 mn on cars for lawmake...,The Ugandan government was criticised after it...,world
143,Equatorial Guinea to close UK embassy over san...,Equatorial Guinea's Foreign Minister said that...,world
144,Lebanese lawmakers pick billionaire Najib Mika...,Lebanese lawmakers during parliamentary consul...,world
145,Man accused of trying to kill Mali's interim P...,A man accused of trying to kill Mali's interim...,world


### 7.) Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [16]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/', 
        'https://codeup.com/data-science-myths/', 
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/', 
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/', 
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

In [17]:
codeup_df = pd.DataFrame(acquire.get_blog_articles(urls))

In [18]:
codeup_df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


### 8.) For each dataframe, produce the following columns:
 - title to hold the title
 - original to hold the original article/post content
 - clean to hold the normalized and tokenized original with the stopwords removed.
 - stemmed to hold the stemmed version of the cleaned data.
 - lemmatized to hold the lemmatized version of the cleaned data.

In [30]:
def prep_article_data(df, content, extra_words=[], exclude_words=[]):
    df['title'] = df.title
    df['original']= df.content   
    df['clean'] = df[content].apply(basic_clean)\
                    .apply(tokenize)\
                    .apply(remove_stopwords)
    
    df['stemmed']= df[content].apply(basic_clean)\
                    .apply(tokenize)\
                    .apply(stem)\
                    .apply(remove_stopwords,extra_words = extra_words, exclude_words = exclude_words)
    
    df['lemmatized'] = df[content].apply(basic_clean)\
                    .apply(tokenize)\
                    .apply(lemmatize)\
                    .apply(remove_stopwords,extra_words = extra_words, exclude_words = exclude_words)
    return df[['title', 'original','clean','stemmed','lemmatized']] 

In [31]:
prep_article_data(news_df, 'content', extra_words=[], exclude_words=[])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",china ' larry chen former teacher became billi...,china ' larri chen former teacher becam billio...,china ' larry chen former teacher became billi...
1,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,new job posting amazon fuelled speculations ec...,new job post amazon ha fuell specul ecommerc m...,new job posting amazon ha fuelled speculation ...
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,tesla ceo world ' secondrichest person elon mu...,tesla ceo world ' secondrichest person elon mu...,tesla ceo world ' secondrichest person elon mu...
3,Govt paid Infosys ₹164.5 crore for new Income ...,The government paid ₹164.5 crore to Infosys to...,government paid 1645 crore infosys build new i...,govern paid 1645 crore infosi build new incom ...,government paid 1645 crore infosys build new i...
4,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,lok sabha mp mahua moitra shared letter wrote ...,lok sabha mp mahua moitra ha share letter wrot...,lok sabha mp mahua moitra ha shared letter wro...
...,...,...,...,...,...
142,Ugandan govt spends $30 mn on cars for lawmake...,The Ugandan government was criticised after it...,ugandan government criticised spent 302 millio...,ugandan govern wa criticis spent 302 million t...,ugandan government wa criticised spent 302 mil...
143,Equatorial Guinea to close UK embassy over san...,Equatorial Guinea's Foreign Minister said that...,equatorial guinea ' foreign minister said coun...,equatori guinea ' foreign minist said countri ...,equatorial guinea ' foreign minister said coun...
144,Lebanese lawmakers pick billionaire Najib Mika...,Lebanese lawmakers during parliamentary consul...,lebanese lawmakers parliamentary consultations...,lebanes lawmak dure parliamentari consult mond...,lebanese lawmaker parliamentary consultation m...
145,Man accused of trying to kill Mali's interim P...,A man accused of trying to kill Mali's interim...,man accused trying kill mali ' interim preside...,man accus tri kill mali ' interim presid assim...,man accused trying kill mali ' interim preside...


In [32]:
prep_article_data(codeup_df, 'content', extra_words=[], exclude_words=[])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,rumors true time arrived codeup officially ope...,rumor true time ha arriv codeup ha offici open...,rumor true time ha arrived codeup ha officiall...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,dimitri antoniou maggie giust data science big...,dimitri antoni maggi giust data scienc big dat...,dimitri antoniou maggie giust data science big...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",dimitri antoniou week ago codeup launched imme...,dimitri antoni week ago codeup launch immers d...,dimitri antoniou week ago codeup launched imme...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair third biannual san antonio te...,sa tech job fair third biannual san antonio te...,sa tech job fair third biannual san antonio te...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps closing model danger prog...,competitor bootcamp close model danger program...,competitor bootcamps closing model danger prog...


### 9.) Ask yourself:
 - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - I would lemmatize it so that the words that are returned are real words. The dataset is small, so I don't see a waste of resources doing this method over stemming.
<br>
   
 - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - I would still lemmatize it...25MB isn't too bad of a size. It could be worse.
 - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    - Stemmed, I'll work with what I get before I have to pay 