In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as a
import os
import acquire_news_articles as n

**1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:**

Lowercase everything

Normalize unicode characters

Replace anything that is not a letter, number, whitespace or a single quote. 

In [24]:
some_string = "I'm gonna soak up the Sun. I'm gonna tell everyone to lighten up!"

In [25]:
def lower(some_string):
    some_string = some_string.lower()
    return some_string

In [26]:
lower(some_string)

"i'm gonna soak up the sun. i'm gonna tell everyone to lighten up!"

In [3]:
def normalize(some_string):
    some_string = unicodedata.normalize('NFKD', some_string).encode('ascii', 'ignore').decode('utf-8')
    some_string = re.sub(r'[^a-zA-Z0-9\'\s]', '', some_string)
    return some_string

In [27]:
normalize(some_string)

"I'm gonna soak up the Sun I'm gonna tell everyone to lighten up"

In [29]:
def basic_clean(some_string):
    some_string = lower(some_string)
    some_string = normalize(some_string)
    return some_string

In [30]:
basic_clean(some_string)

"i'm gonna soak up the sun i'm gonna tell everyone to lighten up"

In [5]:
def tokenize(some_string):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(some_string, return_str=True)

In [31]:
tokenize(some_string)

"I ' m gonna soak up the Sun. I ' m gonna tell everyone to lighten up !"

In [39]:
def lemmatize(some_string):
    lemmatizer = nltk.WordNetLemmatizer()
    return ' '.join(
        [lemmatizer.lemmatize(word) for word in some_string.split()])

In [40]:
lemmatize(some_string)

"I'm gonna soak up the Sun. I'm gonna tell everyone to lighten up!"

In [42]:
def stem(some_string):
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(
        [stemmer.stem(word) for word in some_string.split()])

In [43]:
stem(some_string)

"i'm gonna soak up the sun. i'm gonna tell everyon to lighten up!"

In [44]:
def remove_stopwords(some_string, extra_words=[], keep_words=[]):
    stopwords_custom = set(stopwords.words('english')) - \
    set(keep_words)
    stopwords_custom = list(stopwords_custom.union(extra_words))
    return ' '.join([word for word in some_string.split()
                     if word not in stopwords_custom])

In [45]:
remove_stopwords(some_string)

"I'm gonna soak Sun. I'm gonna tell everyone lighten up!"

In [47]:
def transform_data(df, extra_words=[], keep_words=[]):
    df = df.rename(columns={'content': 'original'})
    df['clean'] = df['original'].apply(basic_clean).apply(
        tokenize).apply(remove_stopwords)
    df['stemmed'] = df['clean'].apply(stem)
    df['lemmatized'] = df['stemmed'].apply(lemmatize)
    return df

In [10]:
news_df = a.get_inshorts_df()

In [11]:
news_df

Unnamed: 0,title,content,category
0,Yashasvi Jaiswal's reaction to Virat Kohli's p...,RR opener Yashasvi Jaiswal's reaction to Virat...,sports
1,"I don't sledge someone upfront, not my habit: ...","LSG fast bowler Naveen-ul-Haq, in an interacti...",sports
2,"Table-toppers GT to bowl first against MI, bot...",Table-toppers Gujarat Titans (GT) captain Hard...,sports
3,Jos Buttler fined 10% of match fee for breachi...,RR opener Jos Buttler has been fined 10% of hi...,sports
4,KL Rahul shares 'Doff the cap' GIF for Yashasv...,KL Rahul took to Twitter to share a 'Doff the ...,sports
...,...,...,...
20,"The girl went through hell, it needs to be tol...",After Asaram Bapu sent a legal notice to maker...,entertainment
21,Team of 'The Kerala Story' to attend BJP's 'ya...,The team of 'The Kerala Story' movie will be a...,entertainment
22,"This baby is planned, says Robert De Niro on 7...","Actor Robert De Niro, who recently announced t...",entertainment
23,"I'm busy, don't have time for 'Come, let's gos...",Actress Kavita Kaushik has said that she has b...,entertainment


In [12]:
codeup_df = a.get_blog_df()

In [13]:
codeup_df

Unnamed: 0,title,content
0,Women in tech: Panelist Spotlight – Magdalena ...,\nCodeup is hosting a Women in Tech Panel in h...
1,Women in tech: Panelist Spotlight – Rachel Rob...,\nCodeup is hosting a Women in Tech Panel in h...
2,Women in Tech: Panelist Spotlight – Sarah Mellor,\nCodeup is hosting a Women in Tech Panel in ...
3,Women in Tech: Panelist Spotlight – Madeleine ...,\nCodeup is hosting a Women in Tech Panel in h...
4,Black Excellence in Tech: Panelist Spotlight –...,\n\nCodeup is hosting a Black Excellence in Te...
5,Black excellence in tech: Panelist Spotlight –...,\nCodeup is hosting our second Black Excellenc...


**8. For each dataframe, produce the following columns:**

title to hold the title

original to hold the original article/post content

clean to hold the normalized and tokenized original with the stopwords removed.

stemmed to hold the stemmed version of the cleaned data.

lemmatized to hold the lemmatized version of the cleaned data.

In [48]:
codeup_cleaned = transform_data(codeup_df)
codeup_cleaned

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Women in tech: Panelist Spotlight – Magdalena ...,\nCodeup is hosting a Women in Tech Panel in h...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup host woman tech panel honor woman histo...
1,Women in tech: Panelist Spotlight – Rachel Rob...,\nCodeup is hosting a Women in Tech Panel in h...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup host woman tech panel honor woman histo...
2,Women in Tech: Panelist Spotlight – Sarah Mellor,\nCodeup is hosting a Women in Tech Panel in ...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup host woman tech panel honor woman histo...
3,Women in Tech: Panelist Spotlight – Madeleine ...,\nCodeup is hosting a Women in Tech Panel in h...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup host woman tech panel honor woman histo...
4,Black Excellence in Tech: Panelist Spotlight –...,\n\nCodeup is hosting a Black Excellence in Te...,codeup hosting black excellence tech panel hon...,codeup host black excel tech panel honor black...,codeup host black excel tech panel honor black...
5,Black excellence in tech: Panelist Spotlight –...,\nCodeup is hosting our second Black Excellenc...,codeup hosting second black excellence tech pa...,codeup host second black excel tech panel hono...,codeup host second black excel tech panel hono...


In [51]:
news_cleaned = transform_data(news_df)
news_cleaned

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,Yashasvi Jaiswal's reaction to Virat Kohli's p...,RR opener Yashasvi Jaiswal's reaction to Virat...,sports,rr opener yashasvi jaiswal ' reaction virat ko...,rr open yashasvi jaiswal ' reaction virat kohl...,rr open yashasvi jaiswal ' reaction virat kohl...
1,"I don't sledge someone upfront, not my habit: ...","LSG fast bowler Naveen-ul-Haq, in an interacti...",sports,lsg fast bowler naveenulhaq interaction teamma...,lsg fast bowler naveenulhaq interact teammat a...,lsg fast bowler naveenulhaq interact teammat a...
2,"Table-toppers GT to bowl first against MI, bot...",Table-toppers Gujarat Titans (GT) captain Hard...,sports,tabletoppers gujarat titans gt captain hardik ...,tabletopp gujarat titan gt captain hardik pand...,tabletopp gujarat titan gt captain hardik pand...
3,Jos Buttler fined 10% of match fee for breachi...,RR opener Jos Buttler has been fined 10% of hi...,sports,rr opener jos buttler fined 10 match fee breac...,rr open jo buttler fine 10 match fee breach ip...,rr open jo buttler fine 10 match fee breach ip...
4,KL Rahul shares 'Doff the cap' GIF for Yashasv...,KL Rahul took to Twitter to share a 'Doff the ...,sports,kl rahul took twitter share ' doff cap ' gif m...,kl rahul took twitter share ' doff cap ' gif m...,kl rahul took twitter share ' doff cap ' gif m...
...,...,...,...,...,...,...
20,"The girl went through hell, it needs to be tol...",After Asaram Bapu sent a legal notice to maker...,entertainment,asaram bapu sent legal notice makers ' sirf ek...,asaram bapu sent legal notic maker ' sirf ek b...,asaram bapu sent legal notic maker ' sirf ek b...
21,Team of 'The Kerala Story' to attend BJP's 'ya...,The team of 'The Kerala Story' movie will be a...,entertainment,team ' kerala story ' movie attending bjp ' ' ...,team ' kerala stori ' movi attend bjp ' ' hind...,team ' kerala stori ' movi attend bjp ' ' hind...
22,"This baby is planned, says Robert De Niro on 7...","Actor Robert De Niro, who recently announced t...",entertainment,actor robert de niro recently announced birth ...,actor robert de niro recent announc birth seve...,actor robert de niro recent announc birth seve...
23,"I'm busy, don't have time for 'Come, let's gos...",Actress Kavita Kaushik has said that she has b...,entertainment,actress kavita kaushik said busy creatively ac...,actress kavita kaushik said busi creativ activ...,actress kavita kaushik said busi creativ activ...
