In [1]:
import unicodedata
import re
import json
import os
from requests import get
from bs4 import BeautifulSoup
import acquire

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
from acquire import parse_blog
from acquire import get_article_text
from acquire import get_codeup_blogs
from acquire import get_inshorts_articles
from acquire import prep_text

In [None]:
# Read 'spam.csv' (resolved relative to the current working directory) into the DataFrame `spam`.
spam = pd.read_csv('spam.csv')

In [None]:
# Check column names.
spam.columns

In [None]:
# Drop unwanted columns.
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell idempotent: running it a second time (when the columns are
# already gone) no longer raises KeyError.
spam = spam.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Replace the generic column names v1/v2 with descriptive ones.
spam.rename(columns=dict(v1='label', v2='text'), inplace=True)

In [None]:
spam

In [None]:
# Extra tokens treated as stopwords in addition to NLTK's English list.
ADDITIONAL_STOPWORDS = ['r','u','2','ltgt']

def clean(text):
    """Normalize a string and return its lemmatized, stopword-free tokens.

    Steps, in order:
      1. NFKD-normalize, drop every non-ASCII character and lowercase.
      2. Remove every character that is neither a word character nor whitespace.
      3. Split on whitespace.
      4. Discard tokens found in NLTK's English stopwords or in
         ADDITIONAL_STOPWORDS, and lemmatize the rest with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; the previous list was scanned
    # once per token. The new local name also stops shadowing the module-level
    # `stopwords` imported from nltk.corpus.
    stop_set = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    # Convert string to the ASCII character set and lowercase it.
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Remove special characters, then tokenize on whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stop_set]

In [None]:
def show_counts_and_ratios(df, column):
    """Return the count and the relative frequency of each value in ``df[column]``.

    The previous implementation ignored both arguments and always read the
    global ``spam.label``; it now uses the frame and column it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    column : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of the column, with columns
        ``n`` (count) and ``percent`` (fraction of rows, between 0 and 1).
    """
    values = df[column]
    labels = pd.concat([values.value_counts(),
                        values.value_counts(normalize=True)], axis=1)
    labels.columns = ['n', 'percent']
    return labels

In [None]:
show_counts_and_ratios(spam, 'label')

In [None]:
# Join the messages of each class (and of the whole corpus) into one long
# string, then run it through clean() to get a flat list of tokens.
ham_words = clean(' '.join(spam.loc[spam.label == 'ham', 'text']))
spam_words = clean(' '.join(spam.loc[spam.label == 'spam', 'text']))
all_words = clean(' '.join(spam['text']))

In [None]:
# Peek at the first 5 tokens of each list (list order, not frequency order).
ham_words[:5], spam_words[:5], all_words[:5]

In [None]:
# Count how often each token occurs within each of the three word lists.
ham_freq, spam_freq, all_freq = (
    pd.Series(tokens).value_counts()
    for tokens in (ham_words, spam_words, all_words)
)

spam_freq.head()

In [None]:
pd.concat([all_freq, ham_freq, spam_freq], axis=1, sort=True)

In [None]:
# Combine value counts of words into one dataframe to work on (one row per word).
# Column names are assigned through `keys=` rather than
# `.set_axis(..., inplace=False)`: the `inplace` argument of `set_axis` was
# removed in pandas 2.0, so the previous chain raises TypeError there.
word_counts = (pd.concat([all_freq, ham_freq, spam_freq], axis=1, sort=True,
                         keys=['all', 'ham', 'spam'])
               .fillna(0)     # a word absent from one class has a count of 0
               .astype(int))  # fillna leaves floats behind; restore integer counts

word_counts

### Are there any words that uniquely identify a spam or ham message?

In [None]:
# Ten most frequent words that never appear in ham, followed by the ten most
# frequent words that never appear in spam.
spam_only = (word_counts[word_counts.ham == 0]
             .sort_values(by='spam', ascending=False)
             .head(10))
ham_only = (word_counts[word_counts.spam == 0]
            .sort_values(by='ham', ascending=False)
            .head(10))
pd.concat([spam_only, ham_only])

### What are the most frequently occurring words?

In [None]:
word_counts.sort_values(by='all', ascending=False)

## Visualization

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# For the 20 most frequent words overall, plot the share of each word's
# occurrences that come from spam vs. ham messages (stacked bars).
(word_counts
 .assign(p_spam=word_counts.spam / word_counts['all'],
         p_ham=word_counts.ham / word_counts['all'])
 .sort_values(by='all')
 [['p_spam', 'p_ham']]
 .tail(20)                # the 20 words with the highest overall count
 .sort_values('p_ham')    # order the bars by ham share for readability
 .plot.barh(stacked=True))

# Title fixed: the original read "20most" (missing space).
plt.title('Proportion of Spam vs Ham for the 20 most common words')

In [None]:
# Keep only words seen more than 10 times in BOTH classes, compute the
# spam-to-ham ratio (the .01 keeps the denominator away from zero), and show
# the five lowest and five highest ratios.
frequent_in_both = word_counts[(word_counts.spam > 10) & (word_counts.ham > 10)]
by_ratio = (frequent_in_both
            .assign(ratio=frequent_in_both.spam / (frequent_in_both.ham + .01))
            .sort_values(by='ratio'))
pd.concat([by_ratio.head(), by_ratio.tail()])

In [None]:
from wordcloud import WordCloud

# Toy input used to demonstrate the word cloud API.
sentence = "Mary had a little lamb, little lamb, little lamb. Its fleece was white as snow."

# Configure the cloud first, then generate the image from the sentence.
cloud = WordCloud(background_color='white')
img = cloud.generate(sentence)

# Draw the image with the axes hidden.
plt.imshow(img)
plt.axis('off')

In [None]:
# Each word list is joined into a single space-separated string and turned
# into its own word cloud image.
all_cloud = WordCloud(background_color='white', height=1000, width=400).generate(' '.join(all_words))
ham_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(ham_words))
spam_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(spam_words))

plt.figure(figsize=(10,8))
# Layout: one tall panel on the left half, two stacked panels on the right half.
axs = [plt.axes([0,0,.5,1]), plt.axes([.5,.5,.5,.5]), plt.axes([.5,0,.5,.5])]

panels = zip(axs, (all_cloud, ham_cloud, spam_cloud), ('All Words', 'Ham', 'Spam'))
for ax, cloud, title in panels:
    ax.imshow(cloud)
    ax.set_title(title)
    ax.axis('off')

## Bigrams

In [None]:
# Toy example: build the bigrams (n-grams with n=2) of a short sentence.
sentence = "Mary had a little lamb"

bigrams = nltk.ngrams(sentence.split(), 2)
list(bigrams)

In [None]:
# Count every bigram in the ham tokens and keep the 20 most frequent.
ham_bigram_counts = pd.Series(nltk.ngrams(ham_words, 2)).value_counts()
top_20_ham_bigrams = ham_bigram_counts.head(20)

top_20_ham_bigrams

In [None]:
top_20_ham_bigrams.reset_index()['index'].apply(lambda t: t[0] + ' ' + t[1])

In [None]:
# Horizontal bar chart of the 20 most frequent ham bigrams.
top_20_ham_bigrams.plot.barh(color='pink', width=.9, figsize=(10,6))

# Spelling fixed in the user-facing text ("Occuring", "Occurances").
plt.title('20 Most Frequently Occurring Ham Bigrams')
plt.ylabel('Bigram')
plt.xlabel('Number of Occurrences')

# Replace the tuple tick labels with readable "word word" strings. Joining the
# index entries directly avoids depending on reset_index() producing a column
# that happens to be named 'index'.
ticks, _ = plt.yticks()
labels = [' '.join(gram) for gram in top_20_ham_bigrams.index]
_ = plt.yticks(ticks, labels)

In [None]:
# Word cloud sized by bigram frequency; each tuple key becomes a "word word" string.
data = {' '.join(gram): count for gram, count in top_20_ham_bigrams.to_dict().items()}
cloud = WordCloud(background_color='white', width=800, height=400)
img = cloud.generate_from_frequencies(data)
plt.figure(figsize=(8,4))
plt.imshow(img)
plt.axis('off')
plt.show()

## Create and explore bigrams for the spam data. Visualize them with a word cloud. How do they compare with the ham bigrams?

In [None]:
# Count every bigram in the spam tokens and keep the 20 most frequent.
spam_bigram_counts = pd.Series(nltk.ngrams(spam_words, 2)).value_counts()
top_20_spam_bigrams = spam_bigram_counts.head(20)

top_20_spam_bigrams

In [None]:
# Horizontal bar chart of the 20 most frequent spam bigrams.
top_20_spam_bigrams.plot.barh(color='blue', width=.9, figsize=(10,6))

# Spelling fixed in the user-facing text ("Occuring", "Occurances").
plt.title('20 Most Frequently Occurring Spam Bigrams')
plt.ylabel('Bigram')
plt.xlabel('Number of Occurrences')

# Make the labels pretty: turn each tuple into a "word word" string. Joining
# the index entries directly avoids depending on reset_index() producing a
# column that happens to be named 'index'.
ticks, _ = plt.yticks()
labels = [' '.join(gram) for gram in top_20_spam_bigrams.index]
_ = plt.yticks(ticks, labels)

In [None]:
# Word cloud sized by bigram frequency; each tuple key becomes a "word word" string.
data = {' '.join(gram): count for gram, count in top_20_spam_bigrams.to_dict().items()}
cloud = WordCloud(background_color='white', width=800, height=400)
img = cloud.generate_from_frequencies(data)
plt.figure(figsize=(8,4))
plt.imshow(img)
plt.axis('off')
plt.show()

## Create and explore trigrams (i.e., n-grams with n = 3) for both the spam and ham data.

In [None]:
# Count every trigram in the ham tokens and keep the 20 most frequent.
ham_trigram_counts = pd.Series(nltk.ngrams(ham_words, 3)).value_counts()
top_20_ham_trigrams = ham_trigram_counts.head(20)

top_20_ham_trigrams

In [None]:
# Horizontal bar chart of the 20 most frequent ham trigrams.
top_20_ham_trigrams.plot.barh(color='pink', width=.9, figsize=(10,6))

# Spelling fixed in the user-facing text ("Occuring", "Occurances").
plt.title('20 Most Frequently Occurring Ham Trigrams')
plt.ylabel('Trigram')
plt.xlabel('Number of Occurrences')

# Replace the tuple tick labels with readable "word word word" strings.
# Joining the index entries directly avoids depending on reset_index()
# producing a column that happens to be named 'index'.
ticks, _ = plt.yticks()
labels = [' '.join(gram) for gram in top_20_ham_trigrams.index]
_ = plt.yticks(ticks, labels)

In [None]:
# Word cloud sized by trigram frequency; each tuple key becomes a "word word word" string.
data = {' '.join(gram): count for gram, count in top_20_ham_trigrams.to_dict().items()}
cloud = WordCloud(background_color='white', width=800, height=400)
img = cloud.generate_from_frequencies(data)
plt.figure(figsize=(8,4))
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
# Count every trigram in the spam tokens and keep the 20 most frequent.
spam_trigram_counts = pd.Series(nltk.ngrams(spam_words, 3)).value_counts()
top_20_spam_trigrams = spam_trigram_counts.head(20)

top_20_spam_trigrams

In [None]:
# Horizontal bar chart of the 20 most frequent spam trigrams.
# Colour changed from 'pink' to 'blue' so spam charts match the spam bigram
# chart (ham charts use pink).
top_20_spam_trigrams.plot.barh(color='blue', width=.9, figsize=(10,6))

# Spelling fixed in the user-facing text ("Occuring", "Occurances").
plt.title('20 Most Frequently Occurring Spam Trigrams')
plt.ylabel('Trigram')
plt.xlabel('Number of Occurrences')

# Replace the tuple tick labels with readable "word word word" strings.
# Joining the index entries directly avoids depending on reset_index()
# producing a column that happens to be named 'index'.
ticks, _ = plt.yticks()
labels = [' '.join(gram) for gram in top_20_spam_trigrams.index]
_ = plt.yticks(ticks, labels)

In [None]:
# Word cloud sized by trigram frequency; each tuple key becomes a "word word word" string.
data = {' '.join(gram): count for gram, count in top_20_spam_trigrams.to_dict().items()}
cloud = WordCloud(background_color='white', width=800, height=400)
img = cloud.generate_from_frequencies(data)
plt.figure(figsize=(8,4))
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
codeup_df = acquire.get_codeup_blogs(cached=True)

In [None]:
codeup_df

In [None]:
acquire.prep_text(codeup_df, 'original')

In [2]:
news_df = get_inshorts_articles()

In [3]:
news_df

Unnamed: 0,title,author,original,date,category
0,"Omicron BA.2 found in 57 countries, doesn't se...",Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...,2022-02-02,science
1,"Indian scientists develop self-disinfecting, w...",Ridham Gambhir,The Ministry of Science and Technology on Frid...,2022-02-04,science
2,"Astronaut spends continuous 300 days in space,...",Ridham Gambhir,NASA has revealed that its astronaut Mark Vand...,2022-02-04,science
3,9.9 crore-year-old flowers found perfectly pre...,Ankush Verma,Two 9.9 crore-year-old flowers have been found...,2022-02-02,science
4,Picture of Mars crater that looks like a tree ...,Pragya Swastik,The European Space Agency (ESA) has released a...,2022-02-02,science
...,...,...,...,...,...
120,Kangana has been a very supportive & endearing...,Udit Gupta,"Nawazuddin Siddiqui, who has wrapped up Kangan...",2022-02-04,entertainment
121,"Riteish Deshmukh, Genelia to star in comedy fi...",Udit Gupta,Riteish Deshmukh and his actress-wife Genelia ...,2022-02-04,entertainment
122,2022 will be a busy year: Disha Patani on upco...,Ramanpreet Singh Virdi,Actress Disha Patani has said 2022 will be a b...,2022-02-04,entertainment
123,Dharma Productions in talks to remake their ol...,Udit Gupta,Karan Johar's Dharma Productions is currently ...,2022-02-04,entertainment


In [4]:
# NOTE(review): the later display of `news_df` shows new `clean`, `stemmed` and
# `lemmatized` columns, so prep_text appears to add them to `news_df` in place
# in addition to returning the frame displayed here — confirm against acquire.py.
prep_text(news_df, 'original')

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"Omicron BA.2 found in 57 countries, doesn't se...",Omicron BA.2 variant has been found in 57 coun...,omicron ba2 variant found 57 countries said tu...,omicron ba2 variant ha found 57 countri said t...,omicron ba2 variant ha found 57 country said t...
1,"Indian scientists develop self-disinfecting, w...",The Ministry of Science and Technology on Frid...,ministry science technology friday announced t...,ministri scienc technolog friday announc team ...,ministry science technology friday announced t...
2,"Astronaut spends continuous 300 days in space,...",NASA has revealed that its astronaut Mark Vand...,nasa revealed astronaut mark vande hei lived s...,nasa ha reveal astronaut mark vand hei ha live...,nasa ha revealed astronaut mark vande hei ha l...
3,9.9 crore-year-old flowers found perfectly pre...,Two 9.9 crore-year-old flowers have been found...,two 99 croreyearold flowers found perfectly pr...,two 99 croreyearold flower found perfectli pre...,two 99 croreyearold flower found perfectly pre...
4,Picture of Mars crater that looks like a tree ...,The European Space Agency (ESA) has released a...,european space agency esa released picture mar...,european space agenc esa ha releas pictur mar ...,european space agency esa ha released picture ...
...,...,...,...,...,...
120,Kangana has been a very supportive & endearing...,"Nawazuddin Siddiqui, who has wrapped up Kangan...",nawazuddin siddiqui wrapped kangana ranaut ' f...,nawazuddin siddiqui ha wrap kangana ranaut ' f...,nawazuddin siddiqui ha wrapped kangana ranaut ...
121,"Riteish Deshmukh, Genelia to star in comedy fi...",Riteish Deshmukh and his actress-wife Genelia ...,riteish deshmukh actresswife genelia deshmukh ...,riteish deshmukh hi actresswif genelia deshmuk...,riteish deshmukh actresswife genelia deshmukh ...
122,2022 will be a busy year: Disha Patani on upco...,Actress Disha Patani has said 2022 will be a b...,actress disha patani said 2022 busy year ek vi...,actress disha patani ha said 2022 busi year ek...,actress disha patani ha said 2022 busy year ek...
123,Dharma Productions in talks to remake their ol...,Karan Johar's Dharma Productions is currently ...,karan johar ' dharma productions currently tal...,karan johar ' dharma product current talk rema...,karan johar ' dharma production currently talk...


In [5]:
news_df

Unnamed: 0,title,author,original,date,category,clean,stemmed,lemmatized
0,"Omicron BA.2 found in 57 countries, doesn't se...",Apaar Sharma,Omicron BA.2 variant has been found in 57 coun...,2022-02-02,science,omicron ba2 variant found 57 countries said tu...,omicron ba2 variant ha found 57 countri said t...,omicron ba2 variant ha found 57 country said t...
1,"Indian scientists develop self-disinfecting, w...",Ridham Gambhir,The Ministry of Science and Technology on Frid...,2022-02-04,science,ministry science technology friday announced t...,ministri scienc technolog friday announc team ...,ministry science technology friday announced t...
2,"Astronaut spends continuous 300 days in space,...",Ridham Gambhir,NASA has revealed that its astronaut Mark Vand...,2022-02-04,science,nasa revealed astronaut mark vande hei lived s...,nasa ha reveal astronaut mark vand hei ha live...,nasa ha revealed astronaut mark vande hei ha l...
3,9.9 crore-year-old flowers found perfectly pre...,Ankush Verma,Two 9.9 crore-year-old flowers have been found...,2022-02-02,science,two 99 croreyearold flowers found perfectly pr...,two 99 croreyearold flower found perfectli pre...,two 99 croreyearold flower found perfectly pre...
4,Picture of Mars crater that looks like a tree ...,Pragya Swastik,The European Space Agency (ESA) has released a...,2022-02-02,science,european space agency esa released picture mar...,european space agenc esa ha releas pictur mar ...,european space agency esa ha released picture ...
...,...,...,...,...,...,...,...,...
120,Kangana has been a very supportive & endearing...,Udit Gupta,"Nawazuddin Siddiqui, who has wrapped up Kangan...",2022-02-04,entertainment,nawazuddin siddiqui wrapped kangana ranaut ' f...,nawazuddin siddiqui ha wrap kangana ranaut ' f...,nawazuddin siddiqui ha wrapped kangana ranaut ...
121,"Riteish Deshmukh, Genelia to star in comedy fi...",Udit Gupta,Riteish Deshmukh and his actress-wife Genelia ...,2022-02-04,entertainment,riteish deshmukh actresswife genelia deshmukh ...,riteish deshmukh hi actresswif genelia deshmuk...,riteish deshmukh actresswife genelia deshmukh ...
122,2022 will be a busy year: Disha Patani on upco...,Ramanpreet Singh Virdi,Actress Disha Patani has said 2022 will be a b...,2022-02-04,entertainment,actress disha patani said 2022 busy year ek vi...,actress disha patani ha said 2022 busi year ek...,actress disha patani ha said 2022 busy year ek...
123,Dharma Productions in talks to remake their ol...,Udit Gupta,Karan Johar's Dharma Productions is currently ...,2022-02-04,entertainment,karan johar ' dharma productions currently tal...,karan johar ' dharma product current talk rema...,karan johar ' dharma production currently talk...


## Create a separate dataframe for each category.