In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
from sklearn.cluster import KMeans
import string
import numpy as np

Explore the Wikipedia page of any famous person Kamala Harris (https://en.wikipedia.org/wiki/Kamala_Harris)

a) First of all, scrape text from Wikipedia and save it in a text file

In [2]:
# define the URL of the Wikipedia page
kam = 'https://en.wikipedia.org/wiki/Kamala_Harris'

# get the HTML content of the web page
response = requests.get(kam)
html_content = response.content

# create a BeautifulSoup object from the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# find the tags that contain the text you want to scrape
text_tags = soup.find_all('p')

# extract the text from the tags and save it to a txt file
with open('kamalaharris_text.txt', 'w') as file:
    for tag in text_tags:
        file.write(tag.get_text())

b) Without any text pre-processing, generate the vocabulary (e.g. number of
keywords in your text file using Sklearn or any other text processing library of
your choice, such as NLTK, spacy, etc.)


In [3]:
with open('kamalaharris_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

vectorizer = CountVectorizer()
vectorizer.fit_transform([text])

print('Number of keywords:', len(vectorizer.vocabulary_))


Number of keywords: 2948


c)

In [4]:
# using nltk
nltk.download('punkt')
nltk.download('stopwords')

# with open('kamalaharris_text.txt', 'r', encoding='utf-8') as file:
#     text = file.read()

tokens = nltk.word_tokenize(text)
fdist = FreqDist(tokens)

print('Number of keywords:', len(fdist))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Number of keywords: 3256


d) Next, clean the text data by performing the following operation one after another.

• Convert words to lowercase.

• Remove numbers.

• Punctuation Removal

• Tokenization

• Stop Word Removal

• Perform Stemming and Lemmatization (we can use any library of your
choice)


In [5]:
nltk.download('wordnet')
# Convert to lowercase
text = text.lower()

# Remove numbers
text = re.sub(r'\d+', '', text)

# Punctuation Removal
nlp = spacy.load('en_core_web_sm')
text = nlp(text)
text = " ".join(token.text for token in text if not token.is_punct)

# Tokenization
tokens = word_tokenize(text)

# Stop Word Removal
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if not token in stop_words]

# Perform Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

# Check and print frequency of pre-processed texts
fdist = FreqDist(lemmatized_tokens)

print('Number of keywords:', len(fdist))


[nltk_data] Downloading package wordnet to /root/nltk_data...


Number of keywords: 2294


e) Show the 10 most frequent words and 10 rearrest words.

In [6]:
# Show the 10 most frequent words
print("10 most frequent words:", fdist.most_common(10))

# Show the 10 rarest words
print("10 rarest words:", fdist.most_common()[:-11:-1])


10 most frequent words: [('harris', 243), ("'s", 106), ('california', 80), ('state', 56), ('attorney', 42), ('president', 42), ('first', 42), ('office', 39), ('$', 38), ('law', 34)]
10 rarest words: [('fiction', 1), ('written', 1), ('leadership', 1), ('head', 1), ('phenomenal', 1), ('founder', 1), ('meena', 1), ('niece', 1), ('brother', 1), ('analyst', 1)]


f) Generate a Bar graph (chart) of the top 30 most frequent words.

In [7]:
import plotly.graph_objects as go

# get the top 30 most frequent words
top_words = fdist.most_common(30)

# plot the bar chart
fig = go.Figure([go.Bar(x=[w[0] for w in top_words], y=[w[1] for w in top_words])])
fig.show()