In [1]:
import requests
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:

import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [3]:
url = "https://en.wikipedia.org/wiki/Independence_Day_(India)"
response = requests.get(url)
html_content = response.content

soup = BeautifulSoup(html_content, 'html.parser')

# Extract the Data

Once you have parsed HTML, extract the relevant text data using various methods such as find(), find_all() and get_text()

In [4]:
# Example : Extracting all paragraphs
paragraphs = soup.find_all('p')

# Extracting text from each paragraph
paragraph_texts = [paragraph.get_text() for paragraph in paragraphs]
paragraph_texts

['\n',
 "Independence Day is celebrated annually on 15 August as a public holiday in India commemorating the nation's independence from the United Kingdom on 15 August 1947, the day when the provisions of the Indian Independence Act, which transferred legislative sovereignty to the Indian Constituent Assembly, came into effect. India retained King George\xa0VI as head of state until its transition to a republic, when the Constitution of India came into effect on 26 January 1950 (celebrated as Indian Republic Day) and replaced the dominion prefix, Dominion of India, with the enactment of the sovereign law Constitution of India. India attained independence following the independence movement noted for largely non-violent resistance and civil disobedience led by Indian National Congress under the leadership of Mahatma Gandhi who adopted these values from one of the early movements[1] in India led by Ram Singh Kuka (quoted in one of the letters by Shaheed Bhagat Singh referring Guru Ram Si

In [5]:
paragraph_texts = []
for i in paragraphs:
  p = i.get_text()
  paragraph_texts.append(p)
paragraph_texts

['\n',
 "Independence Day is celebrated annually on 15 August as a public holiday in India commemorating the nation's independence from the United Kingdom on 15 August 1947, the day when the provisions of the Indian Independence Act, which transferred legislative sovereignty to the Indian Constituent Assembly, came into effect. India retained King George\xa0VI as head of state until its transition to a republic, when the Constitution of India came into effect on 26 January 1950 (celebrated as Indian Republic Day) and replaced the dominion prefix, Dominion of India, with the enactment of the sovereign law Constitution of India. India attained independence following the independence movement noted for largely non-violent resistance and civil disobedience led by Indian National Congress under the leadership of Mahatma Gandhi who adopted these values from one of the early movements[1] in India led by Ram Singh Kuka (quoted in one of the letters by Shaheed Bhagat Singh referring Guru Ram Si

# Text Preprocessing

Text preprocessing involves various steps to clean and normalize the extracted text

In [6]:
lowercase_text = []
for i in paragraph_texts:
  lowercase_text.append(i.lower())
lowercase_text

['\n',
 "independence day is celebrated annually on 15 august as a public holiday in india commemorating the nation's independence from the united kingdom on 15 august 1947, the day when the provisions of the indian independence act, which transferred legislative sovereignty to the indian constituent assembly, came into effect. india retained king george\xa0vi as head of state until its transition to a republic, when the constitution of india came into effect on 26 january 1950 (celebrated as indian republic day) and replaced the dominion prefix, dominion of india, with the enactment of the sovereign law constitution of india. india attained independence following the independence movement noted for largely non-violent resistance and civil disobedience led by indian national congress under the leadership of mahatma gandhi who adopted these values from one of the early movements[1] in india led by ram singh kuka (quoted in one of the letters by shaheed bhagat singh referring guru ram si

In [7]:
# Convert to lowercase

lowercase_text = [text.lower() for text in paragraph_texts]
lowercase_text

['\n',
 "independence day is celebrated annually on 15 august as a public holiday in india commemorating the nation's independence from the united kingdom on 15 august 1947, the day when the provisions of the indian independence act, which transferred legislative sovereignty to the indian constituent assembly, came into effect. india retained king george\xa0vi as head of state until its transition to a republic, when the constitution of india came into effect on 26 january 1950 (celebrated as indian republic day) and replaced the dominion prefix, dominion of india, with the enactment of the sovereign law constitution of india. india attained independence following the independence movement noted for largely non-violent resistance and civil disobedience led by indian national congress under the leadership of mahatma gandhi who adopted these values from one of the early movements[1] in india led by ram singh kuka (quoted in one of the letters by shaheed bhagat singh referring guru ram si

In [8]:
# Remove special characters using regex

cleaned_text = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in lowercase_text]
cleaned_text

['\n',
 'independence day is celebrated annually on 15 august as a public holiday in india commemorating the nations independence from the united kingdom on 15 august 1947 the day when the provisions of the indian independence act which transferred legislative sovereignty to the indian constituent assembly came into effect india retained king george\xa0vi as head of state until its transition to a republic when the constitution of india came into effect on 26 january 1950 celebrated as indian republic day and replaced the dominion prefix dominion of india with the enactment of the sovereign law constitution of india india attained independence following the independence movement noted for largely nonviolent resistance and civil disobedience led by indian national congress under the leadership of mahatma gandhi who adopted these values from one of the early movements1 in india led by ram singh kuka quoted in one of the letters by shaheed bhagat singh referring guru ram singh as his dada

In [10]:
nltk.download('punkt')
tokenized_text = []
for i in cleaned_text:
  tokenized_text.append(word_tokenize(i))
tokenized_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[[],
 ['independence',
  'day',
  'is',
  'celebrated',
  'annually',
  'on',
  '15',
  'august',
  'as',
  'a',
  'public',
  'holiday',
  'in',
  'india',
  'commemorating',
  'the',
  'nations',
  'independence',
  'from',
  'the',
  'united',
  'kingdom',
  'on',
  '15',
  'august',
  '1947',
  'the',
  'day',
  'when',
  'the',
  'provisions',
  'of',
  'the',
  'indian',
  'independence',
  'act',
  'which',
  'transferred',
  'legislative',
  'sovereignty',
  'to',
  'the',
  'indian',
  'constituent',
  'assembly',
  'came',
  'into',
  'effect',
  'india',
  'retained',
  'king',
  'george',
  'vi',
  'as',
  'head',
  'of',
  'state',
  'until',
  'its',
  'transition',
  'to',
  'a',
  'republic',
  'when',
  'the',
  'constitution',
  'of',
  'india',
  'came',
  'into',
  'effect',
  'on',
  '26',
  'january',
  '1950',
  'celebrated',
  'as',
  'indian',
  'republic',
  'day',
  'and',
  'replaced',
  'the',
  'dominion',
  'prefix',
  'dominion',
  'of',
  'india',
  'wi

In [11]:
# Tokenization
nltk.download('punkt')

tokenized_text = [word_tokenize(text) for text in cleaned_text]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
tokenized_text

[[],
 ['independence',
  'day',
  'is',
  'celebrated',
  'annually',
  'on',
  '15',
  'august',
  'as',
  'a',
  'public',
  'holiday',
  'in',
  'india',
  'commemorating',
  'the',
  'nations',
  'independence',
  'from',
  'the',
  'united',
  'kingdom',
  'on',
  '15',
  'august',
  '1947',
  'the',
  'day',
  'when',
  'the',
  'provisions',
  'of',
  'the',
  'indian',
  'independence',
  'act',
  'which',
  'transferred',
  'legislative',
  'sovereignty',
  'to',
  'the',
  'indian',
  'constituent',
  'assembly',
  'came',
  'into',
  'effect',
  'india',
  'retained',
  'king',
  'george',
  'vi',
  'as',
  'head',
  'of',
  'state',
  'until',
  'its',
  'transition',
  'to',
  'a',
  'republic',
  'when',
  'the',
  'constitution',
  'of',
  'india',
  'came',
  'into',
  'effect',
  'on',
  '26',
  'january',
  '1950',
  'celebrated',
  'as',
  'indian',
  'republic',
  'day',
  'and',
  'replaced',
  'the',
  'dominion',
  'prefix',
  'dominion',
  'of',
  'india',
  'wi

In [13]:
# Stopword Removal
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
filtered_text = [[word for word in tokens if word not in stop_words] for tokens in tokenized_text]
filtered_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[[],
 ['independence',
  'day',
  'celebrated',
  'annually',
  '15',
  'august',
  'public',
  'holiday',
  'india',
  'commemorating',
  'nations',
  'independence',
  'united',
  'kingdom',
  '15',
  'august',
  '1947',
  'day',
  'provisions',
  'indian',
  'independence',
  'act',
  'transferred',
  'legislative',
  'sovereignty',
  'indian',
  'constituent',
  'assembly',
  'came',
  'effect',
  'india',
  'retained',
  'king',
  'george',
  'vi',
  'head',
  'state',
  'transition',
  'republic',
  'constitution',
  'india',
  'came',
  'effect',
  '26',
  'january',
  '1950',
  'celebrated',
  'indian',
  'republic',
  'day',
  'replaced',
  'dominion',
  'prefix',
  'dominion',
  'india',
  'enactment',
  'sovereign',
  'law',
  'constitution',
  'india',
  'india',
  'attained',
  'independence',
  'following',
  'independence',
  'movement',
  'noted',
  'largely',
  'nonviolent',
  'resistance',
  'civil',
  'disobedience',
  'led',
  'indian',
  'national',
  'congress',
 

In [14]:
# Stemming

stemmer = PorterStemmer()
stemmed_text = [[stemmer.stem(word) for word in tokens] for tokens in filtered_text]
stemmed_text

[[],
 ['independ',
  'day',
  'celebr',
  'annual',
  '15',
  'august',
  'public',
  'holiday',
  'india',
  'commemor',
  'nation',
  'independ',
  'unit',
  'kingdom',
  '15',
  'august',
  '1947',
  'day',
  'provis',
  'indian',
  'independ',
  'act',
  'transfer',
  'legisl',
  'sovereignti',
  'indian',
  'constitu',
  'assembl',
  'came',
  'effect',
  'india',
  'retain',
  'king',
  'georg',
  'vi',
  'head',
  'state',
  'transit',
  'republ',
  'constitut',
  'india',
  'came',
  'effect',
  '26',
  'januari',
  '1950',
  'celebr',
  'indian',
  'republ',
  'day',
  'replac',
  'dominion',
  'prefix',
  'dominion',
  'india',
  'enact',
  'sovereign',
  'law',
  'constitut',
  'india',
  'india',
  'attain',
  'independ',
  'follow',
  'independ',
  'movement',
  'note',
  'larg',
  'nonviol',
  'resist',
  'civil',
  'disobedi',
  'led',
  'indian',
  'nation',
  'congress',
  'leadership',
  'mahatma',
  'gandhi',
  'adopt',
  'valu',
  'one',
  'earli',
  'movements1',
 

In [15]:
# Remove empty tokens

final_text = [[word for word in tokens if word.strip()] for tokens in stemmed_text]

# Convert tokens back to sentences
sentences = [' '.join(tokens) for tokens in final_text]

# Convert sentences back to paragraphs
processed_paragraphs = '\n\n'.join(sentences)

In [16]:
processed_paragraphs

'\n\nindepend day celebr annual 15 august public holiday india commemor nation independ unit kingdom 15 august 1947 day provis indian independ act transfer legisl sovereignti indian constitu assembl came effect india retain king georg vi head state transit republ constitut india came effect 26 januari 1950 celebr indian republ day replac dominion prefix dominion india enact sovereign law constitut india india attain independ follow independ movement note larg nonviol resist civil disobedi led indian nation congress leadership mahatma gandhi adopt valu one earli movements1 india led ram singh kuka quot one letter shahe bhagat singh refer guru ram singh dada guru\n\nindepend coincid partit india2 british india divid dominion india pakistan partit accompani violent riot mass casualti displac nearli 15 million peopl due religi violenc 15 august 1947 first prime minist india jawaharl nehru rais indian nation flag lahori gate red fort delhi subsequ independ day incumb prime minist customaril

# Save Processed Text

In [17]:
with open('processed_text.txt', 'w', encoding='utf-8') as file:
  file.write(processed_paragraphs)