In [65]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Natural_language_processing"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

content_div = soup.find('div', {'id': 'mw-content-text'})
paragraphs = content_div.find_all('p', recursive=True)
print(paragraphs)

[<p><b>Natural language processing</b> (NLP) is the processing of <a href="/wiki/Natural_language" title="Natural language">natural language</a> information by a <a href="/wiki/Computer" title="Computer">computer</a>. The study of NLP, a subfield of <a href="/wiki/Computer_science" title="Computer science">computer science</a>, is generally associated with <a href="/wiki/Artificial_intelligence" title="Artificial intelligence">artificial intelligence</a>. NLP is related to <a href="/wiki/Information_retrieval" title="Information retrieval">information retrieval</a>, <a class="mw-redirect" href="/wiki/Knowledge_representation" title="Knowledge representation">knowledge representation</a>, <a href="/wiki/Computational_linguistics" title="Computational linguistics">computational linguistics</a>, and more broadly with <a href="/wiki/Linguistics" title="Linguistics">linguistics</a>.<sup class="reference" id="cite_ref-nlpintro_1-0"><a href="#cite_note-nlpintro-1"><span class="cite-bracket">[

# **Extract Text from HTML**

In [66]:
raw_text = "\n".join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
print(raw_text)

Natural language processing (NLP) is the processing of natural language information by a computer. The study of NLP, a subfield of computer science, is generally associated with artificial intelligence. NLP is related to information retrieval, knowledge representation, computational linguistics, and more broadly with linguistics.[1]
Major processing tasks in an NLP system include: speech recognition, text classification, natural language understanding, and natural language generation.
Natural language processing has its roots in the 1950s.[2] Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that involves the automated interpretation and generation of natural language.
The premise of symbolic NLP is well-summarized by John Searle's 

# **Tokenize**

In [67]:
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
tokens = word_tokenize(raw_text.lower(), language='english')
print(f"Number of Tokens: {len(tokens)}\n\n")
print(tokens)

Number of Tokens: 1069


['natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'the', 'processing', 'of', 'natural', 'language', 'information', 'by', 'a', 'computer', '.', 'the', 'study', 'of', 'nlp', ',', 'a', 'subfield', 'of', 'computer', 'science', ',', 'is', 'generally', 'associated', 'with', 'artificial', 'intelligence', '.', 'nlp', 'is', 'related', 'to', 'information', 'retrieval', ',', 'knowledge', 'representation', ',', 'computational', 'linguistics', ',', 'and', 'more', 'broadly', 'with', 'linguistics', '.', '[', '1', ']', 'major', 'processing', 'tasks', 'in', 'an', 'nlp', 'system', 'include', ':', 'speech', 'recognition', ',', 'text', 'classification', ',', 'natural', 'language', 'understanding', ',', 'and', 'natural', 'language', 'generation', '.', 'natural', 'language', 'processing', 'has', 'its', 'roots', 'in', 'the', '1950s', '.', '[', '2', ']', 'already', 'in', '1950', ',', 'alan', 'turing', 'published', 'an', 'article', 'titled', '``', 'computing', 'machinery', '

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Remove **Punctuation**

In [68]:
import string
tokens_no_punct = [t for t in tokens if t not in string.punctuation]
print(f"Number of Tokens without punctuation: {len(tokens_no_punct)}\n\n")
print(tokens_no_punct)

Number of Tokens without punctuation: 896


['natural', 'language', 'processing', 'nlp', 'is', 'the', 'processing', 'of', 'natural', 'language', 'information', 'by', 'a', 'computer', 'the', 'study', 'of', 'nlp', 'a', 'subfield', 'of', 'computer', 'science', 'is', 'generally', 'associated', 'with', 'artificial', 'intelligence', 'nlp', 'is', 'related', 'to', 'information', 'retrieval', 'knowledge', 'representation', 'computational', 'linguistics', 'and', 'more', 'broadly', 'with', 'linguistics', '1', 'major', 'processing', 'tasks', 'in', 'an', 'nlp', 'system', 'include', 'speech', 'recognition', 'text', 'classification', 'natural', 'language', 'understanding', 'and', 'natural', 'language', 'generation', 'natural', 'language', 'processing', 'has', 'its', 'roots', 'in', 'the', '1950s', '2', 'already', 'in', '1950', 'alan', 'turing', 'published', 'an', 'article', 'titled', '``', 'computing', 'machinery', 'and', 'intelligence', "''", 'which', 'proposed', 'what', 'is', 'now', 'called', 'the',

# **Remove Stopwords**

In [69]:
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokens_no_stop = [t for t in tokens_no_punct if t not in stop_words]

print(f"Number of Tokens without punctuation & stopwords: {len(tokens_no_stop)}\n\n")
print(tokens_no_stop)

Number of Tokens without punctuation & stopwords: 577


['natural', 'language', 'processing', 'nlp', 'processing', 'natural', 'language', 'information', 'computer', 'study', 'nlp', 'subfield', 'computer', 'science', 'generally', 'associated', 'artificial', 'intelligence', 'nlp', 'related', 'information', 'retrieval', 'knowledge', 'representation', 'computational', 'linguistics', 'broadly', 'linguistics', '1', 'major', 'processing', 'tasks', 'nlp', 'system', 'include', 'speech', 'recognition', 'text', 'classification', 'natural', 'language', 'understanding', 'natural', 'language', 'generation', 'natural', 'language', 'processing', 'roots', '1950s', '2', 'already', '1950', 'alan', 'turing', 'published', 'article', 'titled', '``', 'computing', 'machinery', 'intelligence', "''", 'proposed', 'called', 'turing', 'test', 'criterion', 'intelligence', 'though', 'time', 'articulated', 'problem', 'separate', 'artificial', 'intelligence', 'proposed', 'test', 'includes', 'task', 'involves', 'automa

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Lemmatize**

In [70]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(t) for t in tokens_no_stop]

print(f"Number of Tokens after lemmatizing: {len(lemmatized_tokens)}\n\n")
print(lemmatized_tokens)
print("\n\n")

for i in range(len(lemmatized_tokens)):
  if lemmatized_tokens[i] != tokens_no_stop[i]:
    print(f"{tokens_no_stop[i]} -> {lemmatized_tokens[i]}")

Number of Tokens after lemmatizing: 577


['natural', 'language', 'processing', 'nlp', 'processing', 'natural', 'language', 'information', 'computer', 'study', 'nlp', 'subfield', 'computer', 'science', 'generally', 'associated', 'artificial', 'intelligence', 'nlp', 'related', 'information', 'retrieval', 'knowledge', 'representation', 'computational', 'linguistics', 'broadly', 'linguistics', '1', 'major', 'processing', 'task', 'nlp', 'system', 'include', 'speech', 'recognition', 'text', 'classification', 'natural', 'language', 'understanding', 'natural', 'language', 'generation', 'natural', 'language', 'processing', 'root', '1950s', '2', 'already', '1950', 'alan', 'turing', 'published', 'article', 'titled', '``', 'computing', 'machinery', 'intelligence', "''", 'proposed', 'called', 'turing', 'test', 'criterion', 'intelligence', 'though', 'time', 'articulated', 'problem', 'separate', 'artificial', 'intelligence', 'proposed', 'test', 'includes', 'task', 'involves', 'automated', 'interpret

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Computer TTR**

In [71]:
unique_tokens = set(lemmatized_tokens)
ttr = len(unique_tokens) / len(lemmatized_tokens)

print(f"Number of Unique Tokens: {len(unique_tokens)}\n\n")
print(unique_tokens)
print("\n\n")
print(f"Type-Token Ratio: {ttr}")

Number of Unique Tokens: 359


{'tie', 'articulated', 'room', '1950', '1980s', '49', '24', 'announced', 'build', 'major', 'proposed', 'coupled', 'symbolic', 'psychology', 'although', 'explicit', 'well-summarized', 'theory', 'ended', 'needed', 'recognition', 'includes', 'applying', 'advanced', 'general', 'drawback', 'include', 'method', '51', 'i.e.', 'involve', 'direction', 'chomskyan', 'j.', 'tree', 'cognitive', 'historically', 'steady', '21', 'new', 'context', 'necessary', '50', 'e.g.', 'london', 'two', 'lookup', '1950s', 'obsolete', 'likewise', 'coarse', 'perspective', 'parsing', 'technical', '22', 'advantage', 'artificial', 'first', 'representation', 'task', 'conference', 'replaced', 'called', 'inefficiency', '2', 'hand', 'embeddings', 'rule-based', 'corpus', 'lessening', 'root', 'rarely', 'developmental', 'part-of-speech', 'earliest', 'rule', '19', 'mainstream', 'machine-learning', 'act-r', 'e.g', 'automated', 'along', 'generation', 'towards', 'neural', 'test', 'collection', 'highe

# **Brown and Gutenber**

In [72]:
nltk.download('brown')
nltk.download('gutenberg')
from nltk.corpus import brown, gutenberg

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [73]:
print("Brown Corpus TTR Results:")
for genre in ['news', 'romance', 'science_fiction']:
    words = brown.words(categories=genre)
    tokens = [w.lower() for w in words if w.isalpha()]
    unique = set(tokens)
    ttr = len(unique) / len(tokens)
    print(f"  {genre:<15} TTR: {ttr:.4f}")

Brown Corpus TTR Results:
  news            TTR: 0.1334
  romance         TTR: 0.1287
  science_fiction TTR: 0.2440


In [74]:
print("Gutenberg Corpus TTR Results:")
for fileid in ['austen-emma.txt', 'bible-kjv.txt', 'shakespeare-hamlet.txt']:
    words = gutenberg.words(fileid)
    tokens = [w.lower() for w in words if w.isalpha()]
    unique = set(tokens)
    ttr = len(unique) / len(tokens)
    print(f"  {fileid:<25} TTR: {ttr:.4f}")

Gutenberg Corpus TTR Results:
  austen-emma.txt           TTR: 0.0438
  bible-kjv.txt             TTR: 0.0159
  shakespeare-hamlet.txt    TTR: 0.1553


In [75]:
print("end")

end
