# Crawler

In [1]:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_body(name):
    # Construct the Wikipedia URL for the given name
    url = f"https://en.wikipedia.org/wiki/{name}"

    # Send an HTTP request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the main content of the Wikipedia page
        content_div = soup.find('div', {'id': 'mw-content-text'})

        # Extract and return the text content within the main content div
        if content_div:
            paragraphs = content_div.find_all('p')  # Extract all paragraphs
            body_text = "\n".join([paragraph.get_text() for paragraph in paragraphs])
            return body_text
        else:
            return f"No content found for {name} on Wikipedia."
    else:
        return f"Failed to retrieve page for {name}. Status code: {response.status_code}"

# Example usage
name_to_search = "Albert_Einstein"
result = get_wikipedia_body(name_to_search)

print(result)




Albert Einstein (/ˈaɪnstaɪn/ EYEN-styne;[4] German: [ˈalbɛɐt ˈʔaɪnʃtaɪn] ⓘ; 14 March 1879 – 18 April 1955) was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century.[1][5] His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation".[6] He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect",[7] a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science.[8][9] In a 1999 poll of 130 lea

In [2]:
def save_to_file(content, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [3]:
filename = "TextMining.txt"
save_to_file(result,filename)

# Preprocess

In [4]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
def preprocess_text(text):
    # Remove punctuation
    def remove_punctuation(text):
        punctuation_free = "".join([" " if char in string.punctuation else char for char in text])
        return punctuation_free

    # Convert to lowercase
    text = text.lower()
    cleaned_text=text
    # Remove stopwords
    stop_words_list = set(stopwords.words('english'))
    words = text.split()
    meaningful_words = [word for word in words if word.lower() not in stop_words_list]
    text = " ".join(meaningful_words)
    no_stop_text=text
    # Apply stemming
    porter_stemmer = PorterStemmer()
    words = text.split()
    stem_text = [porter_stemmer.stem(word) for word in words]
    text = " ".join(stem_text)
    stemmed_text=text
    # Apply lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    words = no_stop_text.split()
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in words]
    text = " ".join(lemm_text)

    return cleaned_text, no_stop_text, stemmed_text, text

# Example usage
original_text = "This is an example sentence with punctuation, stopwords, and various words."

cleaned_text, no_stop_text, stemmed_text,preprocessed_text = preprocess_text(original_text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
print("Original Text:\n", original_text)
print("Cleaned Text Text:\n", cleaned_text)
print("Without Stop Word Text:\n", no_stop_text)
print("Stemmed Text:\n", stemmed_text)
print("After Lemmatizer Text:\n", preprocessed_text)

Original Text:
 This is an example sentence with punctuation, stopwords, and various words.
Cleaned Text Text:
 this is an example sentence with punctuation, stopwords, and various words.
Without Stop Word Text:
 example sentence punctuation, stopwords, various words.
Stemmed Text:
 exampl sentenc punctuation, stopwords, variou words.
After Lemmatizer Text:
 example sentence punctuation, stopwords, various words.


In [6]:
long_example_text = """
Natural language processing (NLP) is a subfield of artificial intelligence that focuses on the interaction between computers and humans using natural language. NLP techniques aim to enable computers to understand, interpret, and generate human-like text. It involves several tasks, including text preprocessing, tokenization, stemming, and lemmatization.

Tokenization is the process of breaking down a text into individual words or tokens. For example, the sentence "The quick brown fox jumps over the lazy dog" would be tokenized into ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"].

Stemming is the process of reducing words to their root or base form. For instance, the word "running" would be stemmed to "run." Stemming may not always result in a valid word, but it helps in grouping related words.

Lemmatization is a more advanced form of word normalization that aims to reduce words to their base or dictionary form, known as the lemma. For example, the word "better" would be lemmatized to "good." Unlike stemming, lemmatization ensures that the resulting word is a valid one.

In this example, we'll showcase the lemmatization process. Let's take the sentence "The cats are running and jumping around." After lemmatization, it would become "The cat be run and jump around."

Natural language processing is a fascinating field with numerous applications, from chatbots and sentiment analysis to machine translation and information retrieval.
"""

cleaned_text_long, no_stop_text_long, stemmed_text_long,preprocessed_text_long = preprocess_text(long_example_text)


In [7]:
print("Original Text:\n", long_example_text)
print('********************')
print("Cleaned Text Text:\n", cleaned_text_long)
print('********************')
print("Without Stop Word Text:\n", no_stop_text_long)
print('********************')
print("Stemmed Text:\n", stemmed_text_long)
print('********************')
print("After Lemmatizer Text:\n", preprocessed_text_long)

Original Text:
 
Natural language processing (NLP) is a subfield of artificial intelligence that focuses on the interaction between computers and humans using natural language. NLP techniques aim to enable computers to understand, interpret, and generate human-like text. It involves several tasks, including text preprocessing, tokenization, stemming, and lemmatization.

Tokenization is the process of breaking down a text into individual words or tokens. For example, the sentence "The quick brown fox jumps over the lazy dog" would be tokenized into ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"].

Stemming is the process of reducing words to their root or base form. For instance, the word "running" would be stemmed to "run." Stemming may not always result in a valid word, but it helps in grouping related words.

Lemmatization is a more advanced form of word normalization that aims to reduce words to their base or dictionary form, known as the lemma. For example, 

# Tokenizer

In [8]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download the punkt tokenizer data if not already downloaded

def tokenize_text(text):
    # Use nltk's word_tokenize to tokenize the text
    tokens = word_tokenize(text)
    return tokens



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
tokenized_text = tokenize_text(preprocessed_text)
print("Tokenized Text:", tokenized_text)


Tokenized Text: ['example', 'sentence', 'punctuation', ',', 'stopwords', ',', 'various', 'words', '.']


In [10]:
tokenized_text = tokenize_text(preprocessed_text_long)
print("Tokenized Text:", preprocessed_text_long)


Tokenized Text: natural language processing (nlp) subfield artificial intelligence focus interaction computer human using natural language. nlp technique aim enable computer understand, interpret, generate human-like text. involves several tasks, including text preprocessing, tokenization, stemming, lemmatization. tokenization process breaking text individual word tokens. example, sentence "the quick brown fox jump lazy dog" would tokenized ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]. stemming process reducing word root base form. instance, word "running" would stemmed "run." stemming may always result valid word, help grouping related words. lemmatization advanced form word normalization aim reduce word base dictionary form, known lemma. example, word "better" would lemmatized "good." unlike stemming, lemmatization ensures resulting word valid one. example, we'll showcase lemmatization process. let's take sentence "the cat running jumping around." lemmatiza

# Parse Tree

## On port

In [11]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')

def generate_parse_tree(text):
    doc = nlp(text)
    return doc

# Example usage
example_text = "This is an example sentence with a parse tree."

parse_tree = generate_parse_tree(example_text)

# Visualize the parse tree using spaCy's displacy module
# displacy.serve(parse_tree, style='dep')



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## Display render

In [12]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')

def generate_parse_tree(text):
    doc = nlp(text)
    return doc

# Example usage
example_text = "This is an example sentence with a parse tree."

parse_tree = generate_parse_tree(example_text)

# Visualize the parse tree using spaCy's displacy module
displacy.render(parse_tree, style='dep', jupyter=True)


# POS tagging

In [13]:
import spacy

nlp = spacy.load('en_core_web_sm')

def pos_tagging(text):
    doc = nlp(text)
    pos_tags = [(token.text, token.pos_) for token in doc]
    return pos_tags

# Example usage
example_text = "This is an example sentence for POS tagging."

pos_tags = pos_tagging(example_text)
print(pos_tags)


[('This', 'PRON'), ('is', 'AUX'), ('an', 'DET'), ('example', 'NOUN'), ('sentence', 'NOUN'), ('for', 'ADP'), ('POS', 'PROPN'), ('tagging', 'NOUN'), ('.', 'PUNCT')]


# NER

In [14]:
import spacy

nlp = spacy.load('en_core_web_sm')

def named_entity_recognition(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Example usage
example_text = "Apple Inc. was founded by Steve Jobs in Cupertino. Elon Musk is the CEO of Tesla, Inc."

ner_entities = named_entity_recognition(example_text)
print(ner_entities)


[('Apple Inc.', 'ORG'), ('Steve Jobs', 'PERSON'), ('Cupertino', 'GPE'), ('Elon Musk', 'PERSON'), ('Tesla, Inc.', 'ORG')]


# Finding the synonym of each word

In [15]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

def get_synonyms(text):
    stop_words = set(stopwords.words('english'))

    def get_synonyms_for_word(word):
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ').lower()
                if synonym != word and synonym not in stop_words:
                    synonyms.append(synonym)
        return set(synonyms)

    tokens = word_tokenize(text.lower())
    synonyms_dict = {word: list(get_synonyms_for_word(word)) for word in tokens if word not in stop_words}

    return synonyms_dict

# Example usage
example_text = "This is an example sentence with various words."

synonyms_result = get_synonyms(example_text)
for word, synonyms in synonyms_result.items():
    print(f"Synonyms for '{word}': {synonyms}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Synonyms for 'example': ['exercise', 'illustration', 'lesson', 'case', 'deterrent example', 'model', 'representative', 'good example', 'instance', 'object lesson', 'exemplar']
Synonyms for 'sentence': ['time', 'doom', 'condemn', 'prison term', 'judgment of conviction', 'conviction', 'condemnation']
Synonyms for 'various': ['versatile', 'several', 'diverse', 'assorted', 'respective']
Synonyms for 'words': ['articulate', 'quarrel', 'news', 'password', 'give voice', 'lyric', 'phrase', 'christian bible', 'run-in', 'word', 'discussion', 'speech', 'son', 'language', 'logos', 'dustup', 'countersign', 'formulate', 'holy writ', 'row', 'book', 'parole', 'word of honor', 'tidings', 'bible', 'intelligence', "actor's line", 'scripture', 'word of god', 'wrangle', 'good book', 'give-and-take', 'holy scripture', 'watchword']
Synonyms for '.': []


# Finding hypernym and hyponym relations in word sequences

In [16]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

def get_relations(text):
    stop_words = set(stopwords.words('english'))

    def get_relations_for_word(word):
        relations = {
            'synonyms': [],
            'hypernyms': [],
            'hyponyms': []
        }

        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ').lower()
                if synonym != word and synonym not in stop_words:
                    relations['synonyms'].append(synonym)

            for hypernym in syn.hypernyms():
                relations['hypernyms'].extend(hypernym.lemma_names())

            for hyponym in syn.hyponyms():
                relations['hyponyms'].extend(hyponym.lemma_names())

        return relations

    tokens = word_tokenize(text.lower())
    relations_dict = {word: get_relations_for_word(word) for word in tokens if word not in stop_words}

    return relations_dict

# Example usage
example_text = "This is an example sentence with various words."

relations_result = get_relations(example_text)
for word, relations in relations_result.items():
    print(f"Relations for '{word}':")
    print(f"  Synonyms: {relations['synonyms']}")
    print(f"  Hypernyms: {relations['hypernyms']}")
    print(f"  Hyponyms: {relations['hyponyms']}")
    print()


Relations for 'example':
  Synonyms: ['illustration', 'instance', 'representative', 'model', 'exemplar', 'model', 'good example', 'deterrent example', 'lesson', 'object lesson', 'case', 'instance', 'exercise']
  Hyponyms: ['apology', 'excuse', 'exception', 'precedent', 'case_in_point', 'quintessence', 'sample', 'specimen', 'lodestar', 'loadstar', 'microcosm', 'original', 'archetype', 'pilot', 'prefiguration', 'prototype', 'paradigm', 'epitome', 'image', 'template', 'templet', 'guide', 'type_specimen', 'holotype', 'beauty', 'beaut', 'pacesetter', 'pacemaker', 'pattern', 'prodigy', 'humiliation', 'mortification', 'piece', 'bit', 'time', 'clip']

Relations for 'sentence':
  Synonyms: ['conviction', 'judgment of conviction', 'condemnation', 'prison term', 'time', 'condemn', 'doom']
  Hypernyms: ['string_of_words', 'word_string', 'linguistic_string', 'final_judgment', 'final_decision', 'term', 'declare']
  Hyponyms: ['complex_sentence', 'compound_sentence', 'declarative_sentence', 'declarat

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#  Finding the semantic distance between two words

In [17]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
def semantic_distance(text):
    stop_words = set(stopwords.words('english'))

    def get_wordnet_pos(tag):
        if tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('R'):
            return wordnet.ADV
        elif tag.startswith('J'):
            return wordnet.ADJ
        else:
            return None

    def shortest_path_length(word1, word2):
        synsets1 = wordnet.synsets(word1)
        synsets2 = wordnet.synsets(word2)

        shortest_distance = float('inf')

        for synset1 in synsets1:
            for synset2 in synsets2:
                path_length = synset1.shortest_path_distance(synset2)
                if path_length is not None and path_length < shortest_distance:
                    shortest_distance = path_length

        return shortest_distance

    tokens = word_tokenize(text.lower())
    pos_tags = nltk.pos_tag(tokens)
    pos_tags = [(word, pos) for word, pos in pos_tags if word not in stop_words]

    distances_dict = {}

    for i in range(len(pos_tags) - 1):
        for j in range(i + 1, len(pos_tags)):
            word1, pos1 = pos_tags[i]
            word2, pos2 = pos_tags[j]

            wordnet_pos1 = get_wordnet_pos(pos1)
            wordnet_pos2 = get_wordnet_pos(pos2)

            if wordnet_pos1 is not None and wordnet_pos2 is not None:
                distance = shortest_path_length(word1, word2)
                distances_dict[(word1, word2)] = distance

    return distances_dict

# Example usage
example_text = "This is an example sentence with various words."

distances_result = semantic_distance(example_text)
for (word1, word2), distance in distances_result.items():
    print(f"Semantic distance between '{word1}' and '{word2}': {distance}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Semantic distance between 'example' and 'sentence': 7
Semantic distance between 'example' and 'various': inf
Semantic distance between 'example' and 'words': 6
Semantic distance between 'sentence' and 'various': inf
Semantic distance between 'sentence' and 'words': 3
Semantic distance between 'various' and 'words': inf


In [None]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

def semantic_distance_between_words(word1, word2, text):
    stop_words = set(stopwords.words('english'))

    def get_wordnet_pos(tag):
        if tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('R'):
            return wordnet.ADV
        elif tag.startswith('J'):
            return wordnet.ADJ
        else:
            return None

    def shortest_path_length(word1, word2):
      synsets1 = wordnet.synsets(word1)
      synsets2 = wordnet.synsets(word2)

      shortest_distance = float('inf')

      for synset1 in synsets1:
          for synset2 in synsets2:
              path_length = synset1.shortest_path_distance(synset2)
              if path_length is not None and path_length < shortest_distance:
                  shortest_distance = path_length

      if shortest_distance == float('inf'):
          return None
      else:
          return shortest_distance

    tokens = word_tokenize(text.lower())
    pos_tags = nltk.pos_tag(tokens)
    pos_tags = [(word, pos) for word, pos in pos_tags if word not in stop_words]

    wordnet_pos1 = get_wordnet_pos(pos_tags[0][1])
    wordnet_pos2 = get_wordnet_pos(pos_tags[1][1])

    if wordnet_pos1 is not None and wordnet_pos2 is not None:
        distance = shortest_path_length(word1, word2)
        return distance
    else:
        return None

# Example usage
word1 = "example"
word2 = "sentence"
example_text = "This is an example sentence with various words."

distance_result = semantic_distance_between_words(word1, word2, example_text)
print(f"Semantic distance between '{word1}' and '{word2}': {distance_result}")


# Compute similarity between two words

In [18]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

def get_wordnet_pos(tag):
    if tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('J'):
        return wordnet.ADJ
    else:
        return None

def word_similarity(word1, word2):
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)

    max_similarity = 0.0

    for synset1 in synsets1:
        for synset2 in synsets2:
            similarity = synset1.wup_similarity(synset2)
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity

    return max_similarity

def semantic_similarity(text):
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    pos_tags = nltk.pos_tag(tokens)
    pos_tags = [(word, pos) for word, pos in pos_tags if word not in stop_words]

    similarities_dict = {}

    for i in range(len(pos_tags) - 1):
        for j in range(i + 1, len(pos_tags)):
            word1, pos1 = pos_tags[i]
            word2, pos2 = pos_tags[j]

            wordnet_pos1 = get_wordnet_pos(pos1)
            wordnet_pos2 = get_wordnet_pos(pos2)

            if wordnet_pos1 is not None and wordnet_pos2 is not None:
                similarity = word_similarity(word1, word2)
                similarities_dict[(word1, word2)] = similarity

    return similarities_dict

# Example usage
example_text = "This is an example sentence with various words."

similarities_result = semantic_similarity(example_text)
for (word1, word2), similarity in similarities_result.items():
    print(f"Semantic similarity between '{word1}' and '{word2}': {similarity}")


Semantic similarity between 'example' and 'sentence': 0.5333333333333333
Semantic similarity between 'example' and 'various': 0.2222222222222222
Semantic similarity between 'example' and 'words': 0.5714285714285714
Semantic similarity between 'sentence' and 'various': 0.2857142857142857
Semantic similarity between 'sentence' and 'words': 0.7272727272727273
Semantic similarity between 'various' and 'words': 0.25


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

def get_wordnet_pos(tag):
    if tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('J'):
        return wordnet.ADJ
    else:
        return None

def word_similarity_between_words(word1, word2, text):
    stop_words = set(stopwords.words('english'))

    def word_similarity(word1, word2):
        synsets1 = wordnet.synsets(word1)
        synsets2 = wordnet.synsets(word2)

        max_similarity = 0.0

        for synset1 in synsets1:
            for synset2 in synsets2:
                similarity = synset1.wup_similarity(synset2)
                if similarity is not None and similarity > max_similarity:
                    max_similarity = similarity

        return max_similarity

    tokens = word_tokenize(text.lower())
    pos_tags = nltk.pos_tag(tokens)
    pos_tags = [(word, pos) for word, pos in pos_tags if word not in stop_words]

    wordnet_pos1 = get_wordnet_pos(pos_tags[0][1])
    wordnet_pos2 = get_wordnet_pos(pos_tags[1][1])

    if wordnet_pos1 is not None and wordnet_pos2 is not None:
        similarity = word_similarity(word1, word2)
        return similarity
    else:
        return None

# Example usage
word1 = "example"
word2 = "sentence"
example_text = "This is an example sentence with various words."

similarity_result = word_similarity_between_words(word1, word2, example_text)
print(f"Semantic similarity between '{word1}' and '{word2}': {similarity_result}")


Semantic similarity between 'example' and 'sentence': 0.5333333333333333


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# apply on file

## read file

In [22]:
# def save_to_file(content, filename):
#     with open(filename, 'w', encoding='utf-8') as file:
#         file.write(content)
# filename = "TextMining.txt"
# save_to_file(result,filename)
text=result

## preprocessing

In [23]:
cleaned_text, no_stop_text, stemmed_text,preprocessed_text = preprocess_text(text)


In [24]:
print("Original Text:\n", text[:100])
print('********************')
print("Cleaned Text Text:\n", cleaned_text[:100])
print('********************')
print("Without Stop Word Text:\n", no_stop_text[:100])
print('********************')
print("Stemmed Text:\n", stemmed_text[:100])
print('********************')
print("After Lemmatizer Text:\n", preprocessed_text[:100])

Original Text:
 

Albert Einstein (/ˈaɪnstaɪn/ EYEN-styne;[4] German: [ˈalbɛɐt ˈʔaɪnʃtaɪn] ⓘ; 14 March 1879 – 18 Apr
********************
Cleaned Text Text:
 

albert einstein (/ˈaɪnstaɪn/ eyen-styne;[4] german: [ˈalbɛɐt ˈʔaɪnʃtaɪn] ⓘ; 14 march 1879 – 18 apr
********************
Without Stop Word Text:
 albert einstein (/ˈaɪnstaɪn/ eyen-styne;[4] german: [ˈalbɛɐt ˈʔaɪnʃtaɪn] ⓘ; 14 march 1879 – 18 april
********************
Stemmed Text:
 albert einstein (/ˈaɪnstaɪn/ eyen-styne;[4] german: [ˈalbɛɐt ˈʔaɪnʃtaɪn] ⓘ; 14 march 1879 – 18 april
********************
After Lemmatizer Text:
 albert einstein (/ˈaɪnstaɪn/ eyen-styne;[4] german: [ˈalbɛɐt ˈʔaɪnʃtaɪn] ⓘ; 14 march 1879 – 18 april


## tokenizing

In [25]:
tokenized_text = tokenize_text(preprocessed_text)
print("Tokenized Text:", tokenized_text[:100])


Tokenized Text: ['albert', 'einstein', '(', '/ˈaɪnstaɪn/', 'eyen-styne', ';', '[', '4', ']', 'german', ':', '[', 'ˈalbɛɐt', 'ˈʔaɪnʃtaɪn', ']', 'ⓘ', ';', '14', 'march', '1879', '–', '18', 'april', '1955', ')', 'german-born', 'theoretical', 'physicist', 'widely', 'held', 'one', 'greatest', 'influential', 'scientist', 'time', '.', 'best', 'known', 'developing', 'theory', 'relativity', ',', 'einstein', 'also', 'made', 'important', 'contribution', 'quantum', 'mechanics', ',', 'thus', 'central', 'figure', 'revolutionary', 'reshaping', 'scientific', 'understanding', 'nature', 'modern', 'physic', 'accomplished', 'first', 'decade', 'twentieth', 'century', '.', '[', '1', ']', '[', '5', ']', 'mass–energy', 'equivalence', 'formula', 'e', '=', 'mc2', ',', 'arises', 'relativity', 'theory', ',', 'called', '``', 'the', 'world', "'s", 'famous', 'equation', "''", '.', '[', '6', ']', 'received', '1921', 'nobel', 'prize', 'physic']


## parse tree

In [None]:
parse_tree = generate_parse_tree(preprocessed_text)


### on port

In [27]:
# Visualize the parse tree using spaCy's displacy module
# displacy.serve(parse_tree, style='dep')

### on jupyter

In [28]:
# Visualize the parse tree using spaCy's displacy module
displacy.render(parse_tree, style='dep', jupyter=True)


## POS tagging

In [29]:
pos_tags = pos_tagging(text)
print(pos_tags)



## NER

In [31]:
ner_entities = named_entity_recognition(text)
print(ner_entities)

[('Albert Einstein', 'PERSON'), ('German', 'NORP'), ('14 March 1879', 'DATE'), ('18 April 1955', 'DATE'), ('German', 'NORP'), ('Einstein', 'PERSON'), ('the first decades of the', 'DATE'), ('twentieth', 'ORDINAL'), ('century.[1][5', 'ORG'), ('1921', 'DATE'), ('Nobel Prize in Physics', 'WORK_OF_ART'), ('1999', 'DATE'), ('130', 'CARDINAL'), ('British', 'NORP'), ('Physics World', 'ORG'), ('Einstein', 'PERSON'), ('Einstein', 'PERSON'), ('1905', 'DATE'), ('mirabilis', 'PERSON'), ('miracle year', 'DATE'), ('Einstein', 'PERSON'), ('four', 'CARDINAL'), ('Brownian', 'NORP'), ('1915', 'DATE'), ('Einstein', 'PERSON'), ('two', 'CARDINAL'), ('Firstly', 'ORDINAL'), ('quantum', 'ORG'), ('Secondly', 'ORDINAL'), ('the German Empire', 'GPE'), ('Einstein', 'PERSON'), ('Switzerland', 'GPE'), ('1895', 'DATE'), ('German', 'NORP'), ('the Kingdom of Württemberg)[note', 'GPE'), ('1', 'CARDINAL'), ('1897', 'DATE'), ('the age of seventeen', 'DATE'), ('the Swiss Federal polytechnic school', 'ORG'), ('Zürich', 'GPE

## Finding the synonym of each word

In [32]:
synonyms_result = get_synonyms(text)
for word, synonyms in synonyms_result.items():
    print(f"Synonyms for '{word}': {synonyms}")


Synonyms for 'albert': ['prince albert', 'albert francis charles augustus emmanuel']
Synonyms for 'einstein': ['brain', 'albert einstein', 'mastermind', 'genius', 'brainiac']
Synonyms for '(': []
Synonyms for '/ˈaɪnstaɪn/': []
Synonyms for 'eyen-styne': []
Synonyms for ';': []
Synonyms for '[': []
Synonyms for '4': ['quatern', 'quaternity', 'tetrad', 'quadruplet', 'quartet', 'iv', 'quaternion', 'foursome', 'little joe', 'quaternary', 'four']
Synonyms for ']': []
Synonyms for 'german': ['high german', 'german language']
Synonyms for ':': []
Synonyms for 'ˈalbɛɐt': []
Synonyms for 'ˈʔaɪnʃtaɪn': []
Synonyms for 'ⓘ': []
Synonyms for '14': ['xiv', 'fourteen']
Synonyms for 'march': ['demonstrate', 'marchland', 'adjoin', 'abut', 'butt against', 'borderland', 'marching', 'exhibit', 'border', 'border district', 'butt', 'edge', 'parade', 'mar', 'process', 'butt on', 'master of architecture', 'marching music']
Synonyms for '1879': []
Synonyms for '–': []
Synonyms for '18': ['xviii', 'eighteen']
S

## Finding hypernym and hyponym relations in word sequences

In [33]:
relations_result = get_relations(text)
for word, relations in relations_result.items():
    print(f"Relations for '{word}':")
    print(f"  Synonyms: {relations['synonyms']}")
    print(f"  Hypernyms: {relations['hypernyms']}")
    print(f"  Hyponyms: {relations['hyponyms']}")
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Relations for 'willingly':
  Synonyms: ['volitionally']
  Hypernyms: []
  Hyponyms: []

Relations for 'taught':
  Synonyms: ['teach', 'learn', 'instruct', 'teach']
  Hypernyms: ['inform', 'habituate', 'accustom']
  Hyponyms: ['catechize', 'catechise', 'coach', 'train', 'condition', 'drill', 'enlighten', 'edify', 'ground', 'indoctrinate', 'induct', 'lecture', 'talk', 'mentor', 'reinforce', 'reward', 'spoonfeed', 'train', 'develop', 'prepare', 'educate', 'tutor', 'unteach', 'unteach']

Relations for 'practicing':
  Synonyms: ['practice', 'practise', 'exercise', 'drill', 'exercise', 'practice', 'practise', 'rehearse', 'practise', 'practice', 'practice', 'apply', 'use', 'commit', 'practice']
  Hypernyms: ['learn', 'study', 'read', 'take', 'perform', 'execute', 'do', 'prosecute', 'engage', 'pursue']
  Hyponyms: ['shamanize', 'shamanise', 'scrimmage', 'walk_through', 'follow']

Relations for 'systematically':
  Synonyms: ['cons

##  Finding the semantic distance between two words

In [56]:
word1 = "theory"
word2 = "relativity"

semantic_distance_result = semantic_distance_between_words(word1, word2, text)
print(f"Semantic distance between '{word1}' and '{word2}': {semantic_distance_result}")


Semantic distance between 'theory' and 'relativity': 2


In [57]:
word1 = "albert"
word2 = "einstein"

semantic_distance_result = semantic_distance_between_words(word1, word2, text)
print(f"Semantic distance between '{word1}' and '{word2}': {semantic_distance_result}")


Semantic distance between 'albert' and 'einstein': 7


In [60]:
word1 = "albert"
word2 = "physics"

semantic_distance_result = semantic_distance_between_words(word1, word2, text)
print(f"Semantic distance between '{word1}' and '{word2}': {semantic_distance_result}")


Semantic distance between 'albert' and 'physics': 10


In [58]:
word1 = "albert"
word2 = "islam"

semantic_distance_result = semantic_distance_between_words(word1, word2, text)
print(f"Semantic distance between '{word1}' and '{word2}': {semantic_distance_result}")


Semantic distance between 'albert' and 'islam': 14


## Compute similarity between two words

In [74]:
word1 = "theory"
word2 = "relativity"

result = word_similarity_between_words(word1, word2, text)
print(f"Semantic similarity between  '{word1}' and '{word2}': {result}")


Semantic similarity between  'theory' and 'relativity': 0.9


In [73]:
word1 = "albert"
word2 = "einstein"

result = word_similarity_between_words(word1, word2, text)
print(f"Semantic similarity between  '{word1}' and '{word2}': {result}")


Semantic similarity between  'albert' and 'einstein': 0.5714285714285714


In [72]:
word1 = "albert"
word2 = "physics"

result = word_similarity_between_words(word1, word2, text)
print(f"Semantic similarity between  '{word1}' and '{word2}': {result}")


Semantic similarity between  'albert' and 'physics': 0.375


In [71]:
word1 = "albert"
word2 = "islam"

result = word_similarity_between_words(word1, word2, text)
print(f"Semantic similarity between  '{word1}' and '{word2}': {result}")


Semantic similarity between  'albert' and 'islam': 0.125
