In [65]:
import nltk
from nltk import word_tokenize, pos_tag

# Make sure the NLTK data packages this notebook needs are available locally.
# NOTE(review): 'punkt' presumably backs word_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag — confirm against the NLTK docs.
# The recorded output shows each call reporting "already up-to-date" when the
# package is already installed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 1. Grammatically correct sentence

In [68]:

# Baseline input: the correctly spelled reference sentence.
original_sentence = "The current state of the market reflects the high uncertainty from investors due to the latest macroeconomic events."

# Split the sentence into tokens, then tag each token with its part of speech.
# pos_tags_original is a sequence of (token, tag) pairs.
tokens_original = word_tokenize(original_sentence)
pos_tags_original = pos_tag(tokens_original)

# Report the sentence followed by one "token: tag" line per token.
# sep="\n" between the three pieces yields the same blank lines as three
# separate print() calls would.
print(
    "Original Sentence:\n",
    original_sentence,
    "\nPOS Tags for Original Sentence:\n",
    sep="\n",
)
for word, tag in pos_tags_original:
    print(word, tag, sep=": ")

Original Sentence:

The current state of the market reflects the high uncertainty from investors due to the latest macroeconomic events.

POS Tags for Original Sentence:

The: DT
current: JJ
state: NN
of: IN
the: DT
market: NN
reflects: VBZ
the: DT
high: JJ
uncertainty: NN
from: IN
investors: NNS
due: JJ
to: TO
the: DT
latest: JJS
macroeconomic: JJ
events: NNS
.: .


### 2. Sentence with spelling errors

In [71]:
# Perturbed input: the same sentence with deliberate spelling mistakes.
misspelled_sentence = "The curent staite of the markett reflekts the hihg uncertanty from investers due to the lateest macroenomic events."

# Split the sentence into tokens, then tag each token with its part of speech.
# pos_tags_misspelled is a sequence of (token, tag) pairs.
tokens_misspelled = word_tokenize(misspelled_sentence)
pos_tags_misspelled = pos_tag(tokens_misspelled)

# Report the sentence followed by one "token: tag" line per token.
# sep="\n" between the three pieces yields the same blank lines as three
# separate print() calls would.
print(
    "Misspelled Sentence:\n",
    misspelled_sentence,
    "\nPOS Tags for Misspelled Sentence:\n",
    sep="\n",
)
for word, tag in pos_tags_misspelled:
    print(word, tag, sep=": ")

Misspelled Sentence:

The curent staite of the markett reflekts the hihg uncertanty from investers due to the lateest macroenomic events.

POS Tags for Misspelled Sentence:

The: DT
curent: JJ
staite: NN
of: IN
the: DT
markett: NN
reflekts: VBZ
the: DT
hihg: JJ
uncertanty: NN
from: IN
investers: NNS
due: JJ
to: TO
the: DT
lateest: JJS
macroenomic: JJ
events: NNS
.: .


### 3. Comparing POS Tagging for both sentences

In [74]:
# Build a token-by-token comparison of the two tagged sentences.
# Tokens are paired purely by position, so the table is only meaningful when
# both sentences tokenize to the same number of tokens.
if len(pos_tags_original) != len(pos_tags_misspelled):
    # zip() below stops at the shorter sequence; say so explicitly instead of
    # silently dropping the trailing tokens.
    print(
        f"Warning: token counts differ ({len(pos_tags_original)} vs "
        f"{len(pos_tags_misspelled)}); comparing only the first "
        f"{min(len(pos_tags_original), len(pos_tags_misspelled))} positions.\n"
    )

comparison_columns = [
    'Original Sentence Word',
    'Original Sentence POS Tag',
    'Misspelled Sentence Word',
    'Misspelled Sentence POS Tag',
    'Misspelled Word?',
    'Tag Changed?',
]

# One record per token position; the DataFrame is built once from the list.
comparison_data = []
for (original_word, original_tag), (misspelled_word, misspelled_tag) in zip(
    pos_tags_original, pos_tags_misspelled
):
    # Determine if the word is misspelled and if the tag changed
    is_misspelled = original_word != misspelled_word
    tag_changed = original_tag != misspelled_tag
    tag_status = "CHANGED" if tag_changed else "SAME"

    comparison_data.append({
        'Original Sentence Word': original_word,
        'Original Sentence POS Tag': original_tag,
        'Misspelled Sentence Word': misspelled_word,
        'Misspelled Sentence POS Tag': misspelled_tag,
        'Misspelled Word?': "YES" if is_misspelled else "NO",
        'Tag Changed?': tag_status
    })

# Passing the columns explicitly keeps the headers even when there are no
# rows, so the column lookups below cannot raise KeyError on empty input.
comparison_df = pd.DataFrame(comparison_data, columns=comparison_columns)

print("Comparison of Original Sentence vs. Misspelled Sentence with regard to POS Tags:\n")
display(comparison_df)

# Summary statistics.
# Note: every token is counted as a "word" here, including punctuation tokens
# such as the final ".".
total_tokens = len(comparison_df)
misspelled_count = comparison_df['Misspelled Word?'].value_counts().get('YES', 0)
tag_counts = comparison_df['Tag Changed?'].value_counts()  # computed once, read twice
same_tag_count = tag_counts.get('SAME', 0)
changed_tag_count = tag_counts.get('CHANGED', 0)


def _percent_of_total(count):
    """Return `count` as a percentage of `total_tokens` (0.0 when there are no tokens)."""
    return count / total_tokens * 100 if total_tokens else 0.0


print("\nSummary Statistics:\n")
print(f"Total Words: {total_tokens}")
print(f"Misspelled Words: {misspelled_count} ({_percent_of_total(misspelled_count):.1f}% of all words)")
print(f"Words with Same POS Tag: {same_tag_count} ({_percent_of_total(same_tag_count):.1f}% of all words)")
print(f"Words with Changed POS Tag: {changed_tag_count} ({_percent_of_total(changed_tag_count):.1f}% of all words)")


Comparison of Original Sentence vs. Misspelled Sentence with regard to POS Tags:



Unnamed: 0,Original Sentence Word,Original Sentence POS Tag,Misspelled Sentence Word,Misspelled Sentence POS Tag,Misspelled Word?,Tag Changed?
0,The,DT,The,DT,NO,SAME
1,current,JJ,curent,JJ,YES,SAME
2,state,NN,staite,NN,YES,SAME
3,of,IN,of,IN,NO,SAME
4,the,DT,the,DT,NO,SAME
5,market,NN,markett,NN,YES,SAME
6,reflects,VBZ,reflekts,VBZ,YES,SAME
7,the,DT,the,DT,NO,SAME
8,high,JJ,hihg,JJ,YES,SAME
9,uncertainty,NN,uncertanty,NN,YES,SAME



Summary Statistics:

Total Words: 19
Misspelled Words: 9 (47.4% of all words)
Words with Same POS Tag: 19 (100.0% of all words)
Words with Changed POS Tag: 0 (0.0% of all words)


When examining the results, we can see that all the words retained exactly the same POS tags in both the original grammatically correct sentence and the misspelled sentence, despite the misspelled sentence having almost 50% of the words misspelled. This suggests that NLTK's POS tagger shows remarkable robustness against spelling errors, even in a more complex sentence with technical terminology (e.g., "macroeconomic").

This behavior occurs because NLTK's POS tagger primarily relies on contextual patterns and statistical models trained on correctly spelled text, rather than 
strict dictionary lookups. When encountering misspelled words, the tagger focuses on sentence structure, word position, morphological cues (e.g., the plural "s" 
in "investers"), and surrounding context to determine the appropriate part of speech, rather than relying just on the exact spelling of each word.

This demonstrates the significant strengths of statistical POS tagging models when dealing with noisy text that contains spelling errors, particularly in more 
complex sentences with domain-specific vocabulary.