In [1]:
import re

def rule_based_pos_tagging(sentence):
    # Define lexical rules
    lexical_rules = [
        (r'\b\w+ly\b', 'RB'),  # Adverbs ending in -ly
        (r'\b[A-Z][a-z]*\b', 'NNP'),  # Proper nouns starting with a capital letter
        (r'\b\w+ing\b', 'VBG'),  # Gerunds or present participles
    ]

    # Define contextual rules
    contextual_rules = [
        (r'\bhe\b \b\w+\b', 'NN'),  # Nouns following the pronoun "he"
    ]

    # Tokenize the sentence into words
    words = sentence.split()

    # Apply lexical rules
    for rule in lexical_rules:
        pattern, tag = rule
        words = [re.sub(pattern, f'{word} ({tag})', word ) for word in words]

    # Apply contextual rules
    for rule in contextual_rules:
        pattern, tag = rule
        words = [re.sub(pattern, tag, ' '.join(words))]

    return words

# Example usage
sentence = "He spoke confidently, running is good exercise."
tagged_sentence = rule_based_pos_tagging(sentence)
print("Sentence: ", sentence)
print("Recognized Tags: ", tagged_sentence)


Sentence:  He spoke confidently, running is good exercise.
Recognized Tags:  ['He (NNP) spoke confidently, (RB), running (VBG) is good exercise.']


In [2]:
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Download the sample dataset from NLTK
nltk.download('treebank')
from nltk.corpus import treebank

# Extract features and labels from the NLTK dataset
def extract_features(word, prev_word):
    return {"word": word, "prev_word": prev_word}

data = [(word, pos) for (word, pos) in treebank.tagged_words()]
features = [(extract_features(data[i][0], '' if i == 0 else data[i-1][0]), pos) for i, (word, pos) in enumerate(data)]

# Split data into features and labels
X = [features[i][0] for i in range(len(features))]
y = [features[i][1] for i in range(len(features))]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize features using DictVectorizer
vectorizer = DictVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Decision Tree classifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_vectorized, y_train)

# Predict POS tags for the test set
y_pred = classifier.predict(X_test_vectorized)


for i in range(10):
    predicted = classifier.predict(X_test_vectorized[i])
    actuall = y_test[i]
    print("Predicted Tags: ", predicted)
    print("Actuall Tages: ", actuall)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


Predicted Tags:  ['NNP']
Actuall Tages:  NNP
Predicted Tags:  [',']
Actuall Tages:  ,
Predicted Tags:  ['-NONE-']
Actuall Tages:  -NONE-
Predicted Tags:  ['NNP']
Actuall Tages:  NNP
Predicted Tags:  ['.']
Actuall Tages:  .
Predicted Tags:  ['IN']
Actuall Tages:  IN
Predicted Tags:  ['NNP']
Actuall Tages:  NN
Predicted Tags:  ['VB']
Actuall Tages:  VB
Predicted Tags:  ['$']
Actuall Tages:  $
Predicted Tags:  ['CC']
Actuall Tages:  CC
Accuracy: 0.86
